Diffstat (limited to 'contrib/llvm/lib/Target/X86')
78 files changed, 12009 insertions, 10814 deletions
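Much of the mechanical churn in the diff below is two LLVM API migrations: the MCOperand/MCContext factory functions lose their capital letters (CreateReg/CreateImm/GetOrCreateSymbol become createReg/createImm/getOrCreateSymbol), and subtarget feature tests move from masking a uint64_t to indexing a FeatureBitset (STI.getFeatureBits()[X86::Mode64Bit]). A minimal standalone sketch of the feature-bit change, using std::bitset and hypothetical stand-in feature indices instead of the generated X86 enums:

#include <bitset>
#include <cstdint>

// Hypothetical stand-ins for the tablegen'd X86 feature numbers; the real
// values come from the generated target headers.
enum : unsigned { Mode16Bit = 0, Mode32Bit = 1, Mode64Bit = 2, NumX86Features = 64 };

// Old pattern: features travel as a uint64_t and the mode constant is a mask,
// so the test is a bitwise AND.
bool is64BitModeOld(uint64_t FeatureMask) {
  return (FeatureMask & (1ULL << Mode64Bit)) != 0;
}

// New pattern: FeatureBitset behaves like std::bitset and the mode constant is
// an index, so the test is what STI.getFeatureBits()[X86::Mode64Bit] spells.
bool is64BitModeNew(const std::bitset<NumX86Features> &FeatureBits) {
  return FeatureBits[Mode64Bit];
}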
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 543af8e..a21f8c7 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/MachineValueType.h" -#include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" @@ -246,7 +245,7 @@ protected: assert(VT == MVT::i32 || VT == MVT::i64); MCInst Inst; Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(getX86SubSuperRegister(Reg, VT))); + Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, VT))); Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } @@ -262,13 +261,13 @@ protected: MCContext &Ctx, int64_t *Residue); bool is64BitMode() const { - return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + return STI.getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { - return (STI.getFeatureBits() & X86::Mode32Bit) != 0; + return STI.getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { - return (STI.getFeatureBits() & X86::Mode16Bit) != 0; + return STI.getFeatureBits()[X86::Mode16Bit]; } unsigned getPointerWidth() { @@ -614,7 +613,7 @@ private: Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32))); const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); @@ -643,7 +642,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( { MCInst Inst; Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::CreateReg(ShadowRegI8)); + Inst.addOperand(MCOperand::createReg(ShadowRegI8)); const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, @@ -654,7 +653,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( EmitInstruction( Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); - MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + MCSymbol *DoneSym = Ctx.createTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); @@ -726,10 +725,10 @@ void X86AddressSanitizer32::InstrumentMemOperandLarge( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); - Inst.addOperand(MCOperand::CreateImm(0)); + Inst.addOperand(MCOperand::createImm(0)); EmitInstruction(Out, Inst); } - MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + MCSymbol *DoneSym = Ctx.createTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); @@ -743,7 +742,7 @@ void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize, StoreFlags(Out); // No need to test when ECX is equals to zero. 
- MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + MCSymbol *DoneSym = Ctx.createTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction( Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX)); @@ -884,7 +883,7 @@ private: RegCtx.AddressReg(MVT::i64))); } const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); @@ -914,7 +913,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( { MCInst Inst; Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::CreateReg(ShadowRegI8)); + Inst.addOperand(MCOperand::createReg(ShadowRegI8)); const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, @@ -925,7 +924,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( EmitInstruction( Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); - MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + MCSymbol *DoneSym = Ctx.createTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); @@ -997,11 +996,11 @@ void X86AddressSanitizer64::InstrumentMemOperandLarge( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); - Inst.addOperand(MCOperand::CreateImm(0)); + Inst.addOperand(MCOperand::createImm(0)); EmitInstruction(Out, Inst); } - MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + MCSymbol *DoneSym = Ctx.createTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); @@ -1015,7 +1014,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, StoreFlags(Out); // No need to test when RCX is equals to zero. 
- MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + MCSymbol *DoneSym = Ctx.createTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); EmitInstruction( Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX)); @@ -1073,9 +1072,9 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, const bool hasCompilerRTSupport = T.isOSLinux(); if (ClAsanInstrumentAssembly && hasCompilerRTSupport && MCOptions.SanitizeAddress) { - if ((STI.getFeatureBits() & X86::Mode32Bit) != 0) + if (STI.getFeatureBits()[X86::Mode32Bit] != 0) return new X86AddressSanitizer32(STI); - if ((STI.getFeatureBits() & X86::Mode64Bit) != 0) + if (STI.getFeatureBits()[X86::Mode64Bit] != 0) return new X86AddressSanitizer64(STI); } return new X86AsmInstrumentation(STI); diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index d743aa6..3047fd1 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -11,6 +11,7 @@ #include "X86AsmInstrumentation.h" #include "X86AsmParserCommon.h" #include "X86Operand.h" +#include "X86ISelLowering.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" @@ -664,6 +665,7 @@ private: ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); std::unique_ptr<X86Operand> ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size); + std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, SMLoc Start, @@ -727,23 +729,24 @@ private: bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? - return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + return STI.getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { // FIXME: Can tablegen auto-generate this? - return (STI.getFeatureBits() & X86::Mode32Bit) != 0; + return STI.getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { // FIXME: Can tablegen auto-generate this? - return (STI.getFeatureBits() & X86::Mode16Bit) != 0; + return STI.getFeatureBits()[X86::Mode16Bit]; } - void SwitchMode(uint64_t mode) { - uint64_t oldMode = STI.getFeatureBits() & - (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit); - unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(oldMode | mode)); + void SwitchMode(unsigned mode) { + FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); + FeatureBitset OldMode = STI.getFeatureBits() & AllModes; + unsigned FB = ComputeAvailableFeatures( + STI.ToggleFeature(OldMode.flip(mode))); setAvailableFeatures(FB); - assert(mode == (STI.getFeatureBits() & - (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit))); + + assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes)); } unsigned getPointerWidth() { @@ -1189,7 +1192,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { StringRef IDVal = getTok().getString(); if (IDVal == "f" || IDVal == "b") { MCSymbol *Sym = - getContext().GetDirectionalLocalSymbol(IntVal, IDVal == "b"); + getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b"); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; const MCExpr *Val = MCSymbolRefExpr::Create(Sym, Variant, getContext()); @@ -1349,7 +1352,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, } // Create the symbol reference. 
- MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier); + MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext()); return false; @@ -1407,6 +1410,43 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, /*Scale=*/1, Start, End, Size, Identifier, Info); } +//ParseRoundingModeOp - Parse AVX-512 rounding mode operand +std::unique_ptr<X86Operand> +X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + // Eat "{" and mark the current place. + const SMLoc consumedToken = consumeToken(); + if (Tok.getIdentifier().startswith("r")){ + int rndMode = StringSwitch<int>(Tok.getIdentifier()) + .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT) + .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF) + .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF) + .Case("rz", X86::STATIC_ROUNDING::TO_ZERO) + .Default(-1); + if (-1 == rndMode) + return ErrorOperand(Tok.getLoc(), "Invalid rounding mode."); + Parser.Lex(); // Eat "r*" of r*-sae + if (!getLexer().is(AsmToken::Minus)) + return ErrorOperand(Tok.getLoc(), "Expected - at this point"); + Parser.Lex(); // Eat "-" + Parser.Lex(); // Eat the sae + if (!getLexer().is(AsmToken::RCurly)) + return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + Parser.Lex(); // Eat "}" + const MCExpr *RndModeOp = + MCConstantExpr::Create(rndMode, Parser.getContext()); + return X86Operand::CreateImm(RndModeOp, Start, End); + } + if(Tok.getIdentifier().equals("sae")){ + Parser.Lex(); // Eat the sae + if (!getLexer().is(AsmToken::RCurly)) + return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + Parser.Lex(); // Eat "}" + return X86Operand::CreateToken("{sae}", consumedToken); + } + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); +} /// ParseIntelMemOperand - Parse intel style memory operand. std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, @@ -1656,6 +1696,11 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { return ParseIntelMemOperand(Imm, Start, Size); } + // rounding mode token + if (STI.getFeatureBits()[X86::FeatureAVX512] && + getLexer().is(AsmToken::LCurly)) + return ParseRoundingModeOp(Start, End); + // Register. unsigned RegNo = 0; if (!ParseRegister(RegNo, Start, End)) { @@ -1708,13 +1753,19 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { return nullptr; return X86Operand::CreateImm(Val, Start, End); } + case AsmToken::LCurly:{ + SMLoc Start = Parser.getTok().getLoc(), End; + if (STI.getFeatureBits()[X86::FeatureAVX512]) + return ParseRoundingModeOp(Start, End); + return ErrorOperand(Start, "unknown token in expression"); + } } } bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op) { MCAsmParser &Parser = getParser(); - if(STI.getFeatureBits() & X86::FeatureAVX512) { + if(STI.getFeatureBits()[X86::FeatureAVX512]) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. const SMLoc consumedToken = consumeToken(); @@ -1964,14 +2015,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, PatchedName = PatchedName.substr(0, Name.size()-1); // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. 
- const MCExpr *ExtraImmOp = nullptr; if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { bool IsVCMP = PatchedName[0] == 'v'; - unsigned SSECCIdx = IsVCMP ? 4 : 3; - unsigned SSEComparisonCode = StringSwitch<unsigned>( - PatchedName.slice(SSECCIdx, PatchedName.size() - 2)) + unsigned CCIdx = IsVCMP ? 4 : 3; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(CCIdx, PatchedName.size() - 2)) .Case("eq", 0x00) .Case("lt", 0x01) .Case("le", 0x02) @@ -2006,27 +2056,75 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("gt_oq", 0x1E) .Case("true_us", 0x1F) .Default(~0U); - if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) { - ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode, - getParser().getContext()); - if (PatchedName.endswith("ss")) { - PatchedName = IsVCMP ? "vcmpss" : "cmpss"; - } else if (PatchedName.endswith("sd")) { - PatchedName = IsVCMP ? "vcmpsd" : "cmpsd"; - } else if (PatchedName.endswith("ps")) { - PatchedName = IsVCMP ? "vcmpps" : "cmpps"; - } else { - assert(PatchedName.endswith("pd") && "Unexpected mnemonic!"); - PatchedName = IsVCMP ? "vcmppd" : "cmppd"; - } + if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) { + + Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx), + NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - 2); + } + } + + // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}. + if (PatchedName.startswith("vpcmp") && + (PatchedName.endswith("b") || PatchedName.endswith("w") || + PatchedName.endswith("d") || PatchedName.endswith("q"))) { + unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - CCIdx)) + .Case("eq", 0x0) // Only allowed on unsigned. Checked below. + .Case("lt", 0x1) + .Case("le", 0x2) + //.Case("false", 0x3) // Not a documented alias. + .Case("neq", 0x4) + .Case("nlt", 0x5) + .Case("nle", 0x6) + //.Case("true", 0x7) // Not a documented alias. + .Default(~0U); + if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) { + Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + } + } + + // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}. + if (PatchedName.startswith("vpcom") && + (PatchedName.endswith("b") || PatchedName.endswith("w") || + PatchedName.endswith("d") || PatchedName.endswith("q"))) { + unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 
2 : 1; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - CCIdx)) + .Case("lt", 0x0) + .Case("le", 0x1) + .Case("gt", 0x2) + .Case("ge", 0x3) + .Case("eq", 0x4) + .Case("neq", 0x5) + .Case("false", 0x6) + .Case("true", 0x7) + .Default(~0U); + if (ComparisonCode != ~0U) { + Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::Create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); } } Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); - if (ExtraImmOp && !isParsingIntelSyntax()) - Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); - // Determine whether this is an instruction prefix. bool isPrefix = Name == "lock" || Name == "rep" || @@ -2072,9 +2170,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (isPrefix && getLexer().is(AsmToken::Slash))) Parser.Lex(); - if (ExtraImmOp && isParsingIntelSyntax()) - Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); - // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely // documented form in various unofficial manuals, so a lot of code uses it. @@ -2239,8 +2334,8 @@ static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg, MCInst TmpInst; TmpInst.setOpcode(Opcode); if (!isCmp) - TmpInst.addOperand(MCOperand::CreateReg(Reg)); - TmpInst.addOperand(MCOperand::CreateReg(Reg)); + TmpInst.addOperand(MCOperand::createReg(Reg)); + TmpInst.addOperand(MCOperand::createReg(Reg)); TmpInst.addOperand(Inst.getOperand(0)); Inst = TmpInst; return true; @@ -2485,7 +2580,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, SmallString<16> Tmp; Tmp += Base; Tmp += ' '; - Op.setTokenValue(Tmp.str()); + Op.setTokenValue(Tmp); // If this instruction starts with an 'f', then it is a floating point stack // instruction. These come in up to three forms for 32-bit, 64-bit, and @@ -2668,7 +2763,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, } // If we haven't matched anything yet, this is not a basic integer or FPU - // operation. There shouldn't be any ambiguity in our mneumonic table, so try + // operation. There shouldn't be any ambiguity in our mnemonic table, so try // matching with the unsized operand. 
if (Match.empty()) { Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo, diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 72aeeaa..7610806 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -34,6 +34,11 @@ inline bool isImmSExti64i32Value(uint64_t Value) { (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); } +inline bool isImmUnsignedi8Value(uint64_t Value) { + return (( Value <= 0x00000000000000FFULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); +} + } // End of namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h index 6675d4d..b3066ef 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -12,8 +12,11 @@ #include "X86AsmParserCommon.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/ADT/STLExtras.h" +#include "MCTargetDesc/X86MCTargetDesc.h" namespace llvm { @@ -187,6 +190,13 @@ struct X86Operand : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } + bool isImmUnsignedi8() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + return isImmUnsignedi8Value(CE->getValue()); + } + bool isOffsetOf() const override { return OffsetOfLoc.getPointer(); } @@ -253,15 +263,14 @@ struct X86Operand : public MCParsedAsmOperand { return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && !getMemIndexReg() && getMemScale() == 1; } + bool isAVX512RC() const{ + return isImm(); + } bool isAbsMem16() const { return isAbsMem() && Mem.ModeSize == 16; } - bool isAbsMem32() const { - return isAbsMem() && Mem.ModeSize != 16; - } - bool isSrcIdx() const { return !getMemIndexReg() && getMemScale() == 1 && (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI || @@ -351,14 +360,14 @@ struct X86Operand : public MCParsedAsmOperand { void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. 
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + Inst.addOperand(MCOperand::createImm(CE->getValue())); else - Inst.addOperand(MCOperand::CreateExpr(Expr)); + Inst.addOperand(MCOperand::createExpr(Expr)); } void addRegOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); + Inst.addOperand(MCOperand::createReg(getReg())); } static unsigned getGR32FromGR64(unsigned RegNo) { @@ -389,9 +398,12 @@ struct X86Operand : public MCParsedAsmOperand { unsigned RegNo = getReg(); if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) RegNo = getGR32FromGR64(RegNo); - Inst.addOperand(MCOperand::CreateReg(RegNo)); + Inst.addOperand(MCOperand::createReg(RegNo)); + } + void addAVX512RCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); } - void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); addExpr(Inst, getImm()); @@ -399,40 +411,40 @@ struct X86Operand : public MCParsedAsmOperand { void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); - Inst.addOperand(MCOperand::CreateImm(getMemScale())); - Inst.addOperand(MCOperand::CreateReg(getMemIndexReg())); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + Inst.addOperand(MCOperand::createImm(getMemScale())); + Inst.addOperand(MCOperand::createReg(getMemIndexReg())); addExpr(Inst, getMemDisp()); - Inst.addOperand(MCOperand::CreateReg(getMemSegReg())); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); } void addAbsMemOperands(MCInst &Inst, unsigned N) const { assert((N == 1) && "Invalid number of operands!"); // Add as immediates when possible. if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + Inst.addOperand(MCOperand::createImm(CE->getValue())); else - Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); + Inst.addOperand(MCOperand::createExpr(getMemDisp())); } void addSrcIdxOperands(MCInst &Inst, unsigned N) const { assert((N == 2) && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); - Inst.addOperand(MCOperand::CreateReg(getMemSegReg())); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); } void addDstIdxOperands(MCInst &Inst, unsigned N) const { assert((N == 1) && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); } void addMemOffsOperands(MCInst &Inst, unsigned N) const { assert((N == 2) && "Invalid number of operands!"); // Add as immediates when possible. 
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + Inst.addOperand(MCOperand::createImm(CE->getValue())); else - Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); - Inst.addOperand(MCOperand::CreateReg(getMemSegReg())); + Inst.addOperand(MCOperand::createExpr(getMemDisp())); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); } static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) { diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 1c56182..3469d19 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -80,20 +80,19 @@ X86GenericDisassembler::X86GenericDisassembler( MCContext &Ctx, std::unique_ptr<const MCInstrInfo> MII) : MCDisassembler(STI, Ctx), MII(std::move(MII)) { - switch (STI.getFeatureBits() & - (X86::Mode16Bit | X86::Mode32Bit | X86::Mode64Bit)) { - case X86::Mode16Bit: + const FeatureBitset &FB = STI.getFeatureBits(); + if (FB[X86::Mode16Bit]) { fMode = MODE_16BIT; - break; - case X86::Mode32Bit: + return; + } else if (FB[X86::Mode32Bit]) { fMode = MODE_32BIT; - break; - case X86::Mode64Bit: + return; + } else if (FB[X86::Mode64Bit]) { fMode = MODE_64BIT; - break; - default: - llvm_unreachable("Invalid CPU mode"); + return; } + + llvm_unreachable("Invalid CPU mode"); } struct Region { @@ -180,7 +179,7 @@ static void translateRegister(MCInst &mcInst, Reg reg) { #undef ENTRY uint8_t llvmRegnum = llvmRegnums[reg]; - mcInst.addOperand(MCOperand::CreateReg(llvmRegnum)); + mcInst.addOperand(MCOperand::createReg(llvmRegnum)); } /// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the @@ -248,11 +247,11 @@ static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) { assert(insn.mode == MODE_16BIT); baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI; } - MCOperand baseReg = MCOperand::CreateReg(baseRegNo); + MCOperand baseReg = MCOperand::createReg(baseRegNo); mcInst.addOperand(baseReg); MCOperand segmentReg; - segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]); + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); mcInst.addOperand(segmentReg); return false; } @@ -273,7 +272,7 @@ static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) { assert(insn.mode == MODE_16BIT); baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI; } - MCOperand baseReg = MCOperand::CreateReg(baseRegNo); + MCOperand baseReg = MCOperand::createReg(baseRegNo); mcInst.addOperand(baseReg); return false; } @@ -320,24 +319,12 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, // By default sign-extend all X86 immediates based on their encoding. else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 || type == TYPE_IMM64 || type == TYPE_IMMv) { - uint32_t Opcode = mcInst.getOpcode(); switch (operand.encoding) { default: break; case ENCODING_IB: - // Special case those X86 instructions that use the imm8 as a set of - // bits, bit count, etc. and are not sign-extend. 
- if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri && - Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri && - Opcode != X86::DPPSrri && Opcode != X86::DPPDrri && - Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri && - Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri && - Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri && - Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri && - Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri && - Opcode != X86::VINSERTPSrr) - if(immediate & 0x80) - immediate |= ~(0xffull); + if(immediate & 0x80) + immediate |= ~(0xffull); break; case ENCODING_IW: if(immediate & 0x8000) @@ -356,14 +343,30 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, unsigned NewOpc; switch (mcInst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); - case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break; - case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break; - case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break; - case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break; - case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break; - case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break; - case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break; - case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break; + case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break; + case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break; + case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break; + case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break; + case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break; + case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break; + case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break; + case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break; + case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break; + case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break; + case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break; + case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break; + case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break; + case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break; + case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break; + case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break; + case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break; + case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break; + case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break; + case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break; + case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break; + case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break; + case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break; + case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break; } // Switch opcode to the one that doesn't get special printing. 
mcInst.setOpcode(NewOpc); @@ -374,26 +377,157 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, unsigned NewOpc; switch (mcInst.getOpcode()) { default: llvm_unreachable("unexpected opcode"); - case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; - case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; - case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; - case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; - case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; - case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; - case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; - case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; - case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; - case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; - case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; - case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; - case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; - case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; - case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; - case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; - case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; - case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; - case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; - case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; + case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; + case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; + case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; + case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; + case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; + case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; + case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; + case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; + case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; + case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; + case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; + case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; + case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; + case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break; + case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; + case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; + case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break; + case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; + case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; + case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; + case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + } + // Switch opcode to the one that doesn't get special printing. 
+ mcInst.setOpcode(NewOpc); + } + } else if (type == TYPE_AVX512ICC) { + if (immediate >= 8 || ((immediate & 0x3) == 3)) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break; + case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break; + case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; 
break; + case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break; + case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break; + case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break; + case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break; + case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break; + case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break; + case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break; + case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break; + case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break; + case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break; + case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break; + case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break; + case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break; + case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break; + case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break; + case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break; + case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break; + case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break; + case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break; + case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break; + case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break; + case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break; + case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break; + case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break; + case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break; + case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break; + case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break; + case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break; + case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break; + case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break; + case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break; + case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break; + case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break; + case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break; + case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break; + case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break; + case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break; + case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break; + case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break; + case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break; + case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break; + case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break; + case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break; + case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break; + case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break; + case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break; + case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break; + case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break; + case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break; + case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break; + case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break; + case X86::VPCMPUWZ256rmik: NewOpc = 
X86::VPCMPUWZ256rmik_alt; break; + case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break; + case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break; + case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break; + case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break; + case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break; + case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break; } // Switch opcode to the one that doesn't get special printing. mcInst.setOpcode(NewOpc); @@ -404,13 +538,13 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case TYPE_XMM32: case TYPE_XMM64: case TYPE_XMM128: - mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4))); + mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4))); return; case TYPE_XMM256: - mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4))); + mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4))); return; case TYPE_XMM512: - mcInst.addOperand(MCOperand::CreateReg(X86::ZMM0 + (immediate >> 4))); + mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4))); return; case TYPE_REL8: isBranch = true; @@ -433,12 +567,12 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation, insn.immediateOffset, insn.immediateSize, mcInst, Dis)) - mcInst.addOperand(MCOperand::CreateImm(immediate)); + mcInst.addOperand(MCOperand::createImm(immediate)); if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 || type == TYPE_MOFFS32 || type == TYPE_MOFFS64) { MCOperand segmentReg; - segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]); + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); mcInst.addOperand(segmentReg); } } @@ -471,7 +605,7 @@ static bool translateRMRegister(MCInst &mcInst, return true; #define ENTRY(x) \ case EA_REG_##x: \ - mcInst.addOperand(MCOperand::CreateReg(X86::x)); break; + mcInst.addOperand(MCOperand::createReg(X86::x)); break; ALL_REGS #undef ENTRY } @@ -516,12 +650,12 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, return true; #define ENTRY(x) \ case SIB_BASE_##x: \ - baseReg = MCOperand::CreateReg(X86::x); break; + baseReg = MCOperand::createReg(X86::x); break; ALL_SIB_BASES #undef ENTRY } } else { - baseReg = MCOperand::CreateReg(0); + baseReg = MCOperand::createReg(0); } // Check whether we are handling VSIB addressing mode for GATHER. 
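The TYPE_AVX512ICC handling above switches to the *_alt opcodes, which print the raw immediate, only when the comparison predicate has no mnemonic alias. A standalone sketch (not the LLVM printer itself, and the function name is my own) of that fallback test and the alias table it implies, matching the vpcmp names the parser accepts; 3 and 7 ("false"/"true") are intentionally left as undocumented aliases:

#include <cstdint>
#include <string>

// Illustrative only: return the vpcmp predicate suffix for an AVX-512
// integer-compare immediate, or "" when the numeric *_alt form should be kept
// (immediate >= 8, or the undocumented false/true encodings 3 and 7).
std::string vpcmpPredicateName(uint64_t Imm) {
  if (Imm >= 8 || (Imm & 0x3) == 3)
    return "";
  static const char *const Names[8] = {"eq",  "lt",  "le",  "false",
                                       "neq", "nlt", "nle", "true"};
  return Names[Imm];
}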
@@ -571,7 +705,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, return true; #define ENTRY(x) \ case SIB_INDEX_##x: \ - indexReg = MCOperand::CreateReg(X86::x); break; + indexReg = MCOperand::createReg(X86::x); break; EA_BASES_32BIT EA_BASES_64BIT REGS_XMM @@ -580,10 +714,10 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, #undef ENTRY } } else { - indexReg = MCOperand::CreateReg(0); + indexReg = MCOperand::createReg(0); } - scaleAmount = MCOperand::CreateImm(insn.sibScale); + scaleAmount = MCOperand::createImm(insn.sibScale); } else { switch (insn.eaBase) { case EA_BASE_NONE: @@ -597,31 +731,31 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, tryAddingPcLoadReferenceComment(insn.startLocation + insn.displacementOffset, insn.displacement + pcrel, Dis); - baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6 + baseReg = MCOperand::createReg(X86::RIP); // Section 2.2.1.6 } else - baseReg = MCOperand::CreateReg(0); + baseReg = MCOperand::createReg(0); - indexReg = MCOperand::CreateReg(0); + indexReg = MCOperand::createReg(0); break; case EA_BASE_BX_SI: - baseReg = MCOperand::CreateReg(X86::BX); - indexReg = MCOperand::CreateReg(X86::SI); + baseReg = MCOperand::createReg(X86::BX); + indexReg = MCOperand::createReg(X86::SI); break; case EA_BASE_BX_DI: - baseReg = MCOperand::CreateReg(X86::BX); - indexReg = MCOperand::CreateReg(X86::DI); + baseReg = MCOperand::createReg(X86::BX); + indexReg = MCOperand::createReg(X86::DI); break; case EA_BASE_BP_SI: - baseReg = MCOperand::CreateReg(X86::BP); - indexReg = MCOperand::CreateReg(X86::SI); + baseReg = MCOperand::createReg(X86::BP); + indexReg = MCOperand::createReg(X86::SI); break; case EA_BASE_BP_DI: - baseReg = MCOperand::CreateReg(X86::BP); - indexReg = MCOperand::CreateReg(X86::DI); + baseReg = MCOperand::createReg(X86::BP); + indexReg = MCOperand::createReg(X86::DI); break; default: - indexReg = MCOperand::CreateReg(0); + indexReg = MCOperand::createReg(0); switch (insn.eaBase) { default: debug("Unexpected eaBase"); @@ -632,7 +766,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, // placeholders to keep the compiler happy. #define ENTRY(x) \ case EA_BASE_##x: \ - baseReg = MCOperand::CreateReg(X86::x); break; + baseReg = MCOperand::createReg(X86::x); break; ALL_EA_BASES #undef ENTRY #define ENTRY(x) case EA_REG_##x: @@ -644,12 +778,12 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, } } - scaleAmount = MCOperand::CreateImm(1); + scaleAmount = MCOperand::createImm(1); } - displacement = MCOperand::CreateImm(insn.displacement); + displacement = MCOperand::createImm(insn.displacement); - segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]); + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); mcInst.addOperand(baseReg); mcInst.addOperand(scaleAmount); @@ -721,7 +855,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, /// @param stackPos - The stack position to translate. 
static void translateFPRegister(MCInst &mcInst, uint8_t stackPos) { - mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos)); + mcInst.addOperand(MCOperand::createReg(X86::ST0 + stackPos)); } /// translateMaskRegister - Translates a 3-bit mask register number to @@ -737,7 +871,7 @@ static bool translateMaskRegister(MCInst &mcInst, return true; } - mcInst.addOperand(MCOperand::CreateReg(X86::K0 + maskRegNum)); + mcInst.addOperand(MCOperand::createReg(X86::K0 + maskRegNum)); return false; } diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 619a0d4..d990bf3 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -310,11 +310,8 @@ static bool isPrefixAtLocation(struct InternalInstruction* insn, uint8_t prefix, uint64_t location) { - if (insn->prefixPresent[prefix] == 1 && - insn->prefixLocations[prefix] == location) - return true; - else - return false; + return insn->prefixPresent[prefix] == 1 && + insn->prefixLocations[prefix] == location; } /* @@ -1369,16 +1366,17 @@ static int readModRM(struct InternalInstruction* insn) { switch (mod) { case 0x0: insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ - switch (rm) { - case 0x14: - case 0x4: - case 0xc: /* in case REXW.b is set */ + // In determining whether RIP-relative mode is used (rm=5), + // or whether a SIB byte is present (rm=4), + // the extension bits (REX.b and EVEX.x) are ignored. + switch (rm & 7) { + case 0x4: // SIB byte is present insn->eaBase = (insn->addressSize == 4 ? EA_BASE_sib : EA_BASE_sib64); if (readSIB(insn) || readDisplacement(insn)) return -1; break; - case 0x5: + case 0x5: // RIP-relative insn->eaBase = EA_BASE_NONE; insn->eaDisplacement = EA_DISP_32; if (readDisplacement(insn)) @@ -1394,10 +1392,8 @@ static int readModRM(struct InternalInstruction* insn) { /* FALLTHROUGH */ case 0x2: insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32); - switch (rm) { - case 0x14: - case 0x4: - case 0xc: /* in case REXW.b is set */ + switch (rm & 7) { + case 0x4: // SIB byte is present insn->eaBase = EA_BASE_sib; if (readSIB(insn) || readDisplacement(insn)) return -1; @@ -1458,6 +1454,8 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_VK1: \ case TYPE_VK8: \ case TYPE_VK16: \ + if (index > 7) \ + *valid = 0; \ return prefix##_K0 + index; \ case TYPE_MM64: \ return prefix##_MM0 + (index & 0x7); \ diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 1f8f9da..9e65050 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -406,6 +406,8 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_IMM64, "8-byte") \ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \ + ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \ + ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \ ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \ ENUM_ENTRY(TYPE_RM16, "2-byte") \ ENUM_ENTRY(TYPE_RM32, "4-byte") \ @@ -483,18 +485,6 @@ struct OperandSpecifier { uint8_t type; }; -// Indicates where the opcode modifier (if any) is to be found. 
Extended -// opcodes with AddRegFrm have the opcode modifier in the ModR/M byte. -#define MODIFIER_TYPES \ - ENUM_ENTRY(MODIFIER_NONE) - -#define ENUM_ENTRY(n) n, -enum ModifierType { - MODIFIER_TYPES - MODIFIER_max -}; -#undef ENUM_ENTRY - static const unsigned X86_MAX_OPERANDS = 6; /// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index db69485..af4399a 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -33,15 +33,12 @@ using namespace llvm; #define PRINT_ALIAS_INSTR #include "X86GenAsmWriter.inc" -void X86ATTInstPrinter::printRegName(raw_ostream &OS, - unsigned RegNo) const { - OS << markup("<reg:") - << '%' << getRegisterName(RegNo) - << markup(">"); +void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">"); } void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot) { + StringRef Annot, const MCSubtargetInfo &STI) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); uint64_t TSFlags = Desc.TSFlags; @@ -51,7 +48,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); if (TSFlags & X86II::LOCK) - OS << "\tlock\n"; + OS << "\tlock\t"; // Output CALLpcrel32 as "callq" in 64-bit mode. // In Intel annotation it's always emitted as "call". @@ -60,7 +57,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // InstrInfo.td as soon as Requires clause is supported properly // for InstAlias. if (MI->getOpcode() == X86::CALLpcrel32 && - (getAvailableFeatures() & X86::Mode64Bit) != 0) { + (STI.getFeatureBits()[X86::Mode64Bit])) { OS << "\tcallq\t"; printPCRelImm(MI, 0, OS); } @@ -72,7 +69,9 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, printAnnotation(OS, Annot); } -static void printSSEAVXCC(int64_t Imm, raw_ostream &O) { +void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); switch (Imm) { default: llvm_unreachable("Invalid ssecc/avxcc argument!"); case 0: O << "eq"; break; @@ -110,22 +109,24 @@ static void printSSEAVXCC(int64_t Imm, raw_ostream &O) { } } -void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, +void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm(); - assert((Imm & 0x7) == Imm); // Ensure valid immediate. - printSSEAVXCC(Imm, O); -} - -void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm(); - assert((Imm & 0x1f) == Imm); // Ensure valid immediate. 
- printSSEAVXCC(Imm, O); + switch (Imm) { + default: llvm_unreachable("Invalid xopcc argument!"); + case 0: O << "lt"; break; + case 1: O << "le"; break; + case 2: O << "gt"; break; + case 3: O << "ge"; break; + case 4: O << "eq"; break; + case 5: O << "neq"; break; + case 6: O << "false"; break; + case 7: O << "true"; break; + } } void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, - raw_ostream &O) { + raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm() & 0x3; switch (Imm) { case 0: O << "{rn-sae}"; break; @@ -165,8 +166,7 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, printRegName(O, Op.getReg()); } else if (Op.isImm()) { // Print X86 immediates as signed values. - O << markup("<imm:") - << '$' << formatImm((int64_t)Op.getImm()) + O << markup("<imm:") << '$' << formatImm((int64_t)Op.getImm()) << markup(">"); // If there are no instruction-specific comments, add a comment clarifying @@ -178,24 +178,22 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << markup("<imm:") - << '$' << *Op.getExpr() - << markup(">"); + O << markup("<imm:") << '$' << *Op.getExpr() << markup(">"); } } void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O) { - const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); - const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); - const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); - const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); + const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg); + const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg); + const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp); + const MCOperand &SegReg = MI->getOperand(Op + X86::AddrSegmentReg); O << markup("<mem:"); // If this has a segment register, print it. if (SegReg.getReg()) { - printOperand(MI, Op+X86::AddrSegmentReg, O); + printOperand(MI, Op + X86::AddrSegmentReg, O); O << ':'; } @@ -211,16 +209,14 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, if (IndexReg.getReg() || BaseReg.getReg()) { O << '('; if (BaseReg.getReg()) - printOperand(MI, Op+X86::AddrBaseReg, O); + printOperand(MI, Op + X86::AddrBaseReg, O); if (IndexReg.getReg()) { O << ','; - printOperand(MI, Op+X86::AddrIndexReg, O); - unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + printOperand(MI, Op + X86::AddrIndexReg, O); + unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm(); if (ScaleVal != 1) { - O << ',' - << markup("<imm:") - << ScaleVal // never printed in hex. + O << ',' << markup("<imm:") << ScaleVal // never printed in hex. << markup(">"); } } @@ -232,13 +228,13 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O) { - const MCOperand &SegReg = MI->getOperand(Op+1); + const MCOperand &SegReg = MI->getOperand(Op + 1); O << markup("<mem:"); // If this has a segment register, print it. 
if (SegReg.getReg()) { - printOperand(MI, Op+1, O); + printOperand(MI, Op + 1, O); O << ':'; } @@ -263,13 +259,13 @@ void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, raw_ostream &O) { const MCOperand &DispSpec = MI->getOperand(Op); - const MCOperand &SegReg = MI->getOperand(Op+1); + const MCOperand &SegReg = MI->getOperand(Op + 1); O << markup("<mem:"); // If this has a segment register, print it. if (SegReg.getReg()) { - printOperand(MI, Op+1, O); + printOperand(MI, Op + 1, O); O << ':'; } @@ -282,3 +278,9 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, O << markup(">"); } + +void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff) + << markup(">"); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 649dc84..62b6b73 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -24,14 +24,12 @@ class MCOperand; class X86ATTInstPrinter final : public MCInstPrinter { public: X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) - : MCInstPrinter(MAI, MII, MRI) { - // Initialize the set of available features. - setAvailableFeatures(STI.getFeatureBits()); - } + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, + const MCSubtargetInfo &STI) override; // Autogenerated by tblgen, returns true if we successfully printed an // alias. @@ -45,13 +43,14 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS); - void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS); void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS); void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); @@ -141,7 +140,6 @@ public: private: bool HasCustomInstComment; }; - } #endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp index a4c0ca8..3cad9fa 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -21,6 +21,92 @@ using namespace llvm; +/// \brief Extracts the src/dst types for a given zero extension instruction. 
+/// \note While the number of elements in DstVT type correct, the +/// number in the SrcVT type is expanded to fill the src xmm register and the +/// upper elements may not be included in the dst xmm/ymm register. +static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown zero extension instruction"); + // i8 zero extension + case X86::PMOVZXBWrm: + case X86::PMOVZXBWrr: + case X86::VPMOVZXBWrm: + case X86::VPMOVZXBWrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v8i16; + break; + case X86::VPMOVZXBWYrm: + case X86::VPMOVZXBWYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v16i16; + break; + case X86::PMOVZXBDrm: + case X86::PMOVZXBDrr: + case X86::VPMOVZXBDrm: + case X86::VPMOVZXBDrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v4i32; + break; + case X86::VPMOVZXBDYrm: + case X86::VPMOVZXBDYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v8i32; + break; + case X86::PMOVZXBQrm: + case X86::PMOVZXBQrr: + case X86::VPMOVZXBQrm: + case X86::VPMOVZXBQrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXBQYrm: + case X86::VPMOVZXBQYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v4i64; + break; + // i16 zero extension + case X86::PMOVZXWDrm: + case X86::PMOVZXWDrr: + case X86::VPMOVZXWDrm: + case X86::VPMOVZXWDrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v4i32; + break; + case X86::VPMOVZXWDYrm: + case X86::VPMOVZXWDYrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v8i32; + break; + case X86::PMOVZXWQrm: + case X86::PMOVZXWQrr: + case X86::VPMOVZXWQrm: + case X86::VPMOVZXWQrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXWQYrm: + case X86::VPMOVZXWQYrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v4i64; + break; + // i32 zero extension + case X86::PMOVZXDQrm: + case X86::PMOVZXDQrr: + case X86::VPMOVZXDQrm: + case X86::VPMOVZXDQrr: + SrcVT = MVT::v4i32; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXDQYrm: + case X86::VPMOVZXDQYrr: + SrcVT = MVT::v4i32; + DstVT = MVT::v4i64; + break; + } +} + //===----------------------------------------------------------------------===// // Top Level Entrypoint //===----------------------------------------------------------------------===// @@ -45,9 +131,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::BLENDPDrmi: case X86::VBLENDPDrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeBLENDMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -56,9 +142,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VBLENDPDYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeBLENDMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -70,9 +156,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. 
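getZeroExtensionTypes() above only records the source and destination vector types; the comment printer then describes a PMOVZX as a widening shuffle. A scalar model of what, for example, PMOVZXBW computes (illustrative only, not the LLVM decoder):

#include <array>
#include <cstdint>
#include <iostream>

// PMOVZXBW reads a v16i8 source register but only widens its low 8 bytes into
// a v8i16 destination, zero-filling the upper bits of each element. The upper
// 8 source bytes are ignored, which is why the \note above says the SrcVT
// element count covers the whole xmm register.
static std::array<uint16_t, 8> pmovzxbw(const std::array<uint8_t, 16> &Src) {
  std::array<uint16_t, 8> Dst{};
  for (unsigned i = 0; i != 8; ++i)
    Dst[i] = Src[i]; // implicit zero extension u8 -> u16
  return Dst;
}

int main() {
  std::array<uint8_t, 16> Src{};
  for (unsigned i = 0; i != 16; ++i)
    Src[i] = uint8_t(0xF0 + i);
  for (uint16_t v : pmovzxbw(Src))
    std::cout << v << ' '; // 240 ... 247
  std::cout << '\n';
}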
case X86::BLENDPSrmi: case X86::VBLENDPSrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeBLENDMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -81,9 +167,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VBLENDPSYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeBLENDMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -95,9 +181,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::PBLENDWrmi: case X86::VPBLENDWrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeBLENDMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -106,9 +192,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VPBLENDWYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeBLENDMask(MVT::v16i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -118,9 +204,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VPBLENDDrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeBLENDMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -130,9 +216,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
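The BLENDPS/PBLENDW cases feed DecodeBLENDMask. The sketch below assumes its behaviour is: bit i of the immediate picks element i of the second source (indices NumElts..2*NumElts-1), otherwise element i of the first source. A standalone sketch under that assumption, not llvm::DecodeBLENDMask itself:

#include <cstdint>
#include <iostream>
#include <vector>

// Decode a blend immediate into a shuffle-comment mask.
static std::vector<int> decodeBlendMask(unsigned NumElts, uint8_t Imm) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElts; ++i)
    Mask.push_back(((Imm >> i) & 1) ? int(NumElts + i) : int(i));
  return Mask;
}

int main() {
  // blendps $0x5: lanes 0 and 2 come from the second source.
  for (int M : decodeBlendMask(4, 0x5))
    std::cout << M << ' '; // 4 1 6 3
  std::cout << '\n';
}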
case X86::VPBLENDDYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeBLENDMask(MVT::v8i32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -146,8 +232,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VINSERTPSrm: DestName = getRegName(MI->getOperand(0).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; @@ -203,22 +289,40 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); break; + case X86::VMOVDDUPYrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VMOVDDUPYrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask); + break; + + case X86::MOVDDUPrr: + case X86::VMOVDDUPrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVDDUPrm: + case X86::VMOVDDUPrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask); + break; + case X86::PSLLDQri: case X86::VPSLLDQri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSLLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::VPSLLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSLLDQMask(MVT::v32i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; @@ -226,18 +330,18 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSRLDQri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSRLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::VPSRLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSRLDQMask(MVT::v32i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; @@ -249,9 +353,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPALIGNR128rm: Src2Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePALIGNRMask(MVT::v16i8, - 
MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::VPALIGNR256rr: @@ -260,9 +364,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPALIGNR256rm: Src2Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePALIGNRMask(MVT::v32i8, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; @@ -273,9 +377,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSHUFDmi: case X86::VPSHUFDmi: DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::VPSHUFDYri: @@ -283,13 +387,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::VPSHUFDYmi: DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFMask(MVT::v8i32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; - case X86::PSHUFHWri: case X86::VPSHUFHWri: Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -297,9 +400,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSHUFHWmi: case X86::VPSHUFHWmi: DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFHWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::VPSHUFHWYri: @@ -307,9 +410,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::VPSHUFHWYmi: DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFHWMask(MVT::v16i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFLWri: @@ -319,9 +422,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSHUFLWmi: case X86::VPSHUFLWmi: DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFLWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::VPSHUFLWYri: @@ -329,9 +432,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. 
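The PSHUFD/PSHUFHW/PSHUFLW cases above all decode an immediate whose 2-bit fields are source element indices. A standalone sketch of the PSHUFD flavour, with the 256-bit forms assumed to repeat the same immediate per 128-bit lane (illustrative, not llvm::DecodePSHUFMask):

#include <cstdint>
#include <iostream>
#include <vector>

// Each destination element takes a 2-bit field of the immediate as its source
// index within its 128-bit lane.
static std::vector<int> decodePSHUFMask(unsigned NumElts, uint8_t Imm) {
  const unsigned LaneElts = 4; // 4 x 32-bit elements per 128-bit lane
  std::vector<int> Mask;
  for (unsigned l = 0; l != NumElts; l += LaneElts)
    for (unsigned i = 0; i != LaneElts; ++i)
      Mask.push_back(int(l + ((Imm >> (2 * i)) & 0x3)));
  return Mask;
}

int main() {
  // pshufd $0x1b reverses the four dwords.
  for (int M : decodePSHUFMask(4, 0x1b))
    std::cout << M << ' '; // 3 2 1 0
  std::cout << '\n';
}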
case X86::VPSHUFLWYmi: DestName = getRegName(MI->getOperand(0).getReg()); - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFLWMask(MVT::v16i16, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; @@ -519,9 +622,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::SHUFPDrmi: case X86::VSHUFPDrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeSHUFPMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -530,9 +633,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VSHUFPDYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeSHUFPMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -544,9 +647,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::SHUFPSrmi: case X86::VSHUFPSrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeSHUFPMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -555,9 +658,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VSHUFPSYrmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeSHUFPMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -671,9 +774,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::VPERMILPSmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -681,9 +784,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
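For SHUFPS the comment mask takes the low half of the result from the first source and the high half from the second, each element selected by a 2-bit immediate field. A sketch of the v4f32 case, with second-source elements numbered 4..7 as in the printed comments (illustrative, not llvm::DecodeSHUFPMask):

#include <cstdint>
#include <iostream>
#include <vector>

// Decode a SHUFPS immediate into a comment mask for one 128-bit lane.
static std::vector<int> decodeSHUFPSMask(uint8_t Imm) {
  std::vector<int> Mask;
  Mask.push_back((Imm >> 0) & 0x3);       // from src1
  Mask.push_back((Imm >> 2) & 0x3);       // from src1
  Mask.push_back(4 + ((Imm >> 4) & 0x3)); // from src2
  Mask.push_back(4 + ((Imm >> 6) & 0x3)); // from src2
  return Mask;
}

int main() {
  // shufps $0xb1 -> fields 1,0,3,2
  for (int M : decodeSHUFPSMask(0xb1))
    std::cout << M << ' '; // 1 0 7 6
  std::cout << '\n';
}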
case X86::VPERMILPSYmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -691,9 +794,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::VPERMILPDmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -701,9 +804,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::VPERMILPDYmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodePSHUFMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -714,9 +817,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPERM2F128rm: case X86::VPERM2I128rm: // For instruction comments purpose, assume the 256-bit vector is v4i64. - if(MI->getOperand(MI->getNumOperands()-1).isImm()) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) DecodeVPERM2X128Mask(MVT::v4i64, - MI->getOperand(MI->getNumOperands()-1).getImm(), + MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -727,11 +830,97 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::VPERMQYmi: case X86::VPERMPDYmi: - if(MI->getOperand(MI->getNumOperands()-1).isImm()) - DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeVPERMMask(MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + + case X86::MOVSDrr: + case X86::VMOVSDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVSDrm: + case X86::VMOVSDrm: + DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::MOVSSrr: + case X86::VMOVSSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVSSrm: + case X86::VMOVSSrm: + DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVPQI2QIrr: + case X86::MOVZPQILo2PQIrr: + case X86::VMOVPQI2QIrr: + case X86::VMOVZPQILo2PQIrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
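VPERM2F128/VPERM2I128 comments treat the 256-bit value as v4i64, as the code above notes, and pick each 128-bit half of the result from one of the four input halves. A sketch of that decoding, with the zeroing bits 3 and 7 shown as -1 lanes (a simplification of mine, not llvm::DecodeVPERM2X128Mask):

#include <cstdint>
#include <iostream>
#include <vector>

// Each half of the immediate selects a source half: 0/1 are the halves of the
// first source (elements 0..3), 2/3 the halves of the second (elements 4..7).
static std::vector<int> decodeVPerm2x128Mask(uint8_t Imm) {
  std::vector<int> Mask;
  for (unsigned Half = 0; Half != 2; ++Half) {
    unsigned Ctrl = (Imm >> (4 * Half)) & 0xf;
    for (unsigned i = 0; i != 2; ++i)
      Mask.push_back((Ctrl & 0x8) ? -1 : int((Ctrl & 0x3) * 2 + i));
  }
  return Mask;
}

int main() {
  // vperm2f128 $0x21: low half <- src1.hi (elts 2,3), high half <- src2.lo (elts 4,5)
  for (int M : decodeVPerm2x128Mask(0x21))
    std::cout << M << ' '; // 2 3 4 5
  std::cout << '\n';
}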
+ case X86::MOVQI2PQIrm: + case X86::MOVZQI2PQIrm: + case X86::MOVZPQILo2PQIrm: + case X86::VMOVQI2PQIrm: + case X86::VMOVZQI2PQIrm: + case X86::VMOVZPQILo2PQIrm: + DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::MOVDI2PDIrm: + case X86::VMOVDI2PDIrm: + DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::PMOVZXBWrr: + case X86::PMOVZXBDrr: + case X86::PMOVZXBQrr: + case X86::PMOVZXWDrr: + case X86::PMOVZXWQrr: + case X86::PMOVZXDQrr: + case X86::VPMOVZXBWrr: + case X86::VPMOVZXBDrr: + case X86::VPMOVZXBQrr: + case X86::VPMOVZXWDrr: + case X86::VPMOVZXWQrr: + case X86::VPMOVZXDQrr: + case X86::VPMOVZXBWYrr: + case X86::VPMOVZXBDYrr: + case X86::VPMOVZXBQYrr: + case X86::VPMOVZXWDYrr: + case X86::VPMOVZXWQYrr: + case X86::VPMOVZXDQYrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PMOVZXBWrm: + case X86::PMOVZXBDrm: + case X86::PMOVZXBQrm: + case X86::PMOVZXWDrm: + case X86::PMOVZXWQrm: + case X86::PMOVZXDQrm: + case X86::VPMOVZXBWrm: + case X86::VPMOVZXBDrm: + case X86::VPMOVZXBQrm: + case X86::VPMOVZXWDrm: + case X86::VPMOVZXWQrm: + case X86::VPMOVZXDQrm: + case X86::VPMOVZXBWYrm: + case X86::VPMOVZXBDYrm: + case X86::VPMOVZXBQYrm: + case X86::VPMOVZXWDYrm: + case X86::VPMOVZXWQYrm: + case X86::VPMOVZXDQYrm: { + MVT SrcVT, DstVT; + getZeroExtensionTypes(MI, SrcVT, DstVT); + DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + } break; } // The only comments we decode are shuffles, so give up if we were unable to @@ -747,7 +936,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, if (Src1Name == Src2Name) { for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { if ((int)ShuffleMask[i] >= 0 && // Not sentinel. - ShuffleMask[i] >= (int)e) // From second mask. + ShuffleMask[i] >= (int)e) // From second mask. ShuffleMask[i] -= e; } } @@ -782,7 +971,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, ++i; } OS << ']'; - --i; // For loop increments element #. + --i; // For loop increments element #. 
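The new PMOVZX handling combines getZeroExtensionTypes() with DecodeZeroExtendMask. The sketch below assumes the mask is expressed in units of the source element type, each kept element followed by enough zero lanes to pad out the widened destination element; that convention is an assumption of this sketch, not something stated in the patch:

#include <iostream>
#include <vector>

// Build a zero-extension comment mask from the element widths and the number
// of destination elements; -1 marks a zero lane.
static std::vector<int> decodeZeroExtendMask(unsigned SrcBits, unsigned DstBits,
                                             unsigned NumDstElts) {
  unsigned Scale = DstBits / SrcBits;
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    Mask.push_back(int(i));
    for (unsigned j = 1; j != Scale; ++j)
      Mask.push_back(-1); // zero element
  }
  return Mask;
}

int main() {
  // pmovzxbd: v16i8 -> v4i32, Scale = 4
  for (int M : decodeZeroExtendMask(8, 32, 4))
    std::cout << M << ' '; // 0 -1 -1 -1 1 -1 -1 -1 2 -1 -1 -1 3 -1 -1 -1
  std::cout << '\n';
}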
} //MI->print(OS, 0); OS << "\n"; diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 449445d..4d92daf 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -33,7 +33,8 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { } void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot) { + StringRef Annot, + const MCSubtargetInfo &STI) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); uint64_t TSFlags = Desc.TSFlags; @@ -50,7 +51,9 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } -static void printSSEAVXCC(int64_t Imm, raw_ostream &O) { +void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); switch (Imm) { default: llvm_unreachable("Invalid avxcc argument!"); case 0: O << "eq"; break; @@ -88,22 +91,24 @@ static void printSSEAVXCC(int64_t Imm, raw_ostream &O) { } } -void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm(); - assert((Imm & 0x7) == Imm); // Ensure valid immediate. - printSSEAVXCC(Imm, O); -} - -void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, +void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm(); - assert((Imm & 0x1f) == Imm); // Ensure valid immediate. - printSSEAVXCC(Imm, O); + switch (Imm) { + default: llvm_unreachable("Invalid xopcc argument!"); + case 0: O << "lt"; break; + case 1: O << "le"; break; + case 2: O << "gt"; break; + case 3: O << "ge"; break; + case 4: O << "eq"; break; + case 5: O << "neq"; break; + case 6: O << "false"; break; + case 7: O << "true"; break; + } } void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, - raw_ostream &O) { + raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm() & 0x3; switch (Imm) { case 0: O << "{rn-sae}"; break; @@ -245,3 +250,8 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, O << ']'; } + +void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << formatImm(MI->getOperand(Op).getImm() & 0xff); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 641423d..6e371da 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -28,7 +28,8 @@ public: : MCInstPrinter(MAI, MII, MRI) {} void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, + const MCSubtargetInfo &STI) override; // Autogenerated by tblgen. 
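printRoundingControl in both printers keys off only the low two bits of the operand. A standalone restatement of that table (illustrative helper, not the LLVM printer):

#include <array>
#include <cstdint>
#include <iostream>

// Only bits 0-1 select the embedded AVX-512 rounding mode.
static const char *roundingControlName(int64_t Imm) {
  static const std::array<const char *, 4> Names = {"{rn-sae}", "{rd-sae}",
                                                    "{ru-sae}", "{rz-sae}"};
  return Names[Imm & 0x3];
}

int main() {
  for (int64_t Imm = 0; Imm != 4; ++Imm)
    std::cout << roundingControlName(Imm) << '\n';
}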
void printInstruction(const MCInst *MI, raw_ostream &O); @@ -36,13 +37,14 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); - void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); - void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O); void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O); void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 164b419..6d4284d 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -14,8 +14,10 @@ #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" @@ -76,8 +78,8 @@ class X86AsmBackend : public MCAsmBackend { bool HasNopl; const uint64_t MaxNopLength; public: - X86AsmBackend(const Target &T, StringRef _CPU) - : MCAsmBackend(), CPU(_CPU), MaxNopLength(_CPU == "slm" ? 7 : 15) { + X86AsmBackend(const Target &T, StringRef CPU) + : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 
7 : 15) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && @@ -351,8 +353,8 @@ namespace { class ELFX86AsmBackend : public X86AsmBackend { public: uint8_t OSABI; - ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU) - : X86AsmBackend(T, CPU), OSABI(_OSABI) {} + ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : X86AsmBackend(T, CPU), OSABI(OSABI) {} }; class ELFX86_32AsmBackend : public ELFX86AsmBackend { @@ -360,7 +362,7 @@ public: ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) : ELFX86AsmBackend(T, OSABI, CPU) {} - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386); } }; @@ -370,7 +372,7 @@ public: ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) : ELFX86AsmBackend(T, OSABI, CPU) {} - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_X86_64); } @@ -381,7 +383,7 @@ public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) : ELFX86AsmBackend(T, OSABI, CPU) {} - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64); } }; @@ -395,7 +397,7 @@ public: , Is64Bit(is64Bit) { } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createX86WinCOFFObjectWriter(OS, Is64Bit); } }; @@ -752,7 +754,7 @@ public: StringRef CPU) : DarwinX86AsmBackend(T, MRI, CPU, false) {} - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_I386, MachO::CPU_SUBTYPE_I386_ALL); @@ -772,24 +774,11 @@ public: StringRef CPU, MachO::CPUSubTypeX86 st) : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {} - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/true, MachO::CPU_TYPE_X86_64, Subtype); } - bool doesSectionRequireSymbols(const MCSection &Section) const override { - // Temporary labels in the string literals sections require symbols. The - // issue is that the x86_64 relocation format does not allow symbol + - // offset, and so the linker does not have enough information to resolve the - // access to the appropriate atom unless an external relocation is used. For - // non-cstring sections, we expect the compiler to use a non-temporary label - // for anything that could have an addend pointing outside the symbol. - // - // See <rdar://problem/4765733>. - const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); - return SMO.getType() == MachO::S_CSTRING_LITERALS; - } - /// \brief Generate the compact unwind encoding for the CFI instructions. 
uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 28be15b..85b0006 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -213,7 +213,11 @@ namespace X86II { /// the offset from beginning of section. /// /// This is the TLS offset for the COFF/Windows TLS mechanism. - MO_SECREL + MO_SECREL, + + /// MO_NOPREFIX - On a symbol operand this indicates that the symbol should + /// not be mangled with a prefix. + MO_NOPREFIX, }; enum : uint64_t { @@ -302,19 +306,21 @@ namespace X86II { //// MRM_XX - A mod/rm byte of exactly 0xXX. MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, - MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, - MRM_CB = 40, MRM_CF = 41, MRM_D0 = 42, MRM_D1 = 43, - MRM_D4 = 44, MRM_D5 = 45, MRM_D6 = 46, MRM_D7 = 47, - MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50, MRM_DB = 51, - MRM_DC = 52, MRM_DD = 53, MRM_DE = 54, MRM_DF = 55, - MRM_E0 = 56, MRM_E1 = 57, MRM_E2 = 58, MRM_E3 = 59, - MRM_E4 = 60, MRM_E5 = 61, MRM_E8 = 62, MRM_E9 = 63, - MRM_EA = 64, MRM_EB = 65, MRM_EC = 66, MRM_ED = 67, - MRM_EE = 68, MRM_F0 = 69, MRM_F1 = 70, MRM_F2 = 71, - MRM_F3 = 72, MRM_F4 = 73, MRM_F5 = 74, MRM_F6 = 75, - MRM_F7 = 76, MRM_F8 = 77, MRM_F9 = 78, MRM_FA = 79, - MRM_FB = 80, MRM_FC = 81, MRM_FD = 82, MRM_FE = 83, - MRM_FF = 84, + MRM_C4 = 36, MRM_C5 = 37, MRM_C6 = 38, MRM_C7 = 39, + MRM_C8 = 40, MRM_C9 = 41, MRM_CA = 42, MRM_CB = 43, + MRM_CC = 44, MRM_CD = 45, MRM_CE = 46, MRM_CF = 47, + MRM_D0 = 48, MRM_D1 = 49, MRM_D2 = 50, MRM_D3 = 51, + MRM_D4 = 52, MRM_D5 = 53, MRM_D6 = 54, MRM_D7 = 55, + MRM_D8 = 56, MRM_D9 = 57, MRM_DA = 58, MRM_DB = 59, + MRM_DC = 60, MRM_DD = 61, MRM_DE = 62, MRM_DF = 63, + MRM_E0 = 64, MRM_E1 = 65, MRM_E2 = 66, MRM_E3 = 67, + MRM_E4 = 68, MRM_E5 = 69, MRM_E6 = 70, MRM_E7 = 71, + MRM_E8 = 72, MRM_E9 = 73, MRM_EA = 74, MRM_EB = 75, + MRM_EC = 76, MRM_ED = 77, MRM_EE = 78, MRM_EF = 79, + MRM_F0 = 80, MRM_F1 = 81, MRM_F2 = 82, MRM_F3 = 83, + MRM_F4 = 84, MRM_F5 = 85, MRM_F6 = 86, MRM_F7 = 87, + MRM_F8 = 88, MRM_F9 = 89, MRM_FA = 90, MRM_FB = 91, + MRM_FC = 92, MRM_FD = 93, MRM_FE = 94, MRM_FF = 95, FormMask = 127, diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index e8b0b4c..4508883 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -22,7 +22,8 @@ namespace { public: X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine); - virtual ~X86ELFObjectWriter(); + ~X86ELFObjectWriter() override; + protected: unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; @@ -38,236 +39,218 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, X86ELFObjectWriter::~X86ELFObjectWriter() {} -unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { - // determine the type of the relocation - - MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); - unsigned Type; - if (getEMachine() == ELF::EM_X86_64) { - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); +enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; - case FK_Data_8: Type = 
ELF::R_X86_64_PC64; break; - case FK_Data_4: Type = ELF::R_X86_64_PC32; break; - case FK_Data_2: Type = ELF::R_X86_64_PC16; break; - case FK_Data_1: Type = ELF::R_X86_64_PC8; break; +static X86_64RelType getType64(unsigned Kind, + MCSymbolRefExpr::VariantKind &Modifier, + bool &IsPCRel) { + switch (Kind) { + default: + llvm_unreachable("Unimplemented"); + case X86::reloc_global_offset_table8: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_64; + case FK_Data_8: + return RT64_64; + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel) + return RT64_32S; + return RT64_32; + case X86::reloc_global_offset_table: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_32; + case FK_Data_4: + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return RT64_32; + case FK_Data_2: + return RT64_16; + case FK_PCRel_1: + case FK_Data_1: + return RT64_8; + } +} - case FK_PCRel_8: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC64; - break; - case X86::reloc_signed_4byte: - case X86::reloc_riprel_4byte_movq_load: - case X86::reloc_riprel_4byte: - case FK_PCRel_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_PC32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_X86_64_PLT32; - break; - case MCSymbolRefExpr::VK_GOTPCREL: - Type = ELF::R_X86_64_GOTPCREL; - break; - case MCSymbolRefExpr::VK_GOTTPOFF: - Type = ELF::R_X86_64_GOTTPOFF; - break; - case MCSymbolRefExpr::VK_TLSGD: - Type = ELF::R_X86_64_TLSGD; - break; - case MCSymbolRefExpr::VK_TLSLD: - Type = ELF::R_X86_64_TLSLD; - break; - } - break; - case FK_PCRel_2: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC16; - break; - case FK_PCRel_1: - assert(Modifier == MCSymbolRefExpr::VK_None); - Type = ELF::R_X86_64_PC8; - break; - } - } else { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); - case X86::reloc_global_offset_table8: - Type = ELF::R_X86_64_GOTPC64; - break; - case X86::reloc_global_offset_table: - Type = ELF::R_X86_64_GOTPC32; - break; - case FK_Data_8: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_64; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_X86_64_GOT64; - break; - case MCSymbolRefExpr::VK_GOTOFF: - Type = ELF::R_X86_64_GOTOFF64; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_X86_64_TPOFF64; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_X86_64_DTPOFF64; - break; - } - break; - case X86::reloc_signed_4byte: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_X86_64_32S; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_X86_64_GOT32; - break; - case MCSymbolRefExpr::VK_GOTPCREL: - Type = ELF::R_X86_64_GOTPCREL; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_X86_64_TPOFF32; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_X86_64_DTPOFF32; - break; - } - break; - case FK_Data_4: - Type = ELF::R_X86_64_32; - break; - case FK_Data_2: Type = ELF::R_X86_64_16; break; - case FK_PCRel_1: - case FK_Data_1: Type = ELF::R_X86_64_8; break; - } +static unsigned getRelocType64(MCSymbolRefExpr::VariantKind Modifier, + X86_64RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + 
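getType64() above is the first half of the refactored relocation selection: it reduces a fixup kind to a size class before the symbol modifier is considered. A reduced standalone sketch of that classification (simplified enums of my own, not the LLVM fixup kinds):

#include <iostream>

enum FixupKind { Data_8, Signed_4, Data_4, PCRel_4, Data_2, Data_1 };
enum RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };

// Classify a fixup into a size class; a signed 4-byte field only stays 32S
// when there is no modifier and it is not PC-relative, as in getType64 above.
static RelType classify(FixupKind Kind, bool HasModifier, bool IsPCRel) {
  switch (Kind) {
  case Data_8:   return RT64_64;
  case Signed_4: return (!HasModifier && !IsPCRel) ? RT64_32S : RT64_32;
  case Data_4:
  case PCRel_4:  return RT64_32;
  case Data_2:   return RT64_16;
  case Data_1:   return RT64_8;
  }
  return RT64_32;
}

int main() {
  std::cout << (classify(Signed_4, false, false) == RT64_32S) << '\n'; // 1
  std::cout << (classify(Data_4, false, true) == RT64_32) << '\n';     // 1
}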
switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; + case RT64_32: + return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32; + case RT64_32S: + return ELF::R_X86_64_32S; + case RT64_16: + return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16; + case RT64_8: + return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8; } - } else if (getEMachine() == ELF::EM_386) { - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); - - case X86::reloc_global_offset_table: - Type = ELF::R_386_GOTPC; - break; - - case FK_PCRel_1: - case FK_Data_1: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC8; - break; - } - break; - - case FK_PCRel_2: - case FK_Data_2: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC16; - break; - } - break; + case MCSymbolRefExpr::VK_GOT: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64; + case RT64_32: + return IsPCRel ? ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT64_64); + assert(!IsPCRel); + return ELF::R_X86_64_GOTOFF64; + case MCSymbolRefExpr::VK_TPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_TPOFF64; + case RT64_32: + return ELF::R_X86_64_TPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_DTPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_DTPOFF64; + case RT64_32: + return ELF::R_X86_64_DTPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_SIZE: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_SIZE64; + case RT64_32: + return ELF::R_X86_64_SIZE32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSGD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTTPOFF; + case MCSymbolRefExpr::VK_TLSLD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSLD; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT64_32); + return ELF::R_X86_64_PLT32; + case MCSymbolRefExpr::VK_GOTPCREL: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTPCREL; + } +} - case X86::reloc_signed_4byte: - case FK_PCRel_4: - case FK_Data_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_PC32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_386_PLT32; - break; - } - break; - } - } else { - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("invalid fixup kind!"); +enum X86_32RelType { RT32_32, RT32_16, RT32_8 }; - case X86::reloc_global_offset_table: - Type = ELF::R_386_GOTPC; - break; +static X86_32RelType getType32(X86_64RelType T) { + switch (T) { + case RT64_64: + llvm_unreachable("Unimplemented"); + case RT64_32: + case RT64_32S: + return RT32_32; + case RT64_16: + return RT32_16; + case RT64_8: + return RT32_8; + } + llvm_unreachable("unexpected relocation type!"); +} - // FIXME: Should we avoid selecting reloc_signed_4byte in 32 bit mode - // instead? 
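getType32() then folds the 64-bit size classes onto the three that exist for i386 and rejects 64-bit-wide fixups outright. A standalone restatement (illustrative enums, not the LLVM ones):

#include <cassert>
#include <iostream>

enum RelType64 { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
enum RelType32 { RT32_32, RT32_16, RT32_8 };

// Narrow a 64-bit size class to its i386 counterpart; 64-bit fields have none.
static RelType32 narrowTo32(RelType64 T) {
  switch (T) {
  case RT64_32:
  case RT64_32S: return RT32_32;
  case RT64_16:  return RT32_16;
  case RT64_8:   return RT32_8;
  case RT64_64:  break;
  }
  assert(false && "64-bit relocation has no i386 counterpart");
  return RT32_32;
}

int main() {
  std::cout << (narrowTo32(RT64_32S) == RT32_32) << '\n'; // 1
}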
- case X86::reloc_signed_4byte: - case FK_PCRel_4: - case FK_Data_4: - switch (Modifier) { - default: - llvm_unreachable("Unimplemented"); - case MCSymbolRefExpr::VK_None: - Type = ELF::R_386_32; - break; - case MCSymbolRefExpr::VK_GOT: - Type = ELF::R_386_GOT32; - break; - case MCSymbolRefExpr::VK_PLT: - Type = ELF::R_386_PLT32; - break; - case MCSymbolRefExpr::VK_GOTOFF: - Type = ELF::R_386_GOTOFF; - break; - case MCSymbolRefExpr::VK_TLSGD: - Type = ELF::R_386_TLS_GD; - break; - case MCSymbolRefExpr::VK_TPOFF: - Type = ELF::R_386_TLS_LE_32; - break; - case MCSymbolRefExpr::VK_INDNTPOFF: - Type = ELF::R_386_TLS_IE; - break; - case MCSymbolRefExpr::VK_NTPOFF: - Type = ELF::R_386_TLS_LE; - break; - case MCSymbolRefExpr::VK_GOTNTPOFF: - Type = ELF::R_386_TLS_GOTIE; - break; - case MCSymbolRefExpr::VK_TLSLDM: - Type = ELF::R_386_TLS_LDM; - break; - case MCSymbolRefExpr::VK_DTPOFF: - Type = ELF::R_386_TLS_LDO_32; - break; - case MCSymbolRefExpr::VK_GOTTPOFF: - Type = ELF::R_386_TLS_IE_32; - break; - } - break; - case FK_Data_2: Type = ELF::R_386_16; break; - case FK_PCRel_1: - case FK_Data_1: Type = ELF::R_386_8; break; - } +static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier, + X86_32RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT32_32: + return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32; + case RT32_16: + return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16; + case RT32_8: + return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8; } - } else - llvm_unreachable("Unsupported ELF machine type."); + case MCSymbolRefExpr::VK_GOT: + assert(Type == RT32_32); + return IsPCRel ? ELF::R_386_GOTPC : ELF::R_386_GOT32; + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_GOTOFF; + case MCSymbolRefExpr::VK_TPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE_32; + case MCSymbolRefExpr::VK_DTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDO_32; + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE_32; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT32_32); + return ELF::R_386_PLT32; + case MCSymbolRefExpr::VK_INDNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE; + case MCSymbolRefExpr::VK_NTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE; + case MCSymbolRefExpr::VK_GOTNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GOTIE; + case MCSymbolRefExpr::VK_TLSLDM: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDM; + } +} + +unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + X86_64RelType Type = getType64(Fixup.getKind(), Modifier, IsPCRel); + if (getEMachine() == ELF::EM_X86_64) + return getRelocType64(Modifier, Type, IsPCRel); - return Type; + assert(getEMachine() == ELF::EM_386 && "Unsupported ELF machine type."); + return getRelocType32(Modifier, getType32(Type), IsPCRel); } -MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS, - bool IsELF64, - uint8_t OSABI, +MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS, + bool IsELF64, uint8_t OSABI, 
uint16_t EMachine) { MCELFObjectTargetWriter *MOTW = new X86ELFObjectWriter(IsELF64, OSABI, EMachine); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index b679316..a39def9 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -34,9 +34,9 @@ public: uint64_t SymSize; SymI->getSize(SymSize); int64_t Addend; getELFRelocationAddend(Rel, Addend); - MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); + MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. - if (Sym->isVariable() == false) + if (!Sym->isVariable()) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); const MCExpr *Expr = nullptr; diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 5795514..bda35f2 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -132,11 +132,10 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { PrivateLabelPrefix = ".L"; PointerSize = 8; WinEHEncodingType = WinEH::EncodingType::Itanium; - - // Use MSVC-compatible EH data. - ExceptionsType = ExceptionHandling::MSVC; } + ExceptionsType = ExceptionHandling::WinEH; + AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; @@ -155,7 +154,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { PrivateLabelPrefix = ".L"; PointerSize = 8; WinEHEncodingType = WinEH::EncodingType::Itanium; - ExceptionsType = ExceptionHandling::ItaniumWinEH; + ExceptionsType = ExceptionHandling::WinEH; } else { ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 5d92524..8aed7a4 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -30,8 +30,8 @@ using namespace llvm; namespace { class X86MCCodeEmitter : public MCCodeEmitter { - X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; - void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION; + X86MCCodeEmitter(const X86MCCodeEmitter &) = delete; + void operator=(const X86MCCodeEmitter &) = delete; const MCInstrInfo &MCII; MCContext &Ctx; public: @@ -39,18 +39,18 @@ public: : MCII(mcii), Ctx(ctx) { } - ~X86MCCodeEmitter() {} + ~X86MCCodeEmitter() override {} bool is64BitMode(const MCSubtargetInfo &STI) const { - return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + return STI.getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode(const MCSubtargetInfo &STI) const { - return (STI.getFeatureBits() & X86::Mode32Bit) != 0; + return STI.getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode(const MCSubtargetInfo &STI) const { - return (STI.getFeatureBits() & X86::Mode16Bit) != 0; + return STI.getFeatureBits()[X86::Mode16Bit]; } /// Is16BitMemOperand - Return true if the specified instruction has @@ -149,7 +149,7 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; @@ -168,10 +168,8 @@ public: } // end anonymous namespace - 
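The is64BitMode/is32BitMode/is16BitMode predicates above now index a feature bitset instead of AND-ing a 64-bit mask. A sketch of the difference using std::bitset and made-up feature indices (not the generated X86 feature enum):

#include <bitset>
#include <iostream>

// Subtarget features as a bitset indexed by feature number.
enum { Mode16Bit, Mode32Bit, Mode64Bit, NumFeatures };
using FeatureBitset = std::bitset<NumFeatures>;

static bool is64BitMode(const FeatureBitset &Features) {
  return Features[Mode64Bit]; // plain index, no single-bit mask constants
}

int main() {
  FeatureBitset Features;
  Features.set(Mode64Bit);
  std::cout << is64BitMode(Features) << '\n'; // 1
}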
MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx) { return new X86MCCodeEmitter(MCII, Ctx); } @@ -357,7 +355,7 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, Ctx); // Emit a symbolic constant as a fixup and 4 zeros. - Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind, Loc)); + Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc)); EmitConstant(0, Size, CurByte, OS); } @@ -1154,7 +1152,7 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, } void X86MCCodeEmitter:: -EncodeInstruction(const MCInst &MI, raw_ostream &OS, +encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { unsigned Opcode = MI.getOpcode(); @@ -1426,83 +1424,31 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, break; } case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: - case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: - case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: - case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: - case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: - case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: - case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: - case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: - case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: - case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: - case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: - case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: - case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: - case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: EmitByte(BaseOpcode, CurByte, OS); - unsigned char MRM; - switch (TSFlags & X86II::FormMask) { - default: llvm_unreachable("Invalid Form"); - case X86II::MRM_C0: MRM = 0xC0; break; - case X86II::MRM_C1: MRM = 0xC1; break; - case X86II::MRM_C2: MRM = 0xC2; break; - case X86II::MRM_C3: MRM = 0xC3; break; - case X86II::MRM_C4: MRM = 0xC4; break; - case X86II::MRM_C8: MRM = 0xC8; break; - case 
X86II::MRM_C9: MRM = 0xC9; break; - case X86II::MRM_CA: MRM = 0xCA; break; - case X86II::MRM_CB: MRM = 0xCB; break; - case X86II::MRM_CF: MRM = 0xCF; break; - case X86II::MRM_D0: MRM = 0xD0; break; - case X86II::MRM_D1: MRM = 0xD1; break; - case X86II::MRM_D4: MRM = 0xD4; break; - case X86II::MRM_D5: MRM = 0xD5; break; - case X86II::MRM_D6: MRM = 0xD6; break; - case X86II::MRM_D7: MRM = 0xD7; break; - case X86II::MRM_D8: MRM = 0xD8; break; - case X86II::MRM_D9: MRM = 0xD9; break; - case X86II::MRM_DA: MRM = 0xDA; break; - case X86II::MRM_DB: MRM = 0xDB; break; - case X86II::MRM_DC: MRM = 0xDC; break; - case X86II::MRM_DD: MRM = 0xDD; break; - case X86II::MRM_DE: MRM = 0xDE; break; - case X86II::MRM_DF: MRM = 0xDF; break; - case X86II::MRM_E0: MRM = 0xE0; break; - case X86II::MRM_E1: MRM = 0xE1; break; - case X86II::MRM_E2: MRM = 0xE2; break; - case X86II::MRM_E3: MRM = 0xE3; break; - case X86II::MRM_E4: MRM = 0xE4; break; - case X86II::MRM_E5: MRM = 0xE5; break; - case X86II::MRM_E8: MRM = 0xE8; break; - case X86II::MRM_E9: MRM = 0xE9; break; - case X86II::MRM_EA: MRM = 0xEA; break; - case X86II::MRM_EB: MRM = 0xEB; break; - case X86II::MRM_EC: MRM = 0xEC; break; - case X86II::MRM_ED: MRM = 0xED; break; - case X86II::MRM_EE: MRM = 0xEE; break; - case X86II::MRM_F0: MRM = 0xF0; break; - case X86II::MRM_F1: MRM = 0xF1; break; - case X86II::MRM_F2: MRM = 0xF2; break; - case X86II::MRM_F3: MRM = 0xF3; break; - case X86II::MRM_F4: MRM = 0xF4; break; - case X86II::MRM_F5: MRM = 0xF5; break; - case X86II::MRM_F6: MRM = 0xF6; break; - case X86II::MRM_F7: MRM = 0xF7; break; - case X86II::MRM_F8: MRM = 0xF8; break; - case X86II::MRM_F9: MRM = 0xF9; break; - case X86II::MRM_FA: MRM = 0xFA; break; - case X86II::MRM_FB: MRM = 0xFB; break; - case X86II::MRM_FC: MRM = 0xFC; break; - case X86II::MRM_FD: MRM = 0xFD; break; - case X86II::MRM_FE: MRM = 0xFE; break; - case X86II::MRM_FF: MRM = 0xFF; break; - } - EmitByte(MRM, CurByte, OS); + uint64_t Form = TSFlags & X86II::FormMask; + EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS); break; } @@ -1529,7 +1475,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, RegNum |= Val; } } - EmitImmediate(MCOperand::CreateImm(RegNum), MI.getLoc(), 1, FK_Data_1, + EmitImmediate(MCOperand::createImm(RegNum), MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); } else { EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 5a9181d..8e3c721 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -55,148 +55,6 @@ std::string X86_MC::ParseX86Triple(StringRef TT) { return FS; } -/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the -/// specified arguments. If we can't run cpuid on the host, return true. -bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - #if defined(__GNUC__) - // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually. 
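The replacement for the deleted 50-case switch relies on the MRM_C0..MRM_FF enumerators, renumbered earlier in this diff to the contiguous range 32..95 in byte order, so the fixed ModRM byte is just a constant offset from MRM_C0. A standalone check of that arithmetic (enumerator values copied from the new X86BaseInfo.h; this is not the encoder itself):

#include <cassert>
#include <cstdint>
#include <iostream>

// A few of the renumbered form values from X86BaseInfo.h.
enum { MRM_C0 = 32, MRM_C9 = 41, MRM_F8 = 88, MRM_FF = 95 };

static uint8_t fixedModRMByte(unsigned Form) {
  assert(Form >= MRM_C0 && Form <= MRM_FF && "not an MRM_XX form");
  return uint8_t(0xC0 + Form - MRM_C0); // same expression as encodeInstruction
}

int main() {
  std::cout << std::hex << unsigned(fixedModRMByte(MRM_C9)) << '\n'; // c9
  std::cout << std::hex << unsigned(fixedModRMByte(MRM_F8)) << '\n'; // f8
}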
- asm ("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value)); - return false; - #elif defined(_MSC_VER) - int registers[4]; - __cpuid(registers, value); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; - #else - return true; - #endif -#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) - #if defined(__GNUC__) - asm ("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value)); - return false; - #elif defined(_MSC_VER) - __asm { - mov eax,value - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; - #else - return true; - #endif -#else - return true; -#endif -} - -/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the -/// 4 values in the specified arguments. If we can't run cpuid on the host, -/// return true. -bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { -#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) - #if defined(__GNUC__) - // gcc desn't know cpuid would clobber ebx/rbx. Preseve it manually. - asm ("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value), - "c" (subleaf)); - return false; - #elif defined(_MSC_VER) - // __cpuidex was added in MSVC++ 9.0 SP1 - #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729) - int registers[4]; - __cpuidex(registers, value, subleaf); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; - #else - return true; - #endif - #else - return true; - #endif -#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) - #if defined(__GNUC__) - asm ("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a" (*rEAX), - "=S" (*rEBX), - "=c" (*rECX), - "=d" (*rEDX) - : "a" (value), - "c" (subleaf)); - return false; - #elif defined(_MSC_VER) - __asm { - mov eax,value - mov ecx,subleaf - cpuid - mov esi,rEAX - mov dword ptr [esi],eax - mov esi,rEBX - mov dword ptr [esi],ebx - mov esi,rECX - mov dword ptr [esi],ecx - mov esi,rEDX - mov dword ptr [esi],edx - } - return false; - #else - return true; - #endif -#else - return true; -#endif -} - -void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, - unsigned &Model) { - Family = (EAX >> 8) & 0xf; // Bits 8 - 11 - Model = (EAX >> 4) & 0xf; // Bits 4 - 7 - if (Family == 6 || Family == 0xf) { - if (Family == 0xf) - // Examine extended family ID if family ID is F. - Family += (EAX >> 20) & 0xff; // Bits 20 - 27 - // Examine extended model ID if family ID is 6 or F. 
- Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 - } -} - unsigned X86_MC::getDwarfRegFlavour(Triple TT, bool isEH) { if (TT.getArch() == Triple::x86_64) return DWARFFlavour::X86_64; @@ -222,7 +80,7 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU, std::string ArchFS = X86_MC::ParseX86Triple(TT); if (!FS.empty()) { if (!ArchFS.empty()) - ArchFS = ArchFS + "," + FS.str(); + ArchFS = (Twine(ArchFS) + "," + FS).str(); else ArchFS = FS; } @@ -345,36 +203,17 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, // 64-bit JIT places everything in the same buffer except external funcs. CM = is64Bit ? CodeModel::Large : CodeModel::Small; - X->InitMCCodeGenInfo(RM, CM, OL); + X->initMCCodeGenInfo(RM, CM, OL); return X; } -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, bool RelaxAll) { - Triple TheTriple(TT); - - switch (TheTriple.getObjectFormat()) { - default: llvm_unreachable("unsupported object format"); - case Triple::MachO: - return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - case Triple::COFF: - assert(TheTriple.isOSWindows() && "only Windows COFF is supported"); - return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll); - case Triple::ELF: - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); - } -} - -static MCInstPrinter *createX86MCInstPrinter(const Target &T, +static MCInstPrinter *createX86MCInstPrinter(const Triple &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { + const MCRegisterInfo &MRI) { if (SyntaxVariant == 0) - return new X86ATTInstPrinter(MAI, MII, MRI, STI); + return new X86ATTInstPrinter(MAI, MII, MRI); if (SyntaxVariant == 1) return new X86IntelInstPrinter(MAI, MII, MRI); return nullptr; @@ -397,61 +236,42 @@ static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { // Force static initialization. extern "C" void LLVMInitializeX86TargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo); - RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo); - - // Register the MC codegen info. - RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo); - RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo); - - // Register the MC register info. - TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target, - X86_MC::createX86MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target, - X86_MC::createX86MCSubtargetInfo); - - // Register the MC instruction analyzer. - TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target, - createX86MCInstrAnalysis); - TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target, - createX86MCInstrAnalysis); - - // Register the code emitter. 
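Illustrative sketch, not part of the patch: among the helpers deleted above is X86_MC::DetectFamilyModel, which folds the extended family/model fields of CPUID leaf 1 EAX into the base fields. For reference, here is a self-contained version of that same decoding; the function name and the test EAX value are made up for the sketch.

#include <cstdint>
#include <cstdio>

// Mirrors the removed DetectFamilyModel: the extended family (bits 20-27)
// only applies when the base family is 0xF, and the extended model
// (bits 16-19) only applies when the base family is 6 or 0xF.
static void decodeFamilyModel(uint32_t EAX, unsigned &Family, unsigned &Model) {
  Family = (EAX >> 8) & 0xf;
  Model = (EAX >> 4) & 0xf;
  if (Family == 6 || Family == 0xf) {
    if (Family == 0xf)
      Family += (EAX >> 20) & 0xff;
    Model += ((EAX >> 16) & 0xf) << 4;
  }
}

int main() {
  unsigned Family, Model;
  decodeFamilyModel(0x000306C3, Family, Model); // hypothetical leaf-1 EAX
  std::printf("family %u, model 0x%x\n", Family, Model); // family 6, model 0x3c
  return 0;
}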
- TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target, - createX86MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target, - createX86MCCodeEmitter); + for (Target *T : {&TheX86_32Target, &TheX86_64Target}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo); + + // Register the MC codegen info. + RegisterMCCodeGenInfoFn Y(*T, createX86MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createX86MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, + X86_MC::createX86MCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createX86MCInstrAnalysis); + + // Register the code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter); + + // Register the object streamer. + TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createX86MCInstPrinter); + + // Register the MC relocation info. + TargetRegistry::RegisterMCRelocationInfo(*T, createX86MCRelocationInfo); + } // Register the asm backend. TargetRegistry::RegisterMCAsmBackend(TheX86_32Target, createX86_32AsmBackend); TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, createX86_64AsmBackend); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target, - createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target, - createMCStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheX86_32Target, - createX86MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheX86_64Target, - createX86MCInstPrinter); - - // Register the MC relocation info. - TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target, - createX86MCRelocationInfo); - TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target, - createX86MCRelocationInfo); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index d8320b9..dcdae1d 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -31,10 +31,11 @@ class Target; class Triple; class StringRef; class raw_ostream; +class raw_pwrite_stream; extern Target TheX86_32Target, TheX86_64Target; -/// DWARFFlavour - Flavour of dwarf regnumbers +/// Flavour of dwarf regnumbers /// namespace DWARFFlavour { enum { @@ -42,7 +43,7 @@ namespace DWARFFlavour { }; } -/// N86 namespace - Native X86 register numbers +/// Native X86 register numbers /// namespace N86 { enum { @@ -53,32 +54,18 @@ namespace N86 { namespace X86_MC { std::string ParseX86Triple(StringRef TT); - /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in - /// the specified arguments. If we can't run cpuid on the host, return true. - bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX); - /// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return - /// the 4 values in the specified arguments. If we can't run cpuid on the - /// host, return true. 
- bool GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX, - unsigned *rEBX, unsigned *rECX, unsigned *rEDX); - - void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); - unsigned getDwarfRegFlavour(Triple TT, bool isEH); void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); - /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance. - /// This is exposed so Asm parser, etc. do not need to go through - /// TargetRegistry. + /// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. + /// do not need to go through TargetRegistry. MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS); } MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MCContext &Ctx); MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, @@ -86,32 +73,30 @@ MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef TT, StringRef CPU); -/// createX86WinCOFFStreamer - Construct an X86 Windows COFF machine code -/// streamer which will generate PE/COFF format object files. +/// Construct an X86 Windows COFF machine code streamer which will generate +/// PE/COFF format object files. /// /// Takes ownership of \p AB and \p CE. MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, - MCCodeEmitter *CE, raw_ostream &OS, + raw_pwrite_stream &OS, MCCodeEmitter *CE, bool RelaxAll); -/// createX86MachObjectWriter - Construct an X86 Mach-O object writer. -MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, - bool Is64Bit, +/// Construct an X86 Mach-O object writer. +MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype); -/// createX86ELFObjectWriter - Construct an X86 ELF object writer. -MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS, - bool IsELF64, - uint8_t OSABI, - uint16_t EMachine); -/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer. -MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit); +/// Construct an X86 ELF object writer. +MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64, + uint8_t OSABI, uint16_t EMachine); +/// Construct an X86 Win COFF object writer. +MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit); -/// createX86_64MachORelocationInfo - Construct X86-64 Mach-O relocation info. +/// Construct X86-64 Mach-O relocation info. MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); -/// createX86_64ELFORelocationInfo - Construct X86-64 ELF relocation info. +/// Construct X86-64 ELF relocation info. MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp index 3b81d53..6cf5af7 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -36,9 +36,9 @@ public: any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl()); bool isPCRel = Obj->getAnyRelocationPCRel(RE); - MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); + MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. 
- if (Sym->isVariable() == false) + if (!Sym->isVariable()) Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx)); const MCExpr *Expr = nullptr; @@ -92,8 +92,8 @@ public: StringRef RSymName; RSymI->getName(RSymName); - MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName); - if (RSym->isVariable() == false) + MCSymbol *RSym = Ctx.getOrCreateSymbol(RSymName); + if (!RSym->isVariable()) RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx)); const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 67b0c89..9da3e1f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -10,6 +10,7 @@ #include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86FixupKinds.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -47,23 +48,21 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter { const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue); - void RecordX86_64Relocation(MachObjectWriter *Writer, - const MCAssembler &Asm, + void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, - const MCFixup &Fixup, - MCValue Target, - uint64_t &FixedValue); + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue); + public: X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype, /*UseAggressiveSymbolFolding=*/Is64Bit) {} - void RecordRelocation(MachObjectWriter *Writer, - const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, - MCValue Target, uint64_t &FixedValue) override { + void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override { if (Writer->is64Bit()) RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, FixedValue); @@ -97,13 +96,10 @@ static unsigned getFixupKindLog2Size(unsigned Kind) { } } -void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, - const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFragment *Fragment, - const MCFixup &Fixup, - MCValue Target, - uint64_t &FixedValue) { +void X86MachObjectWriter::RecordX86_64Relocation( + MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind()); unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); @@ -117,6 +113,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, unsigned Index = 0; unsigned IsExtern = 0; unsigned Type = 0; + const MCSymbol *RelSymbol = nullptr; Value = Target.getConstant(); @@ -132,7 +129,6 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, if (Target.isAbsolute()) { // constant // SymbolNum of 0 indicates the absolute section. Type = MachO::X86_64_RELOC_UNSIGNED; - Index = 0; // FIXME: I believe this is broken, I don't think the linker can understand // it. 
I think it would require a local relocation, but I'm not sure if that @@ -145,15 +141,15 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, } else if (Target.getSymB()) { // A - B + constant const MCSymbol *A = &Target.getSymA()->getSymbol(); if (A->isTemporary()) - A = &A->AliasedSymbol(); + A = &Writer->findAliasedSymbol(*A); const MCSymbolData &A_SD = Asm.getSymbolData(*A); - const MCSymbolData *A_Base = Asm.getAtom(&A_SD); + const MCSymbol *A_Base = Asm.getAtom(*A); const MCSymbol *B = &Target.getSymB()->getSymbol(); if (B->isTemporary()) - B = &B->AliasedSymbol(); + B = &Writer->findAliasedSymbol(*B); const MCSymbolData &B_SD = Asm.getSymbolData(*B); - const MCSymbolData *B_Base = Asm.getAtom(&B_SD); + const MCSymbol *B_Base = Asm.getAtom(*B); // Neither symbol can be modified. if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || @@ -183,73 +179,64 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, // non-relocatable expression. if (A->isUndefined() || B->isUndefined()) { StringRef Name = A->isUndefined() ? A->getName() : B->getName(); - Asm.getContext().FatalError(Fixup.getLoc(), + Asm.getContext().reportFatalError(Fixup.getLoc(), "unsupported relocation with subtraction expression, symbol '" + Name + "' can not be undefined in a subtraction expression"); } - Value += Writer->getSymbolAddress(&A_SD, Layout) - - (!A_Base ? 0 : Writer->getSymbolAddress(A_Base, Layout)); - Value -= Writer->getSymbolAddress(&B_SD, Layout) - - (!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout)); + Value += Writer->getSymbolAddress(*A, Layout) - + (!A_Base ? 0 : Writer->getSymbolAddress(*A_Base, Layout)); + Value -= Writer->getSymbolAddress(*B, Layout) - + (!B_Base ? 0 : Writer->getSymbolAddress(*B_Base, Layout)); - if (A_Base) { - Index = A_Base->getIndex(); - IsExtern = 1; - } else { + if (!A_Base) Index = A_SD.getFragment()->getParent()->getOrdinal() + 1; - IsExtern = 0; - } Type = MachO::X86_64_RELOC_UNSIGNED; MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - - if (B_Base) { - Index = B_Base->getIndex(); - IsExtern = 1; - } else { + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); + + if (B_Base) + RelSymbol = B_Base; + else Index = B_SD.getFragment()->getParent()->getOrdinal() + 1; - IsExtern = 0; - } Type = MachO::X86_64_RELOC_SUBTRACTOR; } else { const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + if (Symbol->isTemporary() && Value) { + const MCSection &Sec = Symbol->getSection(); + if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) + Asm.addLocalUsedInReloc(*Symbol); + } const MCSymbolData &SD = Asm.getSymbolData(*Symbol); - const MCSymbolData *Base = Asm.getAtom(&SD); + RelSymbol = Asm.getAtom(*Symbol); // Relocations inside debug sections always use local relocations when // possible. This seems to be done because the debugger doesn't fully // understand x86_64 relocation entries, and expects to find values that // have already been fixed up. 
if (Symbol->isInSection()) { - const MCSectionMachO &Section = static_cast<const MCSectionMachO&>( - Fragment->getParent()->getSection()); + const MCSectionMachO &Section = + static_cast<const MCSectionMachO &>(*Fragment->getParent()); if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) - Base = nullptr; + RelSymbol = nullptr; } // x86_64 almost always uses external relocations, except when there is no // symbol to use as a base address (a local symbol with no preceding // non-local symbol). - if (Base) { - Index = Base->getIndex(); - IsExtern = 1; - + if (RelSymbol) { // Add the local offset, if needed. - if (Base != &SD) - Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); + if (RelSymbol != Symbol) + Value += Layout.getSymbolOffset(*Symbol) - + Layout.getSymbolOffset(*RelSymbol); } else if (Symbol->isInSection() && !Symbol->isVariable()) { // The index is the section ordinal (1-based). Index = SD.getFragment()->getParent()->getOrdinal() + 1; - IsExtern = 0; - Value += Writer->getSymbolAddress(&SD, Layout); + Value += Writer->getSymbolAddress(*Symbol, Layout); if (IsPCRel) Value -= FixupAddress + (1 << Log2Size); @@ -347,12 +334,9 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, // struct relocation_info (8 bytes) MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, @@ -377,8 +361,9 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, "' can not be undefined in a subtraction expression", false); - uint32_t Value = Writer->getSymbolAddress(A_SD, Layout); - uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent()); + uint32_t Value = Writer->getSymbolAddress(*A, Layout); + uint64_t SecAddr = + Writer->getSectionAddress(A_SD->getFragment()->getParent()); FixedValue += SecAddr; uint32_t Value2 = 0; @@ -397,7 +382,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, // pedantic compatibility with 'as'. Type = A_SD->isExternal() ? 
(unsigned)MachO::GENERIC_RELOC_SECTDIFF : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF; - Value2 = Writer->getSymbolAddress(B_SD, Layout); + Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent()); } @@ -409,7 +394,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, if (FixupOffset > 0xffffff) { char Buffer[32]; format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); - Asm.getContext().FatalError(Fixup.getLoc(), + Asm.getContext().reportFatalError(Fixup.getLoc(), Twine("Section too large, can't encode " "r_address (") + Buffer + ") into 24 bits of scattered " @@ -424,7 +409,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, (IsPCRel << 30) | MachO::R_SCATTERED); MRE.r_word1 = Value2; - Writer->addRelocation(Fragment->getParent(), MRE); + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); } else { // If the offset is more than 24-bits, it won't fit in a scattered // relocation offset field, so we fall back to using a non-scattered @@ -446,7 +431,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, (IsPCRel << 30) | MachO::R_SCATTERED); MRE.r_word1 = Value; - Writer->addRelocation(Fragment->getParent(), MRE); + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); return true; } @@ -465,10 +450,6 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); unsigned IsPCRel = 0; - // Get the symbol data. - const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); - unsigned Index = SD_A->getIndex(); - // We're only going to have a second symbol in pic mode and it'll be a // subtraction from the picbase. For 32-bit pic the addend is the difference // between the picbase and the next address. For 32-bit static the addend is @@ -477,11 +458,11 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, // If this is a subtraction then we're pcrel. uint32_t FixupAddress = Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); - const MCSymbolData *SD_B = - &Asm.getSymbolData(Target.getSymB()->getSymbol()); IsPCRel = 1; - FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) + - Target.getConstant()); + FixedValue = + FixupAddress - + Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) + + Target.getConstant(); FixedValue += 1ULL << Log2Size; } else { FixedValue = 0; @@ -490,12 +471,10 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, // struct relocation_info (8 bytes) MachO::any_relocation_info MRE; MRE.r_word0 = Value; - MRE.r_word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (1 << 27) | // r_extern - (MachO::GENERIC_RELOC_TLV << 28)); // r_type - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = + (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28); + Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(), + MRE); } void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, @@ -526,9 +505,9 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, } // Get the symbol data, if any. 
- const MCSymbolData *SD = nullptr; + const MCSymbol *A = nullptr; if (Target.getSymA()) - SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + A = &Target.getSymA()->getSymbol(); // If this is an internal relocation with an offset, it also needs a scattered // relocation entry. @@ -538,16 +517,16 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // Try to record the scattered relocation if needed. Fall back to non // scattered if necessary (see comments in RecordScatteredRelocation() // for details). - if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD) && - RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, - Target, Log2Size, FixedValue)) + if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) && + RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + Log2Size, FixedValue)) return; // See <reloc.h>. uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); unsigned Index = 0; - unsigned IsExtern = 0; unsigned Type = 0; + const MCSymbol *RelSymbol = nullptr; if (Target.isAbsolute()) { // constant // SymbolNum of 0 indicates the absolute section. @@ -557,30 +536,28 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, Type = MachO::GENERIC_RELOC_VANILLA; } else { // Resolve constant variables. - if (SD->getSymbol().isVariable()) { + if (A->isVariable()) { int64_t Res; - if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, Writer->getSectionAddressMap())) { + if (A->getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { FixedValue = Res; return; } } // Check whether we need an external or internal relocation. - if (Writer->doesSymbolRequireExternRelocation(SD)) { - IsExtern = 1; - Index = SD->getIndex(); + if (Writer->doesSymbolRequireExternRelocation(*A)) { + RelSymbol = A; // For external relocations, make sure to offset the fixup value to // compensate for the addend of the symbol address, if it was // undefined. This occurs with weak definitions, for example. - if (!SD->getSymbol().isUndefined()) - FixedValue -= Layout.getSymbolOffset(SD); + if (!A->isUndefined()) + FixedValue -= Layout.getSymbolOffset(*A); } else { // The index is the section ordinal (1-based). 
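Illustrative sketch, not part of the patch: the Mach-O writer hunks in this file keep assembling relocation_info's second word by hand, now passing the relocated symbol separately instead of setting an IsExtern bit in most paths. The field layout implied by the shift amounts (symbolnum:24, pcrel:1, length:2, extern:1, type:4) can be sanity-checked with a small standalone helper; the name packRWord1 is invented for the sketch.

#include <cassert>
#include <cstdint>

// Packs the Mach-O relocation_info second word the same way the hunks above
// do; the field widths are inferred from the shifts (24, 25, 27, 28).
static uint32_t packRWord1(unsigned SymbolNum, bool IsPCRel, unsigned Log2Size,
                           bool IsExtern, unsigned Type) {
  return (SymbolNum << 0) | ((unsigned)IsPCRel << 24) | (Log2Size << 25) |
         ((unsigned)IsExtern << 27) | (Type << 28);
}

int main() {
  uint32_t W = packRWord1(/*SymbolNum=*/3, /*IsPCRel=*/true, /*Log2Size=*/2,
                          /*IsExtern=*/false, /*Type=*/5);
  assert((W & 0xffffff) == 3);   // r_symbolnum occupies the low 24 bits
  assert(((W >> 24) & 1) == 1);  // r_pcrel
  assert(((W >> 25) & 3) == 2);  // r_length
  assert(((W >> 27) & 1) == 0);  // r_extern
  assert((W >> 28) == 5);        // r_type
  return 0;
}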
- const MCSectionData &SymSD = Asm.getSectionData( - SD->getSymbol().getSection()); - Index = SymSD.getOrdinal() + 1; - FixedValue += Writer->getSectionAddress(&SymSD); + const MCSection &Sec = A->getSection(); + Index = Sec.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&Sec); } if (IsPCRel) FixedValue -= Writer->getSectionAddress(Fragment->getParent()); @@ -591,17 +568,13 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // struct relocation_info (8 bytes) MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | - (IsPCRel << 24) | - (Log2Size << 25) | - (IsExtern << 27) | - (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } -MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS, - bool Is64Bit, - uint32_t CPUType, +MCObjectWriter *llvm::createX86MachObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { return createMachObjectWriter(new X86MachObjectWriter(Is64Bit, CPUType, diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 40af822..bd1bc99 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -25,10 +25,11 @@ namespace { class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { public: X86WinCOFFObjectWriter(bool Is64Bit); - virtual ~X86WinCOFFObjectWriter(); + ~X86WinCOFFObjectWriter() override; unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection) const override; + bool IsCrossSection, + const MCAsmBackend &MAB) const override; }; } @@ -40,7 +41,8 @@ X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection) const { + bool IsCrossSection, + const MCAsmBackend &MAB) const { unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? 
@@ -88,7 +90,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, llvm_unreachable("Unsupported COFF machine type."); } -MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS, +MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit) { MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit); return createWinCOFFObjectWriter(MOTW, OS); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 5f1596c..92f42b6 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -18,8 +18,8 @@ class X86WinCOFFStreamer : public MCWinCOFFStreamer { Win64EH::UnwindEmitter EHStreamer; public: X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE, - raw_ostream &OS) - : MCWinCOFFStreamer(C, AB, *CE, OS) { } + raw_pwrite_stream &OS) + : MCWinCOFFStreamer(C, AB, *CE, OS) {} void EmitWinEHHandlerData() override; void EmitWindowsUnwindTables() override; @@ -48,13 +48,11 @@ void X86WinCOFFStreamer::FinishImpl() { } } -namespace llvm { -MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, - MCCodeEmitter *CE, raw_ostream &OS, - bool RelaxAll) { +MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + raw_pwrite_stream &OS, + MCCodeEmitter *CE, bool RelaxAll) { X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); S->getAssembler().setRelaxAll(RelaxAll); return S; } -} diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index e4c58d4..ef3318b 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -35,7 +35,7 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned CountS = (Imm >> 6) & 3; // CountS selects which input element to use. - unsigned InVal = 4+CountS; + unsigned InVal = 4 + CountS; // CountD specifies which element of destination to update. ShuffleMask[CountD] = InVal; // ZMask zaps values, potentially overriding the CountD elt. 
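Illustrative sketch, not part of the patch: the hunk that just ended only reflows whitespace in DecodeINSERTPSMask, but the immediate it decodes is worth spelling out: bits 7:6 select the source element (CountS, visible above), while bits 5:4 pick the destination slot and bits 3:0 zero individual destination lanes (positions per the SSE4.1 INSERTPS immediate encoding). A standalone sketch of that decoding, with -2 standing in for the zero sentinel and the function name made up:

#include <cstdio>

// Decode an INSERTPS immediate into a 4-lane shuffle mask over
// {dest 0..3, src 4..7}; -2 marks a lane forced to zero.
static void decodeInsertPS(unsigned Imm, int Mask[4]) {
  unsigned ZMask  = Imm & 0xf;
  unsigned CountD = (Imm >> 4) & 3;
  unsigned CountS = (Imm >> 6) & 3;
  for (int i = 0; i != 4; ++i)
    Mask[i] = i;              // start with the destination in place
  Mask[CountD] = 4 + CountS;  // insert the selected source element
  for (int i = 0; i != 4; ++i)
    if (ZMask & (1u << i))
      Mask[i] = -2;           // ZMask zaps the lane, even the inserted one
}

int main() {
  int Mask[4];
  decodeInsertPS(0x1D, Mask);  // CountS=0, CountD=1, ZMask=0b1101
  for (int M : Mask)
    std::printf("%d ", M);     // prints: -2 4 -2 -2
  std::printf("\n");
  return 0;
}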
@@ -47,20 +47,20 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { // <3,1> or <6,7,2,3> void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { - for (unsigned i = NElts/2; i != NElts; ++i) - ShuffleMask.push_back(NElts+i); + for (unsigned i = NElts / 2; i != NElts; ++i) + ShuffleMask.push_back(NElts + i); - for (unsigned i = NElts/2; i != NElts; ++i) + for (unsigned i = NElts / 2; i != NElts; ++i) ShuffleMask.push_back(i); } // <0,2> or <0,1,4,5> void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) + for (unsigned i = 0; i != NElts / 2; ++i) ShuffleMask.push_back(i); - for (unsigned i = 0; i != NElts/2; ++i) - ShuffleMask.push_back(NElts+i); + for (unsigned i = 0; i != NElts / 2; ++i) + ShuffleMask.push_back(NElts + i); } void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { @@ -79,6 +79,20 @@ void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { } } +void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned ScalarSizeInBits = VT.getScalarSizeInBits(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + unsigned NumLaneSubElts = 64 / ScalarSizeInBits; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts) + for (unsigned s = 0; s != NumLaneSubElts; s++) + ShuffleMask.push_back(l + s); +} + void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned VectorSizeInBits = VT.getSizeInBits(); unsigned NumElts = VectorSizeInBits / 8; @@ -189,8 +203,8 @@ void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NewImm = Imm; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { // each half of a lane comes from different source - for (unsigned s = 0; s != NumElts*2; s += NumElts) { - for (unsigned i = 0; i != NumLaneElts/2; ++i) { + for (unsigned s = 0; s != NumElts * 2; s += NumElts) { + for (unsigned i = 0; i != NumLaneElts / 2; ++i) { ShuffleMask.push_back(NewImm % NumLaneElts + s + l); NewImm /= NumLaneElts; } @@ -212,9 +226,9 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumLaneElts = NumElts / NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) { - ShuffleMask.push_back(i); // Reads from dest/src1 - ShuffleMask.push_back(i+NumElts); // Reads from src/src2 + for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i + NumElts); // Reads from src/src2 } } } @@ -232,9 +246,9 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumLaneElts = NumElts / NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) { - ShuffleMask.push_back(i); // Reads from dest/src1 - ShuffleMask.push_back(i+NumElts); // Reads from src/src2 + for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i + NumElts); // Reads from src/src2 } } } @@ -244,11 +258,11 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, if (Imm & 0x88) return; // Not a shuffle - unsigned HalfSize = VT.getVectorNumElements()/2; + unsigned HalfSize 
= VT.getVectorNumElements() / 2; for (unsigned l = 0; l != 2; ++l) { - unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize; - for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i) + unsigned HalfBegin = ((Imm >> (l * 4)) & 0x3) * HalfSize; + for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i) ShuffleMask.push_back(i); } } @@ -341,7 +355,7 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back((Imm >> (2*i)) & 3); + ShuffleMask.push_back((Imm >> (2 * i)) & 3); } } @@ -385,4 +399,36 @@ void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { } } +void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) { + unsigned NumDstElts = DstVT.getVectorNumElements(); + unsigned SrcScalarBits = SrcVT.getScalarSizeInBits(); + unsigned DstScalarBits = DstVT.getScalarSizeInBits(); + unsigned Scale = DstScalarBits / SrcScalarBits; + assert(SrcScalarBits < DstScalarBits && + "Expected zero extension mask to increase scalar size"); + assert(SrcVT.getVectorNumElements() >= NumDstElts && + "Too many zero extension lanes"); + + for (unsigned i = 0; i != NumDstElts; i++) { + Mask.push_back(i); + for (unsigned j = 1; j != Scale; j++) + Mask.push_back(SM_SentinelZero); + } +} + +void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + ShuffleMask.push_back(0); + for (unsigned i = 1; i < NumElts; i++) + ShuffleMask.push_back(SM_SentinelZero); +} + +void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { + // First element comes from the first element of second source. + // Remaining elements: Load zero extends / Move copies from first source. + unsigned NumElts = VT.getVectorNumElements(); + Mask.push_back(NumElts); + for (unsigned i = 1; i < NumElts; i++) + Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); +} } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 6ba3c64..14b6943 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -40,6 +40,8 @@ void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); +void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); @@ -88,6 +90,16 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a VPERMILP variable mask from an IR-level vector constant. void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a zero extension instruction as a shuffle mask. +void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a move lower and zero upper instruction as a shuffle mask. +void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a scalar float move instruction as a shuffle mask. 
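Illustrative sketch, not part of the patch: the DecodeZeroExtendMask helper added in this file expresses a PMOVZX-style zero extension as a shuffle mask, one kept source lane followed by Scale-1 zero lanes. A standalone version of that shape, using plain integers instead of MVT and -2 instead of SM_SentinelZero; the function name is invented for the sketch.

#include <cstdio>
#include <vector>

// Build the mask a zero extension corresponds to: each destination element
// keeps one source lane and is padded with Scale-1 zero lanes.
static std::vector<int> zeroExtendMask(unsigned SrcBits, unsigned DstBits,
                                       unsigned NumDstElts) {
  unsigned Scale = DstBits / SrcBits;
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumDstElts; ++i) {
    Mask.push_back((int)i);
    for (unsigned j = 1; j != Scale; ++j)
      Mask.push_back(-2);     // zero sentinel
  }
  return Mask;
}

int main() {
  // v8i16 -> v4i32 (PMOVZXWD-style): prints 0 -2 1 -2 2 -2 3 -2
  for (int M : zeroExtendMask(16, 32, 4))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}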
+void DecodeScalarMoveMask(MVT VT, bool IsLoad, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 219b64d..8403ae6 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -55,9 +55,6 @@ FunctionPass *createX86IssueVZeroUpperPass(); /// FunctionPass *createEmitX86CodeToMemory(); -/// \brief Creates an X86-specific Target Transformation Info pass. -ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM); - /// createX86PadShortFunctions - Return a pass that pads short functions /// with NOOPs. This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); @@ -72,6 +69,17 @@ FunctionPass *createX86FixupLEAs(); /// esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); +/// createX86WinEHStatePass - Return an IR pass that inserts EH registration +/// stack objects and explicit EH state updates. This pass must run after EH +/// preparation, which does Windows-specific but architecture-neutral +/// preparation. +FunctionPass *createX86WinEHStatePass(); + +/// Return a Machine IR pass that expands X86-specific pseudo +/// instructions into a sequence of actual instructions. This pass +/// must run after prologue/epilogue insertion and before lowering +/// the MachineInstr to MC. +FunctionPass *createX86ExpandPseudoPass(); } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index 30b3b28..c70e2e9 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -164,14 +164,10 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", "Enable SHA instructions", [FeatureSSE2]>; -def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true", - "Support SGX instructions">; def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; -def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true", - "Support SMAP instructions">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", @@ -196,6 +192,9 @@ def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true", "Use RSQRT* to optimize square root calculations">; def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst", "true", "Use RCP* to optimize division calculations">; +def FeatureSoftFloat + : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software floating point features.">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -277,17 +276,15 @@ def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. 
// "Arrandale" along with corei3 and corei5 -class NehalemProc<string Name, list<SubtargetFeature> AdditionalFeatures> - : ProcessorModel<Name, SandyBridgeModel, !listconcat([ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT - ], - AdditionalFeatures)>; -def : NehalemProc<"nehalem", []>; -def : NehalemProc<"corei7", [FeatureAES]>; +class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureSSE42, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureFastUAMem, + FeaturePOPCNT + ]>; +def : NehalemProc<"nehalem">; +def : NehalemProc<"corei7">; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge @@ -372,7 +369,6 @@ class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ FeatureHLE, FeatureADX, FeatureRDSEED, - FeatureSMAP, FeatureSlowIncDec ]>; def : BroadwellProc<"broadwell">; @@ -395,7 +391,7 @@ class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - FeatureSlowIncDec, FeatureSGX]>; + FeatureSlowIncDec]>; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index d06e94f..f97557e 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -47,23 +47,22 @@ using namespace llvm; /// runOnMachineFunction - Emit the function body. /// bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &MF.getSubtarget<X86Subtarget>(); + SMShadowTracker.startFunction(MF); SetupMachineFunction(MF); if (Subtarget->isTargetCOFF()) { bool Intrn = MF.getFunction()->hasInternalLinkage(); - OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC - : COFF::IMAGE_SYM_CLASS_EXTERNAL); - OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer->EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC + : COFF::IMAGE_SYM_CLASS_EXTERNAL); + OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT); - OutStreamer.EndCOFFSymbolDef(); + OutStreamer->EndCOFFSymbolDef(); } - // Have common code print out the function header with linkage info etc. - EmitFunctionHeader(); - // Emit the rest of the function body. EmitFunctionBody(); @@ -98,7 +97,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, // Handle dllimport linkage. 
if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) GVSym = - P.OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName()); + P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName()); if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) { @@ -505,26 +504,27 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { - if (Subtarget->isTargetMachO()) - OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + Triple TT(TM.getTargetTriple()); - if (Subtarget->isTargetCOFF()) { + if (TT.isOSBinFormatMachO()) + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + + if (TT.isOSBinFormatCOFF()) { // Emit an absolute @feat.00 symbol. This appears to be some kind of // compiler features bitfield read by link.exe. - if (!Subtarget->is64Bit()) { - MCSymbol *S = MMI->getContext().GetOrCreateSymbol(StringRef("@feat.00")); - OutStreamer.BeginCOFFSymbolDef(S); - OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); - OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); - OutStreamer.EndCOFFSymbolDef(); + if (TT.getArch() == Triple::x86) { + MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00")); + OutStreamer->BeginCOFFSymbolDef(S); + OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); + OutStreamer->EndCOFFSymbolDef(); // According to the PE-COFF spec, the LSB of this value marks the object // for "registered SEH". This means that all SEH handler entry points // must be registered in .sxdata. Use of any unregistered handlers will // cause the process to terminate immediately. LLVM does not know how to // register any SEH handlers, so its object files should be safe. 
- S->setAbsolute(); - OutStreamer.EmitSymbolAttribute(S, MCSA_Global); - OutStreamer.EmitAssignment( + OutStreamer->EmitSymbolAttribute(S, MCSA_Global); + OutStreamer->EmitAssignment( S, MCConstantExpr::Create(int64_t(1), MMI->getContext())); } } @@ -558,14 +558,13 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { const MachineConstantPoolEntry &CPE = MF->getConstantPool()->getConstants()[CPID]; if (!CPE.isMachineConstantPoolEntry()) { - SectionKind Kind = - CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout()); + SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); const Constant *C = CPE.Val.ConstVal; if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( getObjFileLowering().getSectionForConstant(Kind, C))) { if (MCSymbol *Sym = S->getCOMDATSymbol()) { if (Sym->isUndefined()) - OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); + OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global); return Sym; } } @@ -579,31 +578,34 @@ void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { SmallString<128> Directive; raw_svector_ostream OS(Directive); StringRef Name = Sym->getName(); + Triple TT(TM.getTargetTriple()); - if (Subtarget->isTargetKnownWindowsMSVC()) + if (TT.isKnownWindowsMSVCEnvironment()) OS << " /EXPORT:"; else OS << " -export:"; - if ((Subtarget->isTargetWindowsGNU() || Subtarget->isTargetWindowsCygwin()) && + if ((TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) && (Name[0] == getDataLayout().getGlobalPrefix())) Name = Name.drop_front(); OS << Name; if (IsData) { - if (Subtarget->isTargetKnownWindowsMSVC()) + if (TT.isKnownWindowsMSVCEnvironment()) OS << ",DATA"; else OS << ",data"; } OS.flush(); - OutStreamer.EmitBytes(Directive); + OutStreamer->EmitBytes(Directive); } void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetMachO()) { + Triple TT(TM.getTargetTriple()); + + if (TT.isOSBinFormatMachO()) { // All darwin targets use mach-o. MachineModuleInfoMachO &MMIMacho = MMI->getObjFileInfo<MachineModuleInfoMachO>(); @@ -613,58 +615,55 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { Stubs = MMIMacho.GetFnStubList(); if (!Stubs.empty()) { - const MCSection *TheSection = - OutContext.getMachOSection("__IMPORT", "__jump_table", - MachO::S_SYMBOL_STUBS | - MachO::S_ATTR_SELF_MODIFYING_CODE | - MachO::S_ATTR_PURE_INSTRUCTIONS, - 5, SectionKind::getMetadata()); - OutStreamer.SwitchSection(TheSection); + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__jump_table", + MachO::S_SYMBOL_STUBS | MachO::S_ATTR_SELF_MODIFYING_CODE | + MachO::S_ATTR_PURE_INSTRUCTIONS, + 5, SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); for (const auto &Stub : Stubs) { // L_foo$stub: - OutStreamer.EmitLabel(Stub.first); + OutStreamer->EmitLabel(Stub.first); // .indirect_symbol _foo - OutStreamer.EmitSymbolAttribute(Stub.second.getPointer(), - MCSA_IndirectSymbol); + OutStreamer->EmitSymbolAttribute(Stub.second.getPointer(), + MCSA_IndirectSymbol); // hlt; hlt; hlt; hlt; hlt hlt = 0xf4. const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4"; - OutStreamer.EmitBytes(StringRef(HltInsts, 5)); + OutStreamer->EmitBytes(StringRef(HltInsts, 5)); } Stubs.clear(); - OutStreamer.AddBlankLine(); + OutStreamer->AddBlankLine(); } // Output stubs for external and common global variables. 
Stubs = MMIMacho.GetGVStubList(); if (!Stubs.empty()) { - const MCSection *TheSection = - OutContext.getMachOSection("__IMPORT", "__pointers", - MachO::S_NON_LAZY_SYMBOL_POINTERS, - SectionKind::getMetadata()); - OutStreamer.SwitchSection(TheSection); + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); for (auto &Stub : Stubs) - emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); + emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); Stubs.clear(); - OutStreamer.AddBlankLine(); + OutStreamer->AddBlankLine(); } Stubs = MMIMacho.GetHiddenGVStubList(); if (!Stubs.empty()) { - const MCSection *TheSection = - OutContext.getMachOSection("__IMPORT", "__pointers", - MachO::S_NON_LAZY_SYMBOL_POINTERS, - SectionKind::getMetadata()); - OutStreamer.SwitchSection(TheSection); + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); for (auto &Stub : Stubs) - emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second); + emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); Stubs.clear(); - OutStreamer.AddBlankLine(); + OutStreamer->AddBlankLine(); } SM.serializeToStackMapSection(); @@ -674,16 +673,17 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { // implementation of multiple entry points). If this doesn't occur, the // linker can safely perform dead code stripping. Since LLVM never // generates code that does this, it is always safe to set. - OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } - if (Subtarget->isTargetKnownWindowsMSVC() && MMI->usesVAFloatArgument()) { - StringRef SymbolName = Subtarget->is64Bit() ? "_fltused" : "__fltused"; - MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName); - OutStreamer.EmitSymbolAttribute(S, MCSA_Global); + if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) { + StringRef SymbolName = + (TT.getArch() == Triple::x86_64) ? "_fltused" : "__fltused"; + MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName); + OutStreamer->EmitSymbolAttribute(S, MCSA_Global); } - if (Subtarget->isTargetCOFF()) { + if (TT.isOSBinFormatCOFF()) { // Necessary for dllexport support std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals; @@ -710,7 +710,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { const TargetLoweringObjectFileCOFF &TLOFCOFF = static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); - OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection()); + OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection()); for (auto & Symbol : DLLExportedGlobals) GenerateExportDirective(Symbol, /*IsData=*/true); @@ -719,28 +719,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } } - if (Subtarget->isTargetELF()) { - const TargetLoweringObjectFileELF &TLOFELF = - static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering()); - - MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>(); - - // Output stubs for external and common global variables. 
- MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); - if (!Stubs.empty()) { - OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout(); - - for (const auto &Stub : Stubs) { - OutStreamer.EmitLabel(Stub.first); - OutStreamer.EmitSymbolValue(Stub.second.getPointer(), - TD->getPointerSize()); - } - Stubs.clear(); - } - + if (TT.isOSBinFormatELF()) SM.serializeToStackMapSection(); - } } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index 748b948..3beeb17 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -57,6 +57,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI); private: TargetMachine &TM; + const MachineFunction *MF; std::unique_ptr<MCCodeEmitter> CodeEmitter; bool InShadow; @@ -80,15 +81,15 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void InsertStackMapShadows(MachineFunction &MF); void LowerSTACKMAP(const MachineInstr &MI); - void LowerPATCHPOINT(const MachineInstr &MI); + void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); + void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI); public: - explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), SM(*this), SMShadowTracker(TM) { - Subtarget = &TM.getSubtarget<X86Subtarget>(); - } + explicit X86AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this), SMShadowTracker(TM) {} const char *getPassName() const override { return "X86 Assembly / Object Emitter"; @@ -103,7 +104,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void EmitInstruction(const MachineInstr *MI) override; void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override { - SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); } bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index fae489e..4412125 100644 --- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This file defines a pass that optimizes call sequences on x86. -// Currently, it converts movs of function parameters onto the stack into +// Currently, it converts movs of function parameters onto the stack into // pushes. This is beneficial for two main reasons: // 1) The push instruction encoding is much smaller than an esp-relative mov // 2) It is possible to push memory arguments directly. 
So, if the @@ -37,9 +37,10 @@ using namespace llvm; #define DEBUG_TYPE "x86-cf-opt" -cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt", - cl::desc("Avoid optimizing x86 call frames for size"), - cl::init(false), cl::Hidden); +static cl::opt<bool> + NoX86CFOpt("no-x86-call-frame-opt", + cl::desc("Avoid optimizing x86 call frames for size"), + cl::init(false), cl::Hidden); namespace { class X86CallFrameOptimization : public MachineFunctionPass { @@ -49,17 +50,47 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; private: - bool shouldPerformTransformation(MachineFunction &MF); + // Information we know about a particular call site + struct CallContext { + CallContext() + : Call(nullptr), SPCopy(nullptr), ExpectedDist(0), + MovVector(4, nullptr), NoStackParams(false), UsePush(false){}; - bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I); + // Actuall call instruction + MachineInstr *Call; + + // A copy of the stack pointer + MachineInstr *SPCopy; + + // The total displacement of all passed parameters + int64_t ExpectedDist; + + // The sequence of movs used to pass the parameters + SmallVector<MachineInstr *, 4> MovVector; + + // True if this call site has no stack parameters + bool NoStackParams; + + // True of this callsite can use push instructions + bool UsePush; + }; + + typedef DenseMap<MachineInstr *, CallContext> ContextMap; + + bool isLegal(MachineFunction &MF); + + bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap); + + void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, CallContext &Context); + + bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock::iterator I, + const CallContext &Context); MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, unsigned Reg); - const char *getPassName() const override { - return "X86 Optimize Call Frame"; - } + const char *getPassName() const override { return "X86 Optimize Call Frame"; } const TargetInstrInfo *TII; const TargetFrameLowering *TFL; @@ -74,8 +105,10 @@ FunctionPass *llvm::createX86CallFrameOptimization() { return new X86CallFrameOptimization(); } -// This checks whether the transformation is legal and profitable -bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) { +// This checks whether the transformation is legal. +// Also returns false in cases where it's potentially legal, but +// we don't even want to try. +bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { if (NoX86CFOpt.getValue()) return false; @@ -84,7 +117,7 @@ bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) // No point in running this in 64-bit mode, since some arguments are // passed in-register in all common calling conventions, so the pattern // we're looking for will never match. - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); if (STI.is64Bit()) return false; @@ -95,8 +128,8 @@ bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) // This is bad, and breaks SP adjustment. // So, check that all of the frames in the function are closed inside // the same block, and, for good measure, that there are no nested frames. 
- int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); - int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); for (MachineBasicBlock &BB : MF) { bool InsideFrameSequence = false; for (MachineInstr &MI : BB) { @@ -104,8 +137,7 @@ bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) if (InsideFrameSequence) return false; InsideFrameSequence = true; - } - else if (MI.getOpcode() == FrameDestroyOpcode) { + } else if (MI.getOpcode() == FrameDestroyOpcode) { if (!InsideFrameSequence) return false; InsideFrameSequence = false; @@ -116,99 +148,141 @@ bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) return false; } - // Now that we know the transformation is legal, check if it is - // profitable. - // TODO: Add a heuristic that actually looks at the function, - // and enable this for more cases. + return true; +} - // This transformation is always a win when we expected to have - // a reserved call frame. Under other circumstances, it may be either +// Check whether this transformation is profitable for a particular +// function - in terms of code size. +bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, + ContextMap &CallSeqMap) { + // This transformation is always a win when we do not expect to have + // a reserved call frame. Under other circumstances, it may be either // a win or a loss, and requires a heuristic. - // For now, enable it only for the relatively clear win cases. bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); if (CannotReserveFrame) return true; - // For now, don't even try to evaluate the profitability when - // not optimizing for size. - AttributeSet FnAttrs = MF.getFunction()->getAttributes(); + // Don't do this when not optimizing for size. bool OptForSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || + MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (!OptForSize) return false; - // Stack re-alignment can make this unprofitable even in terms of size. - // As mentioned above, a better heuristic is needed. For now, don't do this - // when the required alignment is above 8. (4 would be the safe choice, but - // some experimentation showed 8 is generally good). - if (TFL->getStackAlignment() > 8) - return false; - return true; + unsigned StackAlign = TFL->getStackAlignment(); + + int64_t Advantage = 0; + for (auto CC : CallSeqMap) { + // Call sites where no parameters are passed on the stack + // do not affect the cost, since there needs to be no + // stack adjustment. + if (CC.second.NoStackParams) + continue; + + if (!CC.second.UsePush) { + // If we don't use pushes for a particular call site, + // we pay for not having a reserved call frame with an + // additional sub/add esp pair. The cost is ~3 bytes per instruction, + // depending on the size of the constant. + // TODO: Callee-pop functions should have a smaller penalty, because + // an add is needed even with a reserved call frame. + Advantage -= 6; + } else { + // We can use pushes. First, account for the fixed costs. + // We'll need an add after the call.
+ Advantage -= 3; + // If we have to realign the stack, we'll also need a sub before + if (CC.second.ExpectedDist % StackAlign) + Advantage -= 3; + // Now, for each push, we save ~3 bytes. For small constants, we actually + // save more (up to 5 bytes), but 3 should be a good approximation. + Advantage += (CC.second.ExpectedDist / 4) * 3; + } + } + + return (Advantage >= 0); } + bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TFL = MF.getSubtarget().getFrameLowering(); MRI = &MF.getRegInfo(); - if (!shouldPerformTransformation(MF)) + if (!isLegal(MF)) return false; - int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); bool Changed = false; + ContextMap CallSeqMap; + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) - if (I->getOpcode() == FrameSetupOpcode) - Changed |= adjustCallSequence(MF, *BB, I); + if (I->getOpcode() == FrameSetupOpcode) { + CallContext &Context = CallSeqMap[I]; + collectCallInfo(MF, *BB, I, Context); + } + + if (!isProfitable(MF, CallSeqMap)) + return false; + + for (auto CC : CallSeqMap) + if (CC.second.UsePush) + Changed |= adjustCallSequence(MF, CC.first, CC.second); return Changed; } -bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - +void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + CallContext &Context) { // Check that this particular call sequence is amenable to the // transformation. const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); unsigned StackPtr = RegInfo.getStackRegister(); - int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); // We expect to enter this at the beginning of a call sequence assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); MachineBasicBlock::iterator FrameSetup = I++; + // How much do we adjust the stack? This puts an upper bound on + // the number of parameters actually passed on it. + unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; + // A zero adjustment means no stack parameters + if (!MaxAdjust) { + Context.NoStackParams = true; + return; + } + // For globals in PIC mode, we can have some LEAs here. // Ignore them, they don't bother us. // TODO: Extend this to something that covers more cases. while (I->getOpcode() == X86::LEA32r) ++I; - + // We expect a copy instruction here. // TODO: The copy instruction is a lowering artifact. // We should also support a copy-less version, where the stack // pointer is used directly. if (!I->isCopy() || !I->getOperand(0).isReg()) - return false; - MachineBasicBlock::iterator SPCopy = I++; - StackPtr = SPCopy->getOperand(0).getReg(); + return; + Context.SPCopy = I++; + StackPtr = Context.SPCopy->getOperand(0).getReg(); // Scan the call setup sequence for the pattern we're looking for. // We only handle a simple case - a sequence of MOV32mi or MOV32mr // instructions, that push a sequence of 32-bit values onto the stack, with // no gaps between them.
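The byte accounting in isProfitable above is easy to sanity-check in isolation. The following sketch is not part of the patch; it simply mirrors the constants from the comments in this hunk (-6 per non-push call site, -3 for the add after each push-based call, another -3 when the displacement is not a multiple of the stack alignment, and roughly +3 per 4-byte push) so the break-even point is visible.

#include <cstdint>
#include <cstdio>

// Minimal model of the isProfitable() byte accounting above; constants are
// taken from the comments in the hunk, everything else is hypothetical.
struct SiteModel {
  bool NoStackParams;
  bool UsePush;
  int64_t ExpectedDist; // bytes of stack-passed parameters
};

int64_t advantageInBytes(const SiteModel *Sites, int N, unsigned StackAlign) {
  int64_t Advantage = 0;
  for (int i = 0; i < N; ++i) {
    if (Sites[i].NoStackParams)
      continue;                        // no stack adjustment either way
    if (!Sites[i].UsePush) {
      Advantage -= 6;                  // extra sub/add esp pair, ~3 bytes each
    } else {
      Advantage -= 3;                  // add esp after the call
      if (Sites[i].ExpectedDist % StackAlign)
        Advantage -= 3;                // realignment sub before the pushes
      Advantage += (Sites[i].ExpectedDist / 4) * 3; // ~3 bytes saved per push
    }
  }
  return Advantage;
}

int main() {
  // One call site passing 16 bytes (4 pushes) with 16-byte stack alignment:
  // -3 + 4*3 = +9, so the rewrite is considered profitable.
  SiteModel S[] = {{false, true, 16}};
  std::printf("advantage = %lld bytes\n",
              (long long)advantageInBytes(S, 1, 16));
}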
- SmallVector<MachineInstr*, 4> MovVector(4, nullptr); - unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; if (MaxAdjust > 4) - MovVector.resize(MaxAdjust, nullptr); + Context.MovVector.resize(MaxAdjust, nullptr); do { int Opcode = I->getOpcode(); @@ -230,77 +304,86 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || !I->getOperand(X86::AddrDisp).isImm()) - return false; + return; int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); - assert(StackDisp >= 0 && "Negative stack displacement when passing parameters"); + assert(StackDisp >= 0 && + "Negative stack displacement when passing parameters"); // We really don't want to consider the unaligned case. if (StackDisp % 4) - return false; + return; StackDisp /= 4; - assert((size_t)StackDisp < MovVector.size() && - "Function call has more parameters than the stack is adjusted for."); + assert((size_t)StackDisp < Context.MovVector.size() && + "Function call has more parameters than the stack is adjusted for."); // If the same stack slot is being filled twice, something's fishy. - if (MovVector[StackDisp] != nullptr) - return false; - MovVector[StackDisp] = I; + if (Context.MovVector[StackDisp] != nullptr) + return; + Context.MovVector[StackDisp] = I; ++I; } while (I != MBB.end()); // We now expect the end of the sequence - a call and a stack adjust. if (I == MBB.end()) - return false; + return; // For PCrel calls, we expect an additional COPY of the basereg. // If we find one, skip it. if (I->isCopy()) { if (I->getOperand(1).getReg() == - MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg()) + MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg()) ++I; else - return false; + return; } if (!I->isCall()) - return false; - MachineBasicBlock::iterator Call = I; + return; + + Context.Call = I; if ((++I)->getOpcode() != FrameDestroyOpcode) - return false; + return; // Now, go through the vector, and see that we don't have any gaps, // but only a series of 32-bit MOVs. - - int64_t ExpectedDist = 0; - auto MMI = MovVector.begin(), MME = MovVector.end(); - for (; MMI != MME; ++MMI, ExpectedDist += 4) + auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end(); + for (; MMI != MME; ++MMI, Context.ExpectedDist += 4) if (*MMI == nullptr) break; - + // If the call had no parameters, do nothing - if (!ExpectedDist) - return false; + if (MMI == Context.MovVector.begin()) + return; - // We are either at the last parameter, or a gap. + // We are either at the last parameter, or a gap. // Make sure it's not a gap for (; MMI != MME; ++MMI) if (*MMI != nullptr) - return false; + return; + + Context.UsePush = true; + return; +} +bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, + MachineBasicBlock::iterator I, + const CallContext &Context) { // Ok, we can in fact do the transformation for this call. // Do not remove the FrameSetup instruction, but adjust the parameters. // PEI will end up finalizing the handling of this. - FrameSetup->getOperand(1).setImm(ExpectedDist); + MachineBasicBlock::iterator FrameSetup = I; + MachineBasicBlock &MBB = *(I->getParent()); + FrameSetup->getOperand(1).setImm(Context.ExpectedDist); DebugLoc DL = I->getDebugLoc(); // Now, iterate through the vector in reverse order, and replace the movs - // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to + // with pushes. 
MOVmi/MOVmr doesn't have any defs, so no need to // replace uses. - for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) { - MachineBasicBlock::iterator MOV = *MovVector[Idx]; + for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) { + MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); if (MOV->getOpcode() == X86::MOV32mi) { unsigned PushOpcode = X86::PUSHi32; @@ -313,20 +396,21 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, if (isInt<8>(Val)) PushOpcode = X86::PUSH32i8; } - BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp); + BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp); } else { unsigned int Reg = PushOp.getReg(); // If PUSHrmm is not slow on this target, try to fold the source of the // push into the instruction. - const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); // Check that this is legal to fold. Right now, we're extremely // conservative about that. MachineInstr *DefMov = nullptr; if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { - MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm)); + MachineInstr *Push = + BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); unsigned NumOps = DefMov->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) @@ -334,7 +418,9 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, DefMov->eraseFromParent(); } else { - BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr(); + BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) + .addReg(Reg) + .getInstr(); } } @@ -343,8 +429,8 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, // The stack-pointer copy is no longer used in the call sequences. // There should not be any other users, but we can't commit to that, so: - if (MRI->use_empty(SPCopy->getOperand(0).getReg())) - SPCopy->eraseFromParent(); + if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg())) + Context.SPCopy->eraseFromParent(); // Once we've done this, we need to make sure PEI doesn't assume a reserved // frame. @@ -381,17 +467,11 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( DefMI->getParent() != FrameSetup->getParent()) return nullptr; - // Be careful with movs that load from a stack slot, since it may get - // resolved incorrectly. - // TODO: Again, we already have the infrastructure, so this should work. - if (!DefMI->getOperand(1).isReg()) - return nullptr; - // Now, make sure everything else up until the ADJCALLSTACK is a sequence // of MOVs. To be less conservative would require duplicating a lot of the // logic from PeepholeOptimizer. // FIXME: A possibly better approach would be to teach the PeepholeOptimizer - // to be smarter about folding into pushes. + // to be smarter about folding into pushes. for (auto I = DefMI; I != FrameSetup; ++I) if (I->getOpcode() != X86::MOV32rm) return nullptr; diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 75a2ec0..8f88888 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -34,11 +34,22 @@ def RetCC_X86Common : CallingConv<[ // // For code that doesn't care about the ABI, we allow returning more than two // integer values in registers. 
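The additions to RetCC_X86Common that follow (and the matching entries in the argument conventions further down) promote AVX-512 boolean vectors to integer vectors with the same lane count before a register is assigned. As a rough aid only, the element width they pick can be described as max(8, 128/lanes) bits; a tiny sketch of that mapping, not taken from LLVM:

// Hypothetical helper mirroring the vNi1 promotions added below
// (v2i1->v2i64, v4i1->v4i32, v8i1->v8i16, v16i1->v16i8, v32i1->v32i8,
// v64i1->v64i8): each i1 lane is widened to max(8, 128/lanes) bits.
unsigned promotedElementBits(unsigned Lanes) {
  unsigned Bits = 128 / Lanes;
  return Bits < 8 ? 8 : Bits;
}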
+ CCIfType<[i1], CCPromoteToType<i8>>, CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>, + // Boolean vectors of AVX-512 are returned in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. @@ -241,8 +252,8 @@ def CC_X86_64_C : CallingConv<[ // Handles byval parameters. CCIfByVal<CCPassByVal<8, 8>>, - // Promote i8/i16 arguments to i32. - CCIfType<[i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in R10. CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>, @@ -258,6 +269,16 @@ def CC_X86_64_C : CallingConv<[ CCIfSubtarget<"hasSSE2()", CCPromoteToType<v2i64>>>>, + // Boolean vectors of AVX-512 are passed in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + // The first 8 FP/Vector arguments are passed in XMM registers. CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", @@ -303,8 +324,8 @@ def CC_X86_Win64_C : CallingConv<[ // FIXME: Handle byval stuff. // FIXME: Handle varargs. - // Promote i8/i16 arguments to i32. - CCIfType<[i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in R10. CCIfNest<CCAssignToReg<[R10]>>, @@ -424,9 +445,61 @@ def CC_X86_64_AnyReg : CallingConv<[ // X86 C Calling Convention //===----------------------------------------------------------------------===// +/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector +/// values are spilled on the stack. +def CC_X86_32_Vector_Common : CallingConv<[ + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> +]>; + +// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in +// vector registers +def CC_X86_32_Vector_Standard : CallingConv<[ + // SSE vector arguments are passed in XMM registers. 
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasFp256()", + CCAssignToReg<[YMM0, YMM1, YMM2]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + +// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in +// vector registers. +def CC_X86_32_Vector_Darwin : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasFp256()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + /// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP -/// values are spilled on the stack, and the first 4 vector values go in XMM -/// regs. +/// values are spilled on the stack. def CC_X86_32_Common : CallingConv<[ // Handles byval parameters. CCIfByVal<CCPassByVal<4, 4>>, @@ -452,29 +525,30 @@ def CC_X86_32_Common : CallingConv<[ // Long doubles get slots whose size depends on the subtarget. CCIfType<[f80], CCAssignToStack<0, 4>>, - // The first 4 SSE vector arguments are passed in XMM registers. - CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, - - // The first 4 AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCIfSubtarget<"hasFp256()", - CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, - - // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, - - // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - CCAssignToStack<32, 32>>, + // Boolean vectors of AVX-512 are passed in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are // passed in the parameter area. - CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>; + CCIfType<[x86mmx], CCAssignToStack<8, 4>>, + + // Darwin passes vectors in a form that differs from the i386 psABI + CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>, + + // Otherwise, drop to 'normal' X86-32 CC + CCDelegateTo<CC_X86_32_Vector_Standard> +]>; def CC_X86_32_C : CallingConv<[ - // Promote i8/i16 arguments to i32. - CCIfType<[i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16 arguments to i32. 
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in ECX. CCIfNest<CCAssignToReg<[ECX]>>, @@ -488,8 +562,8 @@ def CC_X86_32_C : CallingConv<[ ]>; def CC_X86_32_FastCall : CallingConv<[ - // Promote i8/i16 arguments to i32. - CCIfType<[i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest<CCAssignToReg<[EAX]>>, @@ -534,15 +608,15 @@ def CC_X86_32_ThisCall_Common : CallingConv<[ ]>; def CC_X86_32_ThisCall_Mingw : CallingConv<[ - // Promote i8/i16 arguments to i32. - CCIfType<[i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, CCDelegateTo<CC_X86_32_ThisCall_Common> ]>; def CC_X86_32_ThisCall_Win : CallingConv<[ - // Promote i8/i16 arguments to i32. - CCIfType<[i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, // Pass sret arguments indirectly through stack. CCIfSRet<CCAssignToStack<4, 4>>, @@ -561,8 +635,8 @@ def CC_X86_32_FastCC : CallingConv<[ // puts arguments in registers. CCIfByVal<CCPassByVal<4, 4>>, - // Promote i8/i16 arguments to i32. - CCIfType<[i8, i16], CCPromoteToType<i32>>, + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in EAX. CCIfNest<CCAssignToReg<[EAX]>>, @@ -626,6 +700,9 @@ def CC_Intel_OCL_BI : CallingConv<[ CCIfType<[v16f32, v8f64, v16i32, v8i64], CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, + // Pass masks in mask registers + CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>, + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, CCDelegateTo<CC_X86_32_C> diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp new file mode 100644 index 0000000..1b00997 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -0,0 +1,189 @@ +//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling, if-conversion, other late +// optimizations, or simply the encoding of the instructions. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. 
+#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/GlobalValue.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-pseudo" + +namespace { +class X86ExpandPseudo : public MachineFunctionPass { +public: + static char ID; + X86ExpandPseudo() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const X86Subtarget *STI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + const X86FrameLowering *X86FL; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "X86 pseudo instruction expansion pass"; + } + +private: + bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool ExpandMBB(MachineBasicBlock &MBB); +}; +char X86ExpandPseudo::ID = 0; +} // End anonymous namespace. + +/// If \p MBBI is a pseudo instruction, this method expands +/// it to the corresponding (sequence of) actual instruction(s). +/// \returns true if \p MBBI has been expanded. +bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + DebugLoc DL = MBBI->getDebugLoc(); + switch (Opcode) { + default: + return false; + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: { + bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; + MachineOperand &JumpTarget = MBBI->getOperand(0); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + + if (StackAdj) { + bool Is64Bit = STI->is64Bit(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = + STI->isTarget64BitLP64() || STI->isTargetNaCl64(); + // Check if we should use LEA for SP. + bool UseLEAForSP = STI->useLeaForSP() && + X86FL->canUseLEAForSPInEpilogue(*MBB.getParent()); + unsigned StackPtr = TRI->getStackRegister(); + // Check for possible merge with preceding ADD instruction. + StackAdj += X86FrameLowering::mergeSPUpdates(MBB, MBBI, StackPtr, true); + X86FrameLowering::emitSPUpdate(MBB, MBBI, StackPtr, StackAdj, Is64Bit, + Uses64BitFramePtr, UseLEAForSP, *TII, + *TRI); + } + + // Jump to label or value in register. + bool IsWin64 = STI->isTargetWin64(); + if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) { + unsigned Op = (Opcode == X86::TCRETURNdi) + ? X86::TAILJMPd + : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) { + unsigned Op = (Opcode == X86::TCRETURNmi) + ? X86::TAILJMPm + : (IsWin64 ? 
X86::TAILJMPm64_REX : X86::TAILJMPm64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); + for (unsigned i = 0; i != 5; ++i) + MIB.addOperand(MBBI->getOperand(i)); + } else if (Opcode == X86::TCRETURNri64) { + BuildMI(MBB, MBBI, DL, + TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } else { + BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = std::prev(MBBI); + NewMI->copyImplicitOps(*MBBI->getParent()->getParent(), MBBI); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + + return true; + } + case X86::EH_RETURN: + case X86::EH_RETURN64: { + MachineOperand &DestAddr = MBBI->getOperand(0); + assert(DestAddr.isReg() && "Offset should be in register!"); + const bool Uses64BitFramePtr = + STI->isTarget64BitLP64() || STI->isTargetNaCl64(); + unsigned StackPtr = TRI->getStackRegister(); + BuildMI(MBB, MBBI, DL, + TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr) + .addReg(DestAddr.getReg()); + // The EH_RETURN pseudo is really removed during the MC Lowering. + return true; + } + } + llvm_unreachable("Previous switch has a fallthrough?"); +} + +/// Expand all pseudo instructions contained in \p MBB. +/// \returns true if any expansion occurred for \p MBB. +bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + // MBBI may be invalidated by the expansion. + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= ExpandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + STI = &static_cast<const X86Subtarget &>(MF.getSubtarget()); + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); + X86FL = STI->getFrameLowering(); + + bool Modified = false; + for (MachineBasicBlock &MBB : MF) + Modified |= ExpandMBB(MBB); + return Modified; +} + +/// Returns an instance of the pseudo instruction expansion pass. 
+FunctionPass *llvm::createX86ExpandPseudoPass() { + return new X86ExpandPseudo(); +} diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index 688a544..9af0aeb 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -58,8 +59,8 @@ class X86FastISel final : public FastISel { public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) - : FastISel(funcInfo, libInfo) { - Subtarget = &TM.getSubtarget<X86Subtarget>(); + : FastISel(funcInfo, libInfo) { + Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); } @@ -80,15 +81,15 @@ public: #include "X86GenFastISel.inc" private: - bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); + bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL); - bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, - unsigned &ResultReg); + bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO, + unsigned &ResultReg, unsigned Alignment = 1); - bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, + bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM, MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, - const X86AddressMode &AM, + X86AddressMode &AM, MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, @@ -123,11 +124,15 @@ private: bool X86SelectTrunc(const Instruction *I); + bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc, + const TargetRegisterClass *RC); + bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); + bool X86SelectSIToFP(const Instruction *I); const X86InstrInfo *getInstrInfo() const { - return getTargetMachine()->getSubtargetImpl()->getInstrInfo(); + return Subtarget->getInstrInfo(); } const X86TargetMachine *getTargetMachine() const { return static_cast<const X86TargetMachine *>(&TM); @@ -160,6 +165,9 @@ private: bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, const Value *Cond); + + const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB, + X86AddressMode &AM); }; } // end anonymous namespace. @@ -237,6 +245,20 @@ getX86SSEConditionCode(CmpInst::Predicate Predicate) { return std::make_pair(CC, NeedSwap); } +/// \brief Adds a complex addressing mode to the given machine instr builder. +/// Note, this will constrain the index register. If its not possible to +/// constrain the given index register, then a new one will be created. The +/// IndexReg field of the addressing mode will be updated to match in this case. +const MachineInstrBuilder & +X86FastISel::addFullAddress(const MachineInstrBuilder &MIB, + X86AddressMode &AM) { + // First constrain the index register. It needs to be a GR64_NOSP. 
+ AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg, + MIB->getNumOperands() + + X86::AddrIndexReg); + return ::addFullAddress(MIB, AM); +} + /// \brief Check if it is possible to fold the condition from the XALU intrinsic /// into the user. The condition code will only be updated on success. bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, @@ -321,8 +343,9 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. -bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, - MachineMemOperand *MMO, unsigned &ResultReg) { +bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, + MachineMemOperand *MMO, unsigned &ResultReg, + unsigned Alignment) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; @@ -367,6 +390,30 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, case MVT::f80: // No f80 support yet. return false; + case MVT::v4f32: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSrm : X86::MOVUPSrm; + RC = &X86::VR128RegClass; + break; + case MVT::v2f64: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDrm : X86::MOVAPDrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPDrm : X86::MOVUPDrm; + RC = &X86::VR128RegClass; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVDQArm : X86::MOVDQArm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUrm : X86::MOVDQUrm; + RC = &X86::VR128RegClass; + break; } ResultReg = createResultReg(RC); @@ -383,7 +430,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, - const X86AddressMode &AM, + X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; @@ -444,7 +491,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM, + X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa<ConstantPointerNull>(Val)) @@ -1063,8 +1110,14 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { if (!X86SelectAddress(Ptr, AM)) return false; + unsigned Alignment = LI->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + unsigned ResultReg = 0; - if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg, + Alignment)) return false; updateValueMap(I, ResultReg); @@ -1089,27 +1142,37 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { } } -/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS -/// of the comparison, return an opcode that works for the compare (e.g. 
-/// CMP32ri) otherwise return 0. +/// If we have a comparison with RHS as the RHS of the comparison, return an +/// opcode that works for the compare (e.g. CMP32ri) otherwise return 0. static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { + int64_t Val = RHSC->getSExtValue(); switch (VT.getSimpleVT().SimpleTy) { // Otherwise, we can't fold the immediate into this comparison. - default: return 0; - case MVT::i8: return X86::CMP8ri; - case MVT::i16: return X86::CMP16ri; - case MVT::i32: return X86::CMP32ri; + default: + return 0; + case MVT::i8: + return X86::CMP8ri; + case MVT::i16: + if (isInt<8>(Val)) + return X86::CMP16ri8; + return X86::CMP16ri; + case MVT::i32: + if (isInt<8>(Val)) + return X86::CMP32ri8; + return X86::CMP32ri; case MVT::i64: + if (isInt<8>(Val)) + return X86::CMP64ri8; // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext // field. - if ((int)RHSC->getSExtValue() == RHSC->getSExtValue()) + if (isInt<32>(Val)) return X86::CMP64ri32; return 0; } } bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, - EVT VT) { + EVT VT, DebugLoc CurDbgLoc) { unsigned Op0Reg = getRegForValue(Op0); if (Op0Reg == 0) return false; @@ -1122,7 +1185,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, // CMPri, otherwise use CMPrr. if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareImmOpc)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc)) .addReg(Op0Reg) .addImm(Op1C->getSExtValue()); return true; @@ -1134,7 +1197,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, unsigned Op1Reg = getRegForValue(Op1); if (Op1Reg == 0) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareOpc)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc)) .addReg(Op0Reg) .addReg(Op1Reg); @@ -1202,7 +1265,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { ResultReg = createResultReg(&X86::GR8RegClass); if (SETFOpc) { - if (!X86FastEmitCompare(LHS, RHS, VT)) + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); @@ -1227,7 +1290,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { std::swap(LHS, RHS); // Emit a compare of LHS/RHS. - if (!X86FastEmitCompare(LHS, RHS, VT)) + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); @@ -1353,7 +1416,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { std::swap(CmpLHS, CmpRHS); // Emit a compare of the LHS and RHS, setting the flags. - if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT)) + if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) @@ -1740,7 +1803,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { EVT CmpVT = TLI.getValueType(CmpLHS->getType()); // Emit a compare of the LHS and RHS, setting the flags. - if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; if (SETFOpc) { @@ -1805,11 +1868,11 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return true; } -/// \brief Emit SSE instructions to lower the select. 
+/// \brief Emit SSE or AVX instructions to lower the select. /// /// Try to use SSE1/SSE2 instructions to simulate a select without branches. /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary -/// SSE instructions are available. +/// SSE instructions are available. If AVX is available, try to use a VBLENDV. bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { // Optimize conditions coming from a compare if both instructions are in the // same basic block (values defined in other basic blocks may not have @@ -1845,19 +1908,17 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { if (NeedSwap) std::swap(CmpLHS, CmpRHS); - static unsigned OpcTable[2][2][4] = { - { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, - { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, - { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, - { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } + // Choose the SSE instruction sequence based on data type (float or double). + static unsigned OpcTable[2][4] = { + { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr } }; - bool HasAVX = Subtarget->hasAVX(); unsigned *Opc = nullptr; switch (RetVT.SimpleTy) { default: return false; - case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; - case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; + case MVT::f32: Opc = &OpcTable[0][0]; break; + case MVT::f64: Opc = &OpcTable[1][0]; break; } const Value *LHS = I->getOperand(1); @@ -1879,14 +1940,33 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); - unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, - CmpRHSReg, CmpRHSIsKill, CC); - unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, - LHSReg, LHSIsKill); - unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, - RHSReg, RHSIsKill); - unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, - AndReg, /*IsKill=*/true); + unsigned ResultReg; + + if (Subtarget->hasAVX()) { + // If we have AVX, create 1 blendv instead of 3 logic instructions. + // Blendv was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + unsigned CmpOpcode = + (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; + unsigned BlendOpcode = + (RetVT.SimpleTy == MVT::f32) ? 
X86::VBLENDVPSrr : X86::VBLENDVPDrr; + + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + } else { + unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + } updateValueMap(I, ResultReg); return true; } @@ -1924,7 +2004,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { std::swap(CmpLHS, CmpRHS); EVT CmpVT = TLI.getValueType(CmpLHS->getType()); - if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; } else { unsigned CondReg = getRegForValue(Cond); @@ -2001,41 +2081,84 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { return false; } +bool X86FastISel::X86SelectSIToFP(const Instruction *I) { + // The target-independent selection algorithm in FastISel already knows how + // to select a SINT_TO_FP if the target is SSE but not AVX. + // Early exit if the subtarget doesn't have AVX. + if (!Subtarget->hasAVX()) + return false; + + if (!I->getOperand(0)->getType()->isIntegerTy(32)) + return false; + + // Select integer to float/double conversion. + unsigned OpReg = getRegForValue(I->getOperand(0)); + if (OpReg == 0) + return false; + + const TargetRegisterClass *RC = nullptr; + unsigned Opcode; + + if (I->getType()->isDoubleTy()) { + // sitofp int -> double + Opcode = X86::VCVTSI2SDrr; + RC = &X86::FR64RegClass; + } else if (I->getType()->isFloatTy()) { + // sitofp int -> float + Opcode = X86::VCVTSI2SSrr; + RC = &X86::FR32RegClass; + } else + return false; + + unsigned ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + unsigned ResultReg = + fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false); + updateValueMap(I, ResultReg); + return true; +} + +// Helper method used by X86SelectFPExt and X86SelectFPTrunc. +bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, + unsigned TargetOpc, + const TargetRegisterClass *RC) { + assert((I->getOpcode() == Instruction::FPExt || + I->getOpcode() == Instruction::FPTrunc) && + "Instruction must be an FPExt or FPTrunc!"); + + unsigned OpReg = getRegForValue(I->getOperand(0)); + if (OpReg == 0) + return false; + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), + ResultReg); + if (Subtarget->hasAVX()) + MIB.addReg(OpReg); + MIB.addReg(OpReg); + updateValueMap(I, ResultReg); + return true; +} + bool X86FastISel::X86SelectFPExt(const Instruction *I) { - // fpext from float to double. 
- if (X86ScalarSSEf64 && - I->getType()->isDoubleTy()) { - const Value *V = I->getOperand(0); - if (V->getType()->isFloatTy()) { - unsigned OpReg = getRegForValue(V); - if (OpReg == 0) return false; - unsigned ResultReg = createResultReg(&X86::FR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::CVTSS2SDrr), ResultReg) - .addReg(OpReg); - updateValueMap(I, ResultReg); - return true; - } + if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && + I->getOperand(0)->getType()->isFloatTy()) { + // fpext from float to double. + unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; + return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass); } return false; } bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { - if (X86ScalarSSEf64) { - if (I->getType()->isFloatTy()) { - const Value *V = I->getOperand(0); - if (V->getType()->isDoubleTy()) { - unsigned OpReg = getRegForValue(V); - if (OpReg == 0) return false; - unsigned ResultReg = createResultReg(&X86::FR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::CVTSD2SSrr), ResultReg) - .addReg(OpReg); - updateValueMap(I, ResultReg); - return true; - } - } + if (X86ScalarSSEf64 && I->getType()->isFloatTy() && + I->getOperand(0)->getType()->isDoubleTy()) { + // fptrunc from double to float. + unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr; + return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass); } return false; @@ -2062,6 +2185,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { return true; } + bool KillInputReg = false; if (!Subtarget->is64Bit()) { // If we're on x86-32; we can't extract an i8 from a general register. // First issue a copy to GR16_ABCD or GR32_ABCD. @@ -2071,11 +2195,12 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); InputReg = CopyReg; + KillInputReg = true; } // Issue an extract_subreg. unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, - InputReg, /*Kill=*/true, + InputReg, KillInputReg, X86::sub_8bit); if (!ResultReg) return false; @@ -2127,7 +2252,73 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // FIXME: Handle more intrinsics. switch (II->getIntrinsicID()) { default: return false; + case Intrinsic::convert_from_fp16: + case Intrinsic::convert_to_fp16: { + if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) + return false; + + const Value *Op = II->getArgOperand(0); + unsigned InputReg = getRegForValue(Op); + if (InputReg == 0) + return false; + + // F16C only allows converting from float to half and from half to float. + bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16; + if (IsFloatToHalf) { + if (!Op->getType()->isFloatTy()) + return false; + } else { + if (!II->getType()->isFloatTy()) + return false; + } + + unsigned ResultReg = 0; + const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16); + if (IsFloatToHalf) { + // 'InputReg' is implicitly promoted from register class FR32 to + // register class VR128 by method 'constrainOperandRegClass' which is + // directly called by 'fastEmitInst_ri'. + // Instruction VCVTPS2PHrr takes an extra immediate operand which is + // used to provide rounding control. + InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 0); + + // Move the lower 32-bits of ResultReg to another register of class GR32. 
+ ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::VMOVPDI2DIrr), ResultReg) + .addReg(InputReg, RegState::Kill); + + // The result value is in the lower 16-bits of ResultReg. + unsigned RegIdx = X86::sub_16bit; + ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); + } else { + assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); + // Explicitly sign-extend the input to 32-bit. + InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg, + /*Kill=*/false); + + // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. + InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, + InputReg, /*Kill=*/true); + + InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true); + + // The result value is in the lower 32-bits of ResultReg. + // Emit an explicit copy from register class VR128 to register class FR32. + ResultReg = createResultReg(&X86::FR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(InputReg, RegState::Kill); + } + + updateValueMap(II, ResultReg); + return true; + } case Intrinsic::frameaddress: { + MachineFunction *MF = FuncInfo.MF; + if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI()) + return false; + Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; @@ -2145,12 +2336,11 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // This needs to be set before we call getPtrSizedFrameRegister, otherwise // we get the wrong frame register. - MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MachineFrameInfo *MFI = MF->getFrameInfo(); MFI->setFrameAddressIsTaken(true); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); - unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF)); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -2247,6 +2437,8 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); // FIXME may need to add RegState::Debug to any registers produced, // although ESP/EBP should be the only ones at the moment. + assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) && + "Expected inlined-at fields to agree"); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) .addImm(0) .addMetadata(DI->getVariable()) @@ -2738,8 +2930,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { .addImm(NumBytes).addImm(0); // Walk the register/memloc assignments, inserting copies/loads. 
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign const &VA = ArgLocs[i]; const Value *ArgVal = OutVals[VA.getValNo()]; @@ -2870,7 +3061,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), @@ -2934,7 +3125,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Add a register mask operand representing the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CC)); + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) @@ -3044,6 +3235,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { return X86SelectFPExt(I); case Instruction::FPTrunc: return X86SelectFPTrunc(I); + case Instruction::SIToFP: + return X86SelectSIToFP(I); case Instruction::IntToPtr: // Deliberate fall-through. case Instruction::PtrToInt: { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); @@ -3189,8 +3382,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { TII.get(Opc), ResultReg); addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), Align); + MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, + TM.getDataLayout()->getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } @@ -3224,7 +3417,10 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { ResultReg) .addGlobalAddress(GV); } else { - unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + unsigned Opc = TLI.getPointerTy() == MVT::i32 + ? (Subtarget->isTarget64BitILP32() + ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); } @@ -3266,7 +3462,10 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { X86AddressMode AM; if (!X86SelectAddress(C, AM)) return 0; - unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + unsigned Opc = TLI.getPointerTy() == MVT::i32 + ? (Subtarget->isTarget64BitILP32() + ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3337,6 +3536,23 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, if (!Result) return false; + // The index register could be in the wrong register class. Unfortunately, + // foldMemoryOperandImpl could have commuted the instruction so its not enough + // to just look at OpNo + the offset to the index reg. We actually need to + // scan the instruction to find the index reg and see if its the correct reg + // class. 
+ for (MIOperands MO(Result); MO.isValid(); ++MO) { + if (!MO->isReg() || MO->isDef() || MO->getReg() != AM.IndexReg) + continue; + // Found the index reg, now try to rewrite it. + unsigned OpNo = MO.getOperandNo(); + unsigned IndexReg = constrainOperandRegClass(Result->getDesc(), + MO->getReg(), OpNo); + if (IndexReg == MO->getReg()) + continue; + MO->setReg(IndexReg); + } + Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index 8e033a0..b39c5ab 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -88,7 +88,6 @@ public: private: MachineFunction *MF; - const TargetMachine *TM; const X86InstrInfo *TII; // Machine instruction info. }; char FixupLEAPass::ID = 0; @@ -150,13 +149,11 @@ FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; - TM = &Func.getTarget(); - const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>(); + const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); if (!ST.LEAusesAG() && !ST.slowLEA()) return false; - TII = - static_cast<const X86InstrInfo *>(TM->getSubtargetImpl()->getInstrInfo()); + TII = ST.getInstrInfo(); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. @@ -219,7 +216,7 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return CurInst; } InstrDistance += TII->getInstrLatency( - TM->getSubtargetImpl()->getInstrItineraryData(), CurInst); + MF->getSubtarget().getInstrItineraryData(), CurInst); Found = getPreviousInstr(CurInst, MFI); } return nullptr; @@ -333,7 +330,7 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI) { for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - if (TM->getSubtarget<X86Subtarget>().isSLM()) + if (MF.getSubtarget<X86Subtarget>().isSLM()) processInstructionForSLM(I, MFI); else processInstruction(I, MFI); diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index 6189109..3b0bd03 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -32,10 +32,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Support/Debug.h" @@ -300,7 +300,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // function. If it is all integer, there is nothing for us to do! bool FPIsUsed = false; - assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!"); + static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); for (unsigned i = 0; i <= 6; ++i) if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { FPIsUsed = true; @@ -438,7 +438,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // Rewind to first instruction newly inserted. 
while (Start != BB.begin() && std::prev(Start) != PrevI) --Start; dbgs() << "Inserted instructions:\n\t"; - Start->print(dbgs(), &MF.getTarget()); + Start->print(dbgs()); while (++Start != std::next(I)) {} } dumpStack(); @@ -898,7 +898,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Now we should have the correct registers live. DEBUG(dumpStack()); - assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch"); + assert(StackTop == countPopulation(Mask) && "Live count mismatch"); } /// shuffleStackTop - emit fxch instructions before I to shuffle the top @@ -943,7 +943,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { } } - unsigned N = CountTrailingOnes_32(STReturns); + unsigned N = countTrailingOnes(STReturns); // FP registers used for function return must be consecutive starting at // FP0. @@ -1420,14 +1420,14 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { if (STUses && !isMask_32(STUses)) MI->emitError("fixed input regs must be last on the x87 stack"); - unsigned NumSTUses = CountTrailingOnes_32(STUses); + unsigned NumSTUses = countTrailingOnes(STUses); // Defs must be contiguous from the stack top. ST0-STn. if (STDefs && !isMask_32(STDefs)) { MI->emitError("output regs must be last on the x87 stack"); STDefs = NextPowerOf2(STDefs) - 1; } - unsigned NumSTDefs = CountTrailingOnes_32(STDefs); + unsigned NumSTDefs = countTrailingOnes(STDefs); // So must the clobbered stack slots. ST0-STm, m >= n. if (STClobbers && !isMask_32(STDefs | STClobbers)) @@ -1437,7 +1437,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { unsigned STPopped = STUses & (STDefs | STClobbers); if (STPopped && !isMask_32(STPopped)) MI->emitError("implicitly popped regs must be last on the x87 stack"); - unsigned NumSTPopped = CountTrailingOnes_32(STPopped); + unsigned NumSTPopped = countTrailingOnes(STPopped); DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops " << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n"); diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 66f54e9..db58d9c 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -109,6 +109,14 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { } } +static unsigned getSUBrrOpcode(unsigned isLP64) { + return isLP64 ? X86::SUB64rr : X86::SUB32rr; +} + +static unsigned getADDrrOpcode(unsigned isLP64) { + return isLP64 ? X86::ADD64rr : X86::ADD32rr; +} + static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) @@ -182,14 +190,27 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, return 0; } +static bool isEAXLiveIn(MachineFunction &MF) { + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), + EE = MF.getRegInfo().livein_end(); II != EE; ++II) { + unsigned Reg = II->first; + + if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL) + return true; + } + + return false; +} /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. 
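The X86FloatingPoint hunks above replace the old fixed-width helpers (CountPopulation_32, CountTrailingOnes_32) with the templated countPopulation/countTrailingOnes, alongside the existing isMask_32 checks. A rough standalone illustration of what those helpers compute, using only the C++20 <bit> header rather than LLVM's MathExtras; the STReturns value is a made-up example mask:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  // Example: x87 return values in ST0 and ST1 -> bits 0 and 1 set.
  uint32_t STReturns = 0b0011;

  // countPopulation(Mask): number of set bits.
  assert(std::popcount(STReturns) == 2);

  // countTrailingOnes(Mask): length of the run of 1s starting at bit 0,
  // i.e. how many consecutive ST registers are used from ST0 upward.
  assert(std::countr_one(STReturns) == 2);

  // isMask_32(Mask): true when the set bits form exactly such a run.
  auto isMask32 = [](uint32_t v) { return v != 0 && ((v + 1) & v) == 0; };
  assert(isMask32(STReturns) && !isMask32(0b0101u));
  return 0;
}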
-static -void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, int64_t NumBytes, - bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA, - const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { +void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, int64_t NumBytes, + bool Is64BitTarget, bool Is64BitStackPtr, + bool UseLEA, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI) { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; unsigned Opc; @@ -204,7 +225,33 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc DL = MBB.findDebugLoc(MBBI); while (Offset) { - uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; + if (Offset > Chunk) { + // Rather than emit a long series of instructions for large offsets, + // load the offset into a register and do one sub/add + unsigned Reg = 0; + + if (isSub && !isEAXLiveIn(*MBB.getParent())) + Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX); + else + Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); + + if (Reg) { + Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri; + BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg) + .addImm(Offset); + Opc = isSub + ? getSUBrrOpcode(Is64BitTarget) + : getADDrrOpcode(Is64BitTarget); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addReg(Reg); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + Offset = 0; + continue; + } + } + + uint64_t ThisVal = std::min(Offset, Chunk); if (ThisVal == (Is64BitTarget ? 8 : 4)) { // Use push / pop instead. unsigned Reg = isSub @@ -266,45 +313,10 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } -/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower -/// iterator. -static -void mergeSPUpdatesDown(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = nullptr) { - // FIXME: THIS ISN'T RUN!!! - return; - - if (MBBI == MBB.end()) return; - - MachineBasicBlock::iterator NI = std::next(MBBI); - if (NI == MBB.end()) return; - - unsigned Opc = NI->getOpcode(); - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && - NI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes -= NI->getOperand(2).getImm(); - MBB.erase(NI); - MBBI = NI; - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || - Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && - NI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes += NI->getOperand(2).getImm(); - MBB.erase(NI); - MBBI = NI; - } -} - -/// mergeSPUpdates - Checks the instruction before/after the passed -/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and -/// the stack adjustment is returned as a positive value for ADD/LEA and a -/// negative for SUB. 
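The rewritten emitSPUpdate above adds a fast path for very large frames: instead of emitting a long series of SUB/ADD instructions each limited to the immediate-sized Chunk, it materializes the offset into a dead scratch register (RAX/EAX when free) and performs a single register SUB/ADD. The sketch below models only that control flow with plain integers and printed pseudo-instructions; the chunk value and register names are illustrative, not taken from the generated code.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Model of the SP-update loop: negative NumBytes allocates (sub), positive
// deallocates (add). Chunk mimics the largest immediate one sub/add can take.
void emitSPUpdateModel(int64_t NumBytes, bool ScratchRegFree) {
  const bool IsSub = NumBytes < 0;
  uint64_t Offset = IsSub ? -NumBytes : NumBytes;
  const uint64_t Chunk = (1LL << 31) - 1; // illustrative 32-bit immediate limit

  while (Offset) {
    if (Offset > Chunk && ScratchRegFree) {
      // One MOV of the full offset into a scratch register, then one sub/add.
      std::printf("  mov $%llu, %%rax\n", (unsigned long long)Offset);
      std::printf("  %s %%rax, %%rsp\n", IsSub ? "sub" : "add");
      Offset = 0;
      continue;
    }
    uint64_t ThisVal = std::min(Offset, Chunk);
    std::printf("  %s $%llu, %%rsp\n", IsSub ? "sub" : "add",
                (unsigned long long)ThisVal);
    Offset -= ThisVal;
  }
}

int main() {
  emitSPUpdateModel(-(int64_t(1) << 33), /*ScratchRegFree=*/true);  // one mov + one sub
  emitSPUpdateModel(-(int64_t(1) << 33), /*ScratchRegFree=*/false); // several subs
}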
-static int mergeSPUpdates(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, unsigned StackPtr, - bool doMergeWithPrevious) { +int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, + bool doMergeWithPrevious) { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) return 0; @@ -333,19 +345,6 @@ static int mergeSPUpdates(MachineBasicBlock &MBB, return Offset; } -static bool isEAXLiveIn(MachineFunction &MF) { - for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), - EE = MF.getRegInfo().livein_end(); II != EE; ++II) { - unsigned Reg = II->first; - - if (Reg == X86::EAX || Reg == X86::AX || - Reg == X86::AH || Reg == X86::AL) - return true; - } - - return false; -} - void X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -396,12 +395,10 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); bool Is64Bit = STI.is64Bit(); bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); unsigned CallOp; if (Is64Bit) @@ -453,6 +450,35 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, } } +static unsigned calculateSetFPREG(uint64_t SPAdjust) { + // Win64 ABI has a less restrictive limitation of 240; 128 works equally well + // and might require smaller successive adjustments. + const uint64_t Win64MaxSEHOffset = 128; + uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset); + // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode. + return SEHFrameOffset & -16; +} + +// If we're forcing a stack realignment we can't rely on just the frame +// info, we need to know the ABI stack alignment as well in case we +// have a call out. Otherwise just make sure we have some alignment - we'll +// go with the minimum SlotSize. +static uint64_t calculateMaxStackAlign(const MachineFunction &MF) { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned StackAlign = STI.getFrameLowering()->getStackAlignment(); + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + return MaxAlign; +} + /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to @@ -537,52 +563,44 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, - for 32-bit code, substitute %e?? registers for %r?? */ -void X86FrameLowering::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. 
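The two helpers introduced above, calculateSetFPREG and calculateMaxStackAlign, are pure functions, so their arithmetic can be exercised in isolation. The sketch below restates that logic with plain parameters standing in for the MachineFrameInfo/X86Subtarget queries (hasCalls, the desired and ABI alignments, the slot size); only the formulas come from the patch.

#include <algorithm>
#include <cassert>
#include <cstdint>

// .seh_setframe offset: clamp the SP adjustment to 128 and round down to a
// 16-byte boundary, as UWOP_SET_FPREG requires.
uint64_t calculateSetFPREG(uint64_t SPAdjust) {
  const uint64_t Win64MaxSEHOffset = 128;
  uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset);
  return SEHFrameOffset & -16;
}

// Desired alignment, possibly raised to the ABI alignment (when the function
// makes calls) or to the slot size, when stack realignment is forced.
uint64_t calculateMaxStackAlign(uint64_t MaxAlign, bool HasCalls,
                                bool ForceStackAlign, uint64_t StackAlign,
                                uint64_t SlotSize) {
  if (ForceStackAlign) {
    if (HasCalls)
      MaxAlign = std::max(StackAlign, MaxAlign);
    else if (MaxAlign < SlotSize)
      MaxAlign = SlotSize;
  }
  return MaxAlign;
}

int main() {
  assert(calculateSetFPREG(40) == 32);    // small frame: rounded down to 16
  assert(calculateSetFPREG(4096) == 128); // large frame: clamped to 128
  assert(calculateMaxStackAlign(8, true, true, 16, 8) == 16);
  return 0;
}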
+void X86FrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); - bool IsWin64 = STI.isTargetWin64(); + bool IsWin64 = STI.isCallingConvWin64(Fn->getCallingConv()); // Not necessarily synonymous with IsWin64. bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); bool NeedsDwarfCFI = !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); bool UseLEA = STI.useLeaForSP(); - unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); - const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? - getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; + const unsigned MachineFramePtr = + STI.isTarget64BitILP32() + ? getX86SubSuperRegister(FramePtr, MVT::i64, false) + : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); unsigned BasePtr = RegInfo->getBaseRegister(); DebugLoc DL; - // If we're forcing a stack realignment we can't rely on just the frame - // info, we need to know the ABI stack alignment as well in case we - // have a call out. Otherwise just make sure we have some alignment - we'll - // go with the minimum SlotSize. - if (ForceStackAlign) { - if (MFI->hasCalls()) - MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; - else if (MaxAlign < SlotSize) - MaxAlign = SlotSize; - } - // Add RETADDR move area to callee saved frame size. int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta && IsWinEH) + report_fatal_error("Can't handle guaranteed tail call under win64 yet"); + if (TailCallReturnAddrDelta < 0) X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); @@ -602,14 +620,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoRedZone) && + if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && !RegInfo->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64 && // Win64 has no Red Zone - !usesTheStack(MF) && // Don't push and pop. 
- !MF.shouldSplitStack()) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64 && // Win64 has no Red Zone + !usesTheStack(MF) && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -653,14 +670,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // If required, include space for extra hidden slot for stashing base pointer. if (X86FI->getRestoreBasePointer()) FrameSize += SlotSize; - if (RegInfo->needsStackRealignment(MF)) { - // Callee-saved registers are pushed on stack before the stack - // is realigned. - FrameSize -= X86FI->getCalleeSavedFrameSize(); - NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; - } else { - NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); - } + + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + + // Callee-saved registers are pushed on stack before the stack is realigned. + if (RegInfo->needsStackRealignment(MF) && !IsWinEH) + NumBytes = RoundUpToAlignment(NumBytes, MaxAlign); // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. @@ -696,11 +711,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .setMIFlag(MachineInstr::FrameSetup); } - // Update EBP with the new base value. - BuildMI(MBB, MBBI, DL, - TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) - .addReg(StackPtr) - .setMIFlag(MachineInstr::FrameSetup); + if (!IsWinEH) { + // Update EBP with the new base value. + BuildMI(MBB, MBBI, DL, + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), + FramePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. @@ -749,15 +767,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). - if (RegInfo->needsStackRealignment(MF)) { + // Don't do this for Win64, it needs to realign the stack after the prologue. + if (!IsWinEH && RegInfo->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); uint64_t Val = -MaxAlign; MachineInstr *MI = - BuildMI(MBB, MBBI, DL, - TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr) - .addReg(StackPtr) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), + StackPtr) + .addReg(StackPtr) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); @@ -768,10 +787,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // the callee has more arguments then the caller. NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); - // If there is an ADD32ri or SUB32ri of ESP immediately after this - // instruction, merge the two instructions. - mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); - // Adjust stack pointer: ESP -= numbytes. // Windows and cygwin/mingw require a prologue helper routine when allocating @@ -782,7 +797,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // responsible for adjusting the stack pointer. 
Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= StackProbeSize && UseStackProbe) { + uint64_t AlignedNumBytes = NumBytes; + if (IsWinEH && RegInfo->needsStackRealignment(MF)) + AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); + if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); @@ -800,9 +818,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (Is64Bit) { // Handle the 64-bit Windows ABI case where we need to call __chkstk. // Function prologue is responsible for adjusting the stack pointer. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + if (isUInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else if (isInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } } else { // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. @@ -835,68 +863,66 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { UseLEA, TII, *RegInfo); } + if (NeedsWinEH && NumBytes) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + int SEHFrameOffset = 0; - if (NeedsWinEH) { - if (HasFP) { - // We need to set frame base offset low enough such that all saved - // register offsets would be positive relative to it, but we can't - // just use NumBytes, because .seh_setframe offset must be <=240. - // So we pretend to have only allocated enough space to spill the - // non-volatile registers. - // We don't care about the rest of stack allocation, because unwinder - // will restore SP to (BP - SEHFrameOffset) - for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { - int offset = MFI->getObjectOffset(Info.getFrameIdx()); - SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset)); - } - SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant - - // This only needs to account for XMM spill slots, GPR slots - // are covered by the .seh_pushreg's emitted above. 
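In the 64-bit __chkstk path above, the allocation size is now loaded with the smallest encoding that fits: MOV32ri when the value is an unsigned 32-bit quantity (writing EAX zero-extends into RAX), MOV64ri32 when it only fits as a sign-extended 32-bit immediate, and the full MOV64ri otherwise. A standalone restatement of that choice, with isUInt<32>/isInt<32> spelled out; the opcode strings are just labels here:

#include <cstdint>
#include <cstdio>

bool isUInt32(uint64_t x) { return x <= UINT32_MAX; }
bool isInt32(uint64_t x) {
  return (int64_t)x >= INT32_MIN && (int64_t)x <= INT32_MAX;
}

const char *movOpcodeForStackAlloc(uint64_t NumBytes) {
  if (isUInt32(NumBytes))
    return "MOV32ri";   // zero-extends into RAX
  if (isInt32(NumBytes))
    return "MOV64ri32"; // sign-extended 32-bit immediate
  return "MOV64ri";     // full 64-bit immediate
}

int main() {
  std::printf("%s\n", movOpcodeForStackAlloc(0x1000));             // MOV32ri
  std::printf("%s\n", movOpcodeForStackAlloc(0xFFFFFFFF80000000)); // MOV64ri32
  std::printf("%s\n", movOpcodeForStackAlloc(0x100000000));        // MOV64ri
}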
- unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize(); - if (Size) { - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) - .addImm(Size) - .setMIFlag(MachineInstr::FrameSetup); - } + if (IsWinEH && HasFP) { + SEHFrameOffset = calculateSetFPREG(NumBytes); + if (SEHFrameOffset) + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), + StackPtr, false, SEHFrameOffset); + else + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); + if (NeedsWinEH) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); - } else { - // SP will be the base register for restoring XMMs - if (NumBytes) { - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); - } - } } - // Skip the rest of register spilling code - while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { + const MachineInstr *FrameInstr = &*MBBI; ++MBBI; - // Emit SEH info for non-GPRs - if (NeedsWinEH) { - for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { - unsigned Reg = Info.getReg(); - if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) - continue; - assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class"); - - int Offset = getFrameIndexOffset(MF, Info.getFrameIdx()); - Offset += SEHFrameOffset; - - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) - .addImm(Reg) - .addImm(Offset) - .setMIFlag(MachineInstr::FrameSetup); + if (NeedsWinEH) { + int FI; + if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { + if (X86::FR64RegClass.contains(Reg)) { + int Offset = getFrameIndexOffset(MF, FI); + Offset += SEHFrameOffset; + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + } + } } + } + if (NeedsWinEH) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); + + // Realign stack after we spilled callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + // Win64 requires aligning the stack after the prologue. + if (IsWinEH && RegInfo->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + uint64_t Val = -MaxAlign; + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), + StackPtr) + .addReg(StackPtr) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); } // If we need a base pointer, set it up here. It's whatever the value @@ -938,79 +964,92 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { } } +bool X86FrameLowering::canUseLEAForSPInEpilogue( + const MachineFunction &MF) const { + // We can't use LEA instructions for adjusting the stack pointer if this is a + // leaf function in the Win64 ABI. Only ADD instructions may be used to + // deallocate the stack. + // This means that we can use LEA for SP in two situations: + // 1. We *aren't* using the Win64 ABI which means we are free to use LEA. + // 2. We *have* a frame pointer which means we are permitted to use LEA. + return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); +} + +/// Check whether or not the terminators of \p MBB needs to read EFLAGS. 
+static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { + for (const MachineInstr &MI : MBB.terminators()) { + bool BreakNext = false; + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg != X86::EFLAGS) + continue; + + // This terminator needs an eflag that is not defined + // by a previous terminator. + if (!MO.isDef()) + return true; + BreakNext = true; + } + if (BreakNext) + break; + } + return false; +} + void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI != MBB.end() && "Returning block has no instructions"); - unsigned RetOpcode = MBBI->getOpcode(); - DebugLoc DL = MBBI->getDebugLoc(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); const bool Is64BitILP32 = STI.isTarget64BitILP32(); - bool UseLEA = STI.useLeaForSP(); - unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); - unsigned MachineFramePtr = Is64BitILP32 ? - getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; + unsigned MachineFramePtr = + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false) + : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); - - switch (RetOpcode) { - default: - llvm_unreachable("Can only insert epilog into returning blocks"); - case X86::RETQ: - case X86::RETL: - case X86::RETIL: - case X86::RETIQ: - case X86::TCRETURNdi: - case X86::TCRETURNri: - case X86::TCRETURNmi: - case X86::TCRETURNdi64: - case X86::TCRETURNri64: - case X86::TCRETURNmi64: - case X86::EH_RETURN: - case X86::EH_RETURN64: - break; // These are ok - } + bool UseLEAForSP = canUseLEAForSPInEpilogue(MF); + // If we can use LEA for SP but we shouldn't, check that none + // of the terminators uses the eflags. Otherwise we will insert + // a ADD that will redefine the eflags and break the condition. + // Alternatively, we could move the ADD, but this may not be possible + // and is an optimization anyway. + if (UseLEAForSP && !MF.getSubtarget<X86Subtarget>().useLeaForSP()) + UseLEAForSP = terminatorsNeedFlagsAsInput(MBB); + // If that assert breaks, that means we do not do the right thing + // in canUseAsEpilogue. + assert((UseLEAForSP || !terminatorsNeedFlagsAsInput(MBB)) && + "We shouldn't have allowed this insertion point"); // Get the number of bytes to allocate from the FrameInfo. 
uint64_t StackSize = MFI->getStackSize(); - uint64_t MaxAlign = MFI->getMaxAlignment(); + uint64_t MaxAlign = calculateMaxStackAlign(MF); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; - // If we're forcing a stack realignment we can't rely on just the frame - // info, we need to know the ABI stack alignment as well in case we - // have a call out. Otherwise just make sure we have some alignment - we'll - // go with the minimum. - if (ForceStackAlign) { - if (MFI->hasCalls()) - MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; - else - MaxAlign = MaxAlign ? MaxAlign : 4; - } - if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) { - // Callee-saved registers were pushed on stack before the stack - // was realigned. - FrameSize -= CSSize; - NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; - } else { - NumBytes = FrameSize - CSSize; - } + NumBytes = FrameSize - CSSize; + + // Callee-saved registers were pushed on stack before the stack was + // realigned. + if (RegInfo->needsStackRealignment(MF) && !IsWinEH) + NumBytes = RoundUpToAlignment(FrameSize, MaxAlign); // Pop EBP. BuildMI(MBB, MBBI, DL, @@ -1018,6 +1057,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } else { NumBytes = StackSize - CSSize; } + uint64_t SEHStackAllocAmt = NumBytes; // Skip the callee-saved pop instructions. while (MBBI != MBB.begin()) { @@ -1032,7 +1072,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } MachineBasicBlock::iterator FirstCSPop = MBBI; - DL = MBBI->getDebugLoc(); + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. @@ -1045,10 +1086,20 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { if (RegInfo->needsStackRealignment(MF)) MBBI = FirstCSPop; - if (CSSize != 0) { + unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); + uint64_t LEAAmount = IsWinEH ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; + + // There are only two legal forms of epilogue: + // - add SEHAllocationSize, %rsp + // - lea SEHAllocationSize(%FramePtr), %rsp + // + // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence. + // However, we may use this sequence if we have a frame pointer because the + // effects of the prologue can safely be undone. + if (LEAAmount != 0) { unsigned Opc = getLEArOpcode(Uses64BitFramePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), - FramePtr, false, -CSSize); + FramePtr, false, LEAAmount); --MBBI; } else { unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); @@ -1058,8 +1109,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, - TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, + UseLEAForSP, TII, *RegInfo); --MBBI; } @@ -1072,101 +1123,66 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (NeedsWinEH) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); - // We're returning from function via eh_return. 
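In the rewritten epilogue above, when the frame was realigned or has variable-sized objects, SP is recovered from the frame pointer with a single LEA (or MOV). Under Win64 the frame pointer was established SEHFrameOffset bytes above the bottom of the fixed allocation, so the distance back is SEHStackAllocAmt - SEHFrameOffset; elsewhere it is -CSSize, stepping back over the callee-save area. A small worked model, reusing the calculateSetFPREG arithmetic shown earlier and treating all sizes as plain, made-up integers:

#include <algorithm>
#include <cstdint>
#include <cstdio>

uint64_t calculateSetFPREG(uint64_t SPAdjust) {
  return std::min(SPAdjust, uint64_t(128)) & ~uint64_t(15);
}

int64_t epilogueLEAAmount(bool IsWinEH, uint64_t SEHStackAllocAmt,
                          uint64_t CSSize) {
  uint64_t SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt);
  return IsWinEH ? int64_t(SEHStackAllocAmt - SEHFrameOffset)
                 : -int64_t(CSSize);
}

int main() {
  // Hypothetical frame: 0x200 bytes allocated, 0x18 bytes of callee saves.
  std::printf("win64 lea amount: %lld\n",
              (long long)epilogueLEAAmount(true, 0x200, 0x18));  // 0x200 - 128
  std::printf("non-win64 lea amount: %lld\n",
              (long long)epilogueLEAAmount(false, 0x200, 0x18)); // -0x18
}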
- if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &DestAddr = MBBI->getOperand(0); - assert(DestAddr.isReg() && "Offset should be in register!"); - BuildMI(MBB, MBBI, DL, - TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), - StackPtr).addReg(DestAddr.getReg()); - } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || - RetOpcode == X86::TCRETURNmi || - RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 || - RetOpcode == X86::TCRETURNmi64) { - bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64; - // Tail call return: adjust the stack pointer and jump to callee. - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); - assert(StackAdjust.isImm() && "Expecting immediate value."); - - // Adjust stack pointer. - int StackAdj = StackAdjust.getImm(); - int MaxTCDelta = X86FI->getTCReturnAddrDelta(); - int Offset = 0; - assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); - - // Incoporate the retaddr area. - Offset = StackAdj-MaxTCDelta; - assert(Offset >= 0 && "Offset should never be negative"); - - if (Offset) { - // Check for possible merge with preceding ADD instruction. - Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, - UseLEA, TII, *RegInfo); - } - - // Jump to label or value in register. - if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi) - ? X86::TAILJMPd : X86::TAILJMPd64)); - if (JumpTarget.isGlobal()) - MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), - JumpTarget.getTargetFlags()); - else { - assert(JumpTarget.isSymbol()); - MIB.addExternalSymbol(JumpTarget.getSymbolName(), - JumpTarget.getTargetFlags()); - } - } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi) - ? X86::TAILJMPm : X86::TAILJMPm64)); - for (unsigned i = 0; i != 5; ++i) - MIB.addOperand(MBBI->getOperand(i)); - } else if (RetOpcode == X86::TCRETURNri64) { - BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)). - addReg(JumpTarget.getReg(), RegState::Kill); - } else { - BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). - addReg(JumpTarget.getReg(), RegState::Kill); - } - - MachineInstr *NewMI = std::prev(MBBI); - NewMI->copyImplicitOps(MF, MBBI); - - // Delete the pseudo instruction TCRETURN. - MBB.erase(MBBI); - } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL || - RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) && - (X86FI->getTCReturnAddrDelta() < 0)) { - // Add the return addr area delta back since we are not tail calling. - int delta = -1*X86FI->getTCReturnAddrDelta(); - MBBI = MBB.getLastNonDebugInstr(); + // Add the return addr area delta back since we are not tail calling. + int Offset = -1 * X86FI->getTCReturnAddrDelta(); + assert(Offset >= 0 && "TCDelta should never be positive"); + if (Offset) { + MBBI = MBB.getFirstTerminator(); // Check for possible merge with preceding ADD instruction. 
- delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII, - *RegInfo); + Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, + UseLEAForSP, TII, *RegInfo); } } int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Offset will hold the offset from the stack pointer at function entry to the + // object. + // We need to factor in additional offsets applied during the prologue to the + // frame, base, and stack pointer depending on which is used. int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI->getStackSize(); + unsigned SlotSize = RegInfo->getSlotSize(); + bool HasFP = hasFP(MF); + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + int64_t FPDelta = 0; + + if (IsWinEH) { + assert(!MFI->hasCalls() || (StackSize % 16) == 8); + + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + uint64_t NumBytes = FrameSize - CSSize; + + uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes); + if (FI && FI == X86FI->getFAIndex()) + return -SEHFrameOffset; + + // FPDelta is the offset from the "traditional" FP location of the old base + // pointer followed by return address and the location required by the + // restricted Win64 prologue. + // Add FPDelta to all offsets below that go through the frame pointer. + FPDelta = FrameSize - SEHFrameOffset; + assert((!MFI->hasCalls() || (FPDelta % 16) == 0) && + "FPDelta isn't aligned per the Win64 ABI!"); + } + if (RegInfo->hasBasePointer(MF)) { - assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!"); + assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. - return Offset + RegInfo->getSlotSize(); + return Offset + SlotSize + FPDelta; } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; @@ -1174,33 +1190,32 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, } else if (RegInfo->needsStackRealignment(MF)) { if (FI < 0) { // Skip the saved EBP. - return Offset + RegInfo->getSlotSize(); + return Offset + SlotSize + FPDelta; } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; } // FIXME: Support tail calls } else { - if (!hasFP(MF)) + if (!HasFP) return Offset + StackSize; // Skip the saved EBP. 
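The Win64 branch of getFrameIndexOffset above introduces FPDelta: the frame pointer now sits only SEHFrameOffset bytes above the bottom of the allocation rather than at the traditional slot just below the return address, so every frame-pointer-relative offset must be shifted by the difference. The numbers below walk through that arithmetic for a hypothetical frame; the sizes are invented, only the formulas come from the patch.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical Win64 frame.
  uint64_t StackSize = 0x188; // total frame; keeps (StackSize % 16) == 8
  uint64_t SlotSize  = 8;     // width of the saved-FP slot
  uint64_t CSSize    = 0x10;  // callee-saved GPR area

  uint64_t FrameSize = StackSize - SlotSize;             // 0x180
  uint64_t NumBytes  = FrameSize - CSSize;               // 0x170
  uint64_t SEHFrameOffset =
      std::min(NumBytes, uint64_t(128)) & ~uint64_t(15); // 0x80
  int64_t FPDelta = int64_t(FrameSize - SEHFrameOffset); // 0x100, 16-byte aligned

  std::printf("FPDelta = %lld (added to FP-relative offsets below)\n",
              (long long)FPDelta);
  return 0;
}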
- Offset += RegInfo->getSlotSize(); + Offset += SlotSize; // Skip the RETADDR move area - const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta; } - return Offset; + return Offset + FPDelta; } int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. @@ -1221,7 +1236,7 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F { #ifndef NDEBUG const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); // Note: LLVM arranges the stack as: // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) // > "Stack Slots" (<--SP) @@ -1275,11 +1290,11 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F return Offset + StackSize; } // Simplified from getFrameIndexReference keeping only StackPointer cases -int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo()); - + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); FrameReg = RegInfo->getStackRegister(); @@ -1291,7 +1306,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( std::vector<CalleeSavedInfo> &CSI) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1358,8 +1373,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; @@ -1379,8 +1394,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // It can be done by spilling XMMs to stack frame. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (X86::GR64RegClass.contains(Reg) || - X86::GR32RegClass.contains(Reg)) + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. 
MBB.addLiveIn(Reg); @@ -1406,8 +1420,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -1438,7 +1452,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + MF.getSubtarget<X86Subtarget>().getRegisterInfo(); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1515,13 +1529,12 @@ GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Pr // limit. static const uint64_t kSplitStackAvailable = 256; -void -X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { - MachineBasicBlock &prologueMBB = MF.front(); +void X86FrameLowering::adjustForSegmentedStacks( + MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); uint64_t StackSize; - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); const bool IsLP64 = STI.isTarget64BitLP64(); unsigned TlsReg, TlsOffset; @@ -1559,8 +1572,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. - for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(), - e = prologueMBB.livein_end(); i != e; i++) { + for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), + e = PrologueMBB.livein_end(); + i != e; i++) { allocMBB->addLiveIn(*i); checkMBB->addLiveIn(*i); } @@ -1674,7 +1688,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. - BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB); + BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. 
@@ -1741,10 +1755,10 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { else BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); - allocMBB->addSuccessor(&prologueMBB); + allocMBB->addSuccessor(&PrologueMBB); checkMBB->addSuccessor(allocMBB); - checkMBB->addSuccessor(&prologueMBB); + checkMBB->addSuccessor(&PrologueMBB); #ifdef XDEBUG MF.verify(); @@ -1766,13 +1780,12 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// call inc_stack # doubles the stack space /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart -void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); +void X86FrameLowering::adjustForHiPEPrologue( + MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const unsigned SlotSize = - static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()) - ->getSlotSize(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const unsigned SlotSize = STI.getRegisterInfo()->getSlotSize(); const bool Is64Bit = STI.is64Bit(); const bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL; @@ -1837,12 +1850,12 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { // If the stack frame needed is larger than the guaranteed then runtime checks // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. if (MaxStack > Guaranteed) { - MachineBasicBlock &prologueMBB = MF.front(); MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(), - E = prologueMBB.livein_end(); I != E; I++) { + for (MachineBasicBlock::livein_iterator I = PrologueMBB.livein_begin(), + E = PrologueMBB.livein_end(); + I != E; I++) { stackCheckMBB->addLiveIn(*I); incStackMBB->addLiveIn(*I); } @@ -1878,7 +1891,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB); + BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). 
@@ -1889,9 +1902,9 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); - stackCheckMBB->addSuccessor(&prologueMBB, 99); + stackCheckMBB->addSuccessor(&PrologueMBB, 99); stackCheckMBB->addSuccessor(incStackMBB, 1); - incStackMBB->addSuccessor(&prologueMBB, 99); + incStackMBB->addSuccessor(&PrologueMBB, 99); incStackMBB->addSuccessor(incStackMBB, 1); } #ifdef XDEBUG @@ -1902,14 +1915,13 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const X86RegisterInfo &RegInfo = *STI.getRegisterInfo(); unsigned StackPtr = RegInfo.getStackRegister(); bool reserveCallFrame = hasReservedCallFrame(MF); - int Opcode = I->getOpcode(); + unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); - const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; @@ -1926,11 +1938,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment(); - Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; + unsigned StackAlign = getStackAlignment(); + Amount = RoundUpToAlignment(Amount, StackAlign); MachineInstr *New = nullptr; @@ -1983,3 +1992,15 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, } } +bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + assert(MBB.getParent() && "Block is not attached to a function!"); + + if (canUseLEAForSPInEpilogue(*MBB.getParent())) + return true; + + // If we cannot use LEA to adjust SP, we may need to use ADD, which + // clobbers the EFLAGS. Check that none of the terminators reads the + // EFLAGS, and if one uses it, conservatively assume this is not + // safe to insert the epilogue here. + return !terminatorsNeedFlagsAsInput(MBB); +} diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 448a365..5d03b4d 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -18,10 +18,6 @@ namespace llvm { -class MCSymbol; -class X86TargetMachine; -class X86Subtarget; - class X86FrameLowering : public TargetFrameLowering { public: explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) @@ -39,12 +35,14 @@ public: /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
- void emitPrologue(MachineFunction &MF) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void adjustForSegmentedStacks(MachineFunction &MF) const override; + void adjustForSegmentedStacks(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; - void adjustForHiPEPrologue(MachineFunction &MF) const override; + void adjustForHiPEPrologue(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = nullptr) const override; @@ -81,6 +79,33 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + /// Check the instruction before/after the passed instruction. If + /// it is an ADD/SUB/LEA instruction it is deleted argument and the + /// stack adjustment is returned as a positive value for ADD/LEA and + /// a negative for SUB. + static int mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, bool doMergeWithPrevious); + + /// Emit a series of instructions to increment / decrement the stack + /// pointer by a constant value. + static void emitSPUpdate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, unsigned StackPtr, + int64_t NumBytes, bool Is64BitTarget, + bool Is64BitStackPtr, bool UseLEA, + const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI); + + /// Check that LEA can be used on SP in an epilogue sequence for \p MF. + bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const; + + /// Check whether or not the given \p MBB can be used as a epilogue + /// for the target. + /// The epilogue will be inserted before the first terminator of that block. + /// This method is used by the shrink-wrapping pass to decide if + /// \p MBB will be correctly handled by the target. + bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; + private: /// convertArgMovsToPushes - This method tries to convert a call sequence /// that uses sub and mov instructions to put the argument onto the stack diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 7fe8e8b..de59109 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -156,9 +156,7 @@ namespace { public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), - Subtarget(&tm.getSubtarget<X86Subtarget>()), - OptForSize(false) {} + : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} const char *getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -166,7 +164,7 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. 
- Subtarget = &TM.getSubtarget<X86Subtarget>(); + Subtarget = &MF.getSubtarget<X86Subtarget>(); SelectionDAGISel::runOnMachineFunction(MF); return true; } @@ -206,6 +204,9 @@ namespace { bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + bool SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); bool SelectMOV64Imm32(SDValue N, SDValue &Imm); bool SelectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -230,19 +231,20 @@ namespace { /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, + unsigned ConstraintID, std::vector<SDValue> &OutOps) override; - void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI); + void EmitSpecialCodeForMain(); - inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, - SDValue &Scale, SDValue &Index, - SDValue &Disp, SDValue &Segment) { + inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI->getPointerTy()) : AM.Base_Reg; - Scale = getI8Imm(AM.Scale); + Scale = getI8Imm(AM.Scale, DL); Index = AM.IndexReg; // These are 32-bit even in 64-bit mode since RIP relative offset // is 32-bit. @@ -263,7 +265,7 @@ namespace { Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, AM.SymbolFlags); else - Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32); + Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); if (AM.Segment.getNode()) Segment = AM.Segment; @@ -273,14 +275,14 @@ namespace { /// getI8Imm - Return a target constant with the specified value, of type /// i8. - inline SDValue getI8Imm(unsigned Imm) { - return CurDAG->getTargetConstant(Imm, MVT::i8); + inline SDValue getI8Imm(unsigned Imm, SDLoc DL) { + return CurDAG->getTargetConstant(Imm, DL, MVT::i8); } /// getI32Imm - Return a target constant with the specified value, of type /// i32. - inline SDValue getI32Imm(unsigned Imm) { - return CurDAG->getTargetConstant(Imm, MVT::i32); + inline SDValue getI32Imm(unsigned Imm, SDLoc DL) { + return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } /// getGlobalBaseReg - Return an SDNode that returns the value of @@ -298,7 +300,7 @@ namespace { /// getInstrInfo - Return a reference to the TargetInstrInfo, casted /// to the target-specific type. 
const X86InstrInfo *getInstrInfo() const { - return getTargetMachine().getSubtargetImpl()->getInstrInfo(); + return Subtarget->getInstrInfo(); } /// \brief Address-mode matching performs shift-of-and to and-of-shift @@ -395,17 +397,14 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, Ops.clear(); Ops.push_back(NewChain); } - for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i) - Ops.push_back(OrigChain.getOperand(i)); + Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); - unsigned NumOps = Call.getNode()->getNumOperands(); Ops.clear(); Ops.push_back(SDValue(Load.getNode(), 1)); - for (unsigned i = 1, e = NumOps; i != e; ++i) - Ops.push_back(Call.getOperand(i)); + Ops.append(Call->op_begin() + 1, Call->op_end()); CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } @@ -453,8 +452,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { void X86DAGToDAGISel::PreprocessISelDAG() { // OptForSize is used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + OptForSize = MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { @@ -571,14 +569,18 @@ void X86DAGToDAGISel::PreprocessISelDAG() { /// EmitSpecialCodeForMain - Emit any code that needs to be executed only in /// the main function. -void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, - MachineFrameInfo *MFI) { - const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo(); +void X86DAGToDAGISel::EmitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { - unsigned CallOp = - Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32; - BuildMI(BB, DebugLoc(), - TII->get(CallOp)).addExternalSymbol("__main"); + TargetLowering::ArgListTy Args; + + TargetLowering::CallLoweringInfo CLI(*CurDAG); + CLI.setChain(CurDAG->getRoot()) + .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), + CurDAG->getExternalSymbol("__main", TLI->getPointerTy()), + std::move(Args), 0); + const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); + std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); + CurDAG->setRoot(Result.second); } } @@ -586,7 +588,7 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() { // If this is main, emit special code for main. if (const Function *Fn = MF->getFunction()) if (Fn->hasExternalLinkage() && Fn->getName() == "main") - EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo()); + EmitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { @@ -601,6 +603,9 @@ static bool isDispSafeForFrameIndex(int64_t Val) { bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { + // Cannot combine ExternalSymbol displacements with integer offsets. 
+ if (Offset != 0 && AM.ES) + return true; int64_t Val = AM.Disp + Offset; CodeModel::Model M = TM.getCodeModel(); if (Subtarget->is64Bit()) { @@ -803,11 +808,11 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, MVT VT = N.getSimpleValueType(); SDLoc DL(N); - SDValue Eight = DAG.getConstant(8, MVT::i8); - SDValue NewMask = DAG.getConstant(0xff, VT); + SDValue Eight = DAG.getConstant(8, DL, MVT::i8); + SDValue NewMask = DAG.getConstant(0xff, DL, VT); SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); - SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8); + SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); // Insert the new nodes into the topological ordering. We must do this in @@ -851,7 +856,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, MVT VT = N.getSimpleValueType(); SDLoc DL(N); - SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT); + SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); @@ -918,7 +923,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; // We also need to ensure that mask is a continuous run of bits. - if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; + if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; // Scale the leading zero count down based on the actual size of the value. // Also scale it down based on the size of the shift. @@ -957,9 +962,9 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, X = NewX; } SDLoc DL(N); - SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8); + SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); - SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8); + SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); // Insert the new nodes into the topological ordering. We must do this in @@ -1006,6 +1011,17 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, switch (N.getOpcode()) { default: break; + case ISD::FRAME_ALLOC_RECOVER: { + if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) + if (const auto *ESNode = dyn_cast<ExternalSymbolSDNode>(N.getOperand(0))) + if (ESNode->getOpcode() == ISD::TargetExternalSymbol) { + // Use the symbol and don't prefix it. + AM.ES = ESNode->getSymbol(); + AM.SymbolFlags = X86II::MO_NOPREFIX; + return false; + } + break; + } case ISD::Constant: { uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); if (!FoldOffsetIntoAddress(Val, AM)) @@ -1191,7 +1207,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } // Ok, the transformation is legal and appears profitable. Go for it. 
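FoldMaskAndShiftToScale above now verifies that the AND mask is a single contiguous run of bits via countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ == 64. The same check can be restated with the C++20 <bit> primitives instead of LLVM's MathExtras; the 64 is simply the bit width of the mask.

#include <bit>
#include <cassert>
#include <cstdint>

// A 64-bit mask is one contiguous run of set bits exactly when the run of
// ones sitting above the trailing zeros, plus the trailing and leading
// zeros, accounts for all 64 bit positions.
bool isContiguousMask(uint64_t Mask) {
  if (Mask == 0)
    return false;
  unsigned MaskTZ = std::countr_zero(Mask);
  unsigned MaskLZ = std::countl_zero(Mask);
  return std::countr_one(Mask >> MaskTZ) + MaskTZ + MaskLZ == 64;
}

int main() {
  assert(isContiguousMask(0x0000000000FF0000));  // bits 16..23: one run
  assert(!isContiguousMask(0x0000000000FF00F0)); // two separate runs
  return 0;
}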
- SDValue Zero = CurDAG->getConstant(0, N.getValueType()); + SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType()); SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS); AM.IndexReg = Neg; AM.Scale = 1; @@ -1309,6 +1325,42 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } +bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + + MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent); + if (!Mgs) + return false; + X86ISelAddressMode AM; + unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace(); + // AddrSpace 256 -> GS, 257 -> FS. + if (AddrSpace == 256) + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + if (AddrSpace == 257) + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + + SDLoc DL(N); + Base = Mgs->getBasePtr(); + Index = Mgs->getIndex(); + unsigned ScalarSize = Mgs->getValue().getValueType().getScalarSizeInBits(); + Scale = getI8Imm(ScalarSize/8, DL); + + // If Base is 0, the whole address is in index and the Scale is 1 + if (isa<ConstantSDNode>(Base)) { + assert(dyn_cast<ConstantSDNode>(Base)->isNullValue() && + "Unexpected base in gather/scatter"); + Scale = getI8Imm(1, DL); + Base = CurDAG->getRegister(0, MVT::i32); + } + if (AM.Segment.getNode()) + Segment = AM.Segment; + else + Segment = CurDAG->getRegister(0, MVT::i32); + Disp = CurDAG->getTargetConstant(0, DL, MVT::i32); + return true; +} + /// SelectAddr - returns true if it is able pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. @@ -1350,7 +1402,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, if (!AM.IndexReg.getNode()) AM.IndexReg = CurDAG->getRegister(0, VT); - getAddressOperands(AM, Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); return true; } @@ -1406,7 +1458,7 @@ bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { if ((uint32_t)ImmVal != (uint64_t)ImmVal) return false; - Imm = CurDAG->getTargetConstant(ImmVal, MVT::i64); + Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64); return true; } @@ -1442,9 +1494,9 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, // Base could already be %rip, particularly in the x32 ABI. 
Base = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, - CurDAG->getTargetConstant(0, MVT::i64), + CurDAG->getTargetConstant(0, DL, MVT::i64), Base, - CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)), 0); } @@ -1456,9 +1508,10 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, "Expect to be extending 32-bit registers for use in LEA"); Index = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, - CurDAG->getTargetConstant(0, MVT::i64), + CurDAG->getTargetConstant(0, DL, MVT::i64), Index, - CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + CurDAG->getTargetConstant(X86::sub_32bit, DL, + MVT::i32)), 0); } @@ -1524,7 +1577,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, if (Complexity <= 2) return false; - getAddressOperands(AM, Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); return true; } @@ -1548,7 +1601,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, AM.IndexReg = CurDAG->getRegister(0, MVT::i64); } - getAddressOperands(AM, Base, Scale, Index, Disp, Segment); + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); return true; } @@ -1718,7 +1771,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, // an immediate operand to sub. However, it still fits in 32 bits for the // add (since it is not negated) so we can return target-constant. if (CNVal == INT32_MIN) - return CurDAG->getTargetConstant(CNVal, NVT); + return CurDAG->getTargetConstant(CNVal, dl, NVT); // For atomic-load-add, we could do some optimizations. if (Op == ADD) { // Translate to INC/DEC if ADD by 1 or -1. @@ -1733,7 +1786,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, CNVal = -CNVal; } } - return CurDAG->getTargetConstant(CNVal, NVT); + return CurDAG->getTargetConstant(CNVal, dl, NVT); } // If the value operand is single-used, try to optimize it. @@ -2046,12 +2099,14 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), MVT::Other); + SDLoc DL(Node); + // Memory Operands: Base, Scale, Index, Disp, Segment - SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32); + SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i32); - const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx, + const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx, Disp, Segment, VMask, Chain}; - SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), VTs, Ops); + SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); // Node has 2 outputs: VDst and MVT::Other. // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. // We replace VDst of Node with VDst of ResNode, and Other of Node with Other @@ -2180,7 +2235,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) break; - unsigned ShlOp, Op; + unsigned ShlOp, AddOp, Op; MVT CstVT = NVT; // Check the minimum bitwidth for the new constant. 
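The trailing context above declares AddOp, and the next hunk uses it to select ADD32rr/ADD64rr when the reconstructed shift amount is 1, since x << 1 is simply x + x and the register-register add needs no immediate. The shrink itself rewrites (x << s) op C as ((x op (C >> s)) << s); that is only an identity for OR and XOR when the low s bits of C are zero, while AND is unaffected because (x << s) already has those bits clear. A small plain-C++ check of both facts, using illustrative values rather than anything taken from real codegen.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t X = 0x1234, C = 0x45600, S = 8;
  const uint64_t RemovedBitsMask = (1ULL << S) - 1;
  assert((C & RemovedBitsMask) == 0);                 // guard needed for OR/XOR

  assert(((X << S) | C) == ((X | (C >> S)) << S));    // OR identity
  assert(((X << S) ^ C) == ((X ^ (C >> S)) << S));    // XOR identity
  // AND survives even when the constant has low bits set, because (X << S)
  // has zeros there anyway.
  assert(((X << S) & 0x456ffULL) == ((X & (0x456ffULL >> S)) << S));

  // The ShlVal == 1 special case added below: a shift left by one is x + x,
  // so ADD{32,64}rr can stand in for SHL{32,64}ri.
  assert((X << 1) == X + X);
  return 0;
}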
@@ -2201,6 +2256,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case MVT::i32: assert(CstVT == MVT::i8); ShlOp = X86::SHL32ri; + AddOp = X86::ADD32rr; switch (Opcode) { default: llvm_unreachable("Impossible opcode"); @@ -2212,6 +2268,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case MVT::i64: assert(CstVT == MVT::i8 || CstVT == MVT::i32); ShlOp = X86::SHL64ri; + AddOp = X86::ADD64rr; switch (Opcode) { default: llvm_unreachable("Impossible opcode"); @@ -2223,10 +2280,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } // Emit the smaller op and the shift. - SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT); + SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT); SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + if (ShlVal == 1) + return CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0), + SDValue(New, 0)); return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), - getI8Imm(ShlVal)); + getI8Imm(ShlVal, dl)); } case X86ISD::UMUL8: case X86ISD::SMUL8: { @@ -2390,7 +2450,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Shift AX down 8 bits. Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, Result, - CurDAG->getTargetConstant(8, MVT::i8)), 0); + CurDAG->getTargetConstant(8, dl, MVT::i8)), + 0); // Then truncate it down to i8. ReplaceUses(SDValue(Node, 1), CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); @@ -2510,7 +2571,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { ClrNode = SDValue(CurDAG->getMachineNode( TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, - CurDAG->getTargetConstant(X86::sub_16bit, MVT::i32)), + CurDAG->getTargetConstant(X86::sub_16bit, dl, + MVT::i32)), 0); break; case MVT::i32: @@ -2519,8 +2581,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { ClrNode = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, - CurDAG->getTargetConstant(0, MVT::i64), ClrNode, - CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, + CurDAG->getTargetConstant(X86::sub_32bit, dl, + MVT::i32)), 0); break; default: @@ -2572,8 +2635,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { Result = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, - CurDAG->getTargetConstant(0, MVT::i64), Result, - CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + CurDAG->getTargetConstant(0, dl, MVT::i64), Result, + CurDAG->getTargetConstant(X86::sub_32bit, dl, + MVT::i32)), 0); } } else { @@ -2612,26 +2676,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - HasNoSignedComparisonUses(Node)) { - // Look for (X86cmp (truncate $op, i1), 0) and try to convert to a - // smaller encoding - if (Opcode == X86ISD::CMP && N0.getValueType() == MVT::i1 && - X86::isZeroNode(N1)) { - SDValue Reg = N0.getOperand(0); - SDValue Imm = CurDAG->getTargetConstant(1, MVT::i8); - - // Emit testb - if (Reg.getScalarValueSizeInBits() > 8) - Reg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Reg); - // Emit a testb. - SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, - Reg, Imm); - ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); - return nullptr; - } - + HasNoSignedComparisonUses(Node)) N0 = N0.getOperand(0); - } + // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. 
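The hunks that follow shrink (X86cmp (and $op, $imm), 0) into a narrower TEST whenever the mask fits in the low 8, 16 or 32 bits, with the caveat that a set sign bit in the narrowed immediate is only acceptable when no signed comparison consumes the flags (HasNoSignedComparisonUses in the surrounding code). A plain-C++ sketch of just those two predicates; fitsLow8 and sign8Safe are illustrative names, not helpers from this patch.

#include <cstdint>
#include <cstdio>

// Mask must fit in the low 8 bits for TEST8ri (the 16/32-bit forms are analogous).
static bool fitsLow8(uint64_t C) { return (C & ~UINT64_C(0xff)) == 0; }
// If bit 7 is set, the 8-bit TEST computes SF from bit 7 rather than from the
// full-width sign bit, so that form is only used when no signed comparison
// reads the flags.
static bool sign8Safe(uint64_t C, bool HasNoSignedUses) {
  return !(C & 0x80) || HasNoSignedUses;
}

int main() {
  std::printf("0x7f : fits=%d signSafe=%d\n", fitsLow8(0x7f), sign8Safe(0x7f, false));
  std::printf("0x80 : fits=%d signSafe=%d\n", fitsLow8(0x80), sign8Safe(0x80, false));
  std::printf("0x1ff: fits=%d (needs the 16-bit form)\n", fitsLow8(0x1ff));
  return 0;
}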
@@ -2647,7 +2694,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && (!(C->getZExtValue() & 0x80) || HasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Reg = N0.getNode()->getOperand(0); // On x86-32, only the ABCD registers have 8-bit subregisters. @@ -2658,7 +2705,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; default: llvm_unreachable("Unsupported TEST operand type!"); } - SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, Reg.getValueType(), Reg, RC), 0); } @@ -2683,7 +2730,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { HasNoSignedComparisonUses(Node))) { // Shift the immediate right by 8 bits. SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, - MVT::i8); + dl, MVT::i8); SDValue Reg = N0.getNode()->getOperand(0); // Put the value in an ABCD register. @@ -2694,7 +2741,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; default: llvm_unreachable("Unsupported TEST operand type!"); } - SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, Reg.getValueType(), Reg, RC), 0); @@ -2719,7 +2766,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { N0.getValueType() != MVT::i16 && (!(C->getZExtValue() & 0x8000) || HasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16); + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, + MVT::i16); SDValue Reg = N0.getNode()->getOperand(0); // Extract the 16-bit subregister. @@ -2741,7 +2789,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { N0.getValueType() == MVT::i64 && (!(C->getZExtValue() & 0x80000000) || HasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, + MVT::i32); SDValue Reg = N0.getNode()->getOperand(0); // Extract the 32-bit subregister. @@ -2824,14 +2873,20 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } bool X86DAGToDAGISel:: -SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { SDValue Op0, Op1, Op2, Op3, Op4; - switch (ConstraintCode) { - case 'o': // offsetable ?? - case 'v': // not offsetable ?? - default: return true; - case 'm': // memory + switch (ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_i: + // FIXME: It seems strange that 'i' is needed here since it's supposed to + // be an immediate and not a memory constraint. + // Fallthrough. + case InlineAsm::Constraint_o: // offsetable ?? + case InlineAsm::Constraint_v: // not offsetable ?? 
+ case InlineAsm::Constraint_m: // memory + case InlineAsm::Constraint_X: if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 85978d8..3ba8115 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25,7 +25,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -33,6 +32,7 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" @@ -67,18 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); -static cl::opt<bool> ExperimentalVectorShuffleLowering( - "x86-experimental-vector-shuffle-lowering", cl::init(true), - cl::desc("Enable an experimental vector shuffle lowering code path."), - cl::Hidden); - -static cl::opt<bool> ExperimentalVectorShuffleLegality( - "x86-experimental-vector-shuffle-legality", cl::init(false), - cl::desc("Enable experimental shuffle legality based on the experimental " - "shuffle lowering. Should only be used with the experimental " - "shuffle lowering."), - cl::Hidden); - static cl::opt<int> ReciprocalEstimateRefinementSteps( "x86-recip-refinement-steps", cl::init(1), cl::desc("Specify the number of Newton-Raphson iterations applied to the " @@ -89,147 +77,13 @@ static cl::opt<int> ReciprocalEstimateRefinementSteps( static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); -static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl, - unsigned vectorWidth) { - assert((vectorWidth == 128 || vectorWidth == 256) && - "Unsupported vector width"); - EVT VT = Vec.getValueType(); - EVT ElVT = VT.getVectorElementType(); - unsigned Factor = VT.getSizeInBits()/vectorWidth; - EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, - VT.getVectorNumElements()/Factor); - - // Extract from UNDEF is UNDEF. - if (Vec.getOpcode() == ISD::UNDEF) - return DAG.getUNDEF(ResultVT); - - // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR - unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); - - // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) - * ElemsPerChunk); - - // If the input is a buildvector just emit a smaller one. - if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + NormalizedIdxVal, - ElemsPerChunk)); - - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); -} - -/// Generate a DAG to grab 128-bits from a vector > 128 bits. This -/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 -/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 -/// instructions or a simple subregister reference. Idx is an index in the -/// 128 bits we want. It need not be aligned to a 128-bit boundary. 
That makes -/// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert((Vec.getValueType().is256BitVector() || - Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); - return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); -} - -/// Generate a DAG to grab 256-bits from a 512-bit vector. -static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); - return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); -} - -static SDValue InsertSubVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl, unsigned vectorWidth) { - assert((vectorWidth == 128 || vectorWidth == 256) && - "Unsupported vector width"); - // Inserting UNDEF is Result - if (Vec.getOpcode() == ISD::UNDEF) - return Result; - EVT VT = Vec.getValueType(); - EVT ElVT = VT.getVectorElementType(); - EVT ResultVT = Result.getValueType(); - - // Insert the relevant vectorWidth bits. - unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); - - // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) - * ElemsPerChunk); - - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); -} - -/// Generate a DAG to put 128-bits into a vector > 128 bits. This -/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or -/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a -/// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit boundary. That makes -/// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG,SDLoc dl) { - assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); - return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); -} - -static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, - SelectionDAG &DAG, SDLoc dl) { - assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); - return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); -} - -/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 -/// instructions. This is used because creating CONCAT_VECTOR nodes of -/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower -/// large BUILD_VECTORS. -static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - SDLoc dl) { - SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); - return Insert128BitVector(V, V2, NumElems/2, DAG, dl); -} - -static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - SDLoc dl) { - SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); - return Insert256BitVector(V, V2, NumElems/2, DAG, dl); -} - -// FIXME: This should stop caching the target machine as soon as -// we can remove resetOperationActions et al. 
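The Extract/InsertSubVector helpers deleted above both normalize an arbitrary element index down to the first element of the 128- or 256-bit chunk containing it before forming EXTRACT_SUBVECTOR/INSERT_SUBVECTOR. A standalone check of that arithmetic; normalizedIdx is an illustrative name, but the formula is the one from the removed code.

#include <cassert>

// Round an element index down to the first element of its vectorWidth-bit
// chunk, exactly as the removed NormalizedIdxVal computation did.
static unsigned normalizedIdx(unsigned IdxVal, unsigned EltBits, unsigned VectorWidth) {
  unsigned ElemsPerChunk = VectorWidth / EltBits;
  return ((IdxVal * EltBits) / VectorWidth) * ElemsPerChunk;
}

int main() {
  assert(normalizedIdx(5, 32, 128) == 4);   // v8i32: element 5 sits in lanes 4..7
  assert(normalizedIdx(3, 16, 128) == 0);   // v16i16: element 3 is in the first chunk
  assert(normalizedIdx(6, 64, 256) == 4);   // 256-bit chunks of four i64 lanes
  return 0;
}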
-X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM) - : TargetLowering(TM) { - Subtarget = &TM.getSubtarget<X86Subtarget>(); +X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); TD = getDataLayout(); - resetOperationActions(); -} - -void X86TargetLowering::resetOperationActions() { - const TargetMachine &TM = getTargetMachine(); - static bool FirstTimeThrough = true; - - // If none of the target options have changed, then we don't need to reset the - // operation actions. - if (!FirstTimeThrough && TO == TM.Options) return; - - if (!FirstTimeThrough) { - // Reinitialize the actions. - initActions(); - FirstTimeThrough = false; - } - - TO = TM.Options; - // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -247,8 +101,7 @@ void X86TargetLowering::resetOperationActions() { setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); - const X86RegisterInfo *RegInfo = - TM.getSubtarget<X86Subtarget>().getRegisterInfo(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2. @@ -330,7 +183,7 @@ void X86TargetLowering::resetOperationActions() { if (Subtarget->is64Bit()) { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); - } else if (!TM.Options.UseSoftFloat) { + } else if (!Subtarget->useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -344,7 +197,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - if (!TM.Options.UseSoftFloat) { + if (!Subtarget->useSoftFloat()) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); @@ -387,7 +240,7 @@ void X86TargetLowering::resetOperationActions() { if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); - } else if (!TM.Options.UseSoftFloat) { + } else if (!Subtarget->useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. @@ -515,7 +368,7 @@ void X86TargetLowering::resetOperationActions() { // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. - if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { + if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } @@ -660,7 +513,11 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); - if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { + // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
+ setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); + setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); + + if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); @@ -694,7 +551,7 @@ void X86TargetLowering::resetOperationActions() { // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps - } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { + } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); @@ -729,7 +586,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } - } else if (!TM.Options.UseSoftFloat) { + } else if (!Subtarget->useSoftFloat()) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); @@ -763,7 +620,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87. - if (!TM.Options.UseSoftFloat) { + if (!Subtarget->useSoftFloat()) { addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -893,49 +750,35 @@ void X86TargetLowering::resetOperationActions() { // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); + + // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are + // split/scalarized right now. + if (VT.getVectorElementType() == MVT::f16) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. - if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } // MMX-sized vectors (other than x86mmx) are expected to be expanded // into smaller operations. 
- setOperationAction(ISD::MULHS, MVT::v8i8, Expand); - setOperationAction(ISD::MULHS, MVT::v4i16, Expand); - setOperationAction(ISD::MULHS, MVT::v2i32, Expand); - setOperationAction(ISD::MULHS, MVT::v1i64, Expand); - setOperationAction(ISD::AND, MVT::v8i8, Expand); - setOperationAction(ISD::AND, MVT::v4i16, Expand); - setOperationAction(ISD::AND, MVT::v2i32, Expand); - setOperationAction(ISD::AND, MVT::v1i64, Expand); - setOperationAction(ISD::OR, MVT::v8i8, Expand); - setOperationAction(ISD::OR, MVT::v4i16, Expand); - setOperationAction(ISD::OR, MVT::v2i32, Expand); - setOperationAction(ISD::OR, MVT::v1i64, Expand); - setOperationAction(ISD::XOR, MVT::v8i8, Expand); - setOperationAction(ISD::XOR, MVT::v4i16, Expand); - setOperationAction(ISD::XOR, MVT::v2i32, Expand); - setOperationAction(ISD::XOR, MVT::v1i64, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); + for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { + setOperationAction(ISD::MULHS, MMXTy, Expand); + setOperationAction(ISD::AND, MMXTy, Expand); + setOperationAction(ISD::OR, MMXTy, Expand); + setOperationAction(ISD::XOR, MMXTy, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); + setOperationAction(ISD::SELECT, MMXTy, Expand); + setOperationAction(ISD::BITCAST, MMXTy, Expand); + } setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); - setOperationAction(ISD::SELECT, MVT::v8i8, Expand); - setOperationAction(ISD::SELECT, MVT::v4i16, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v1i64, Expand); - setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); - setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); - setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); - setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); - - if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { + + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, &X86::VR128RegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); @@ -948,12 +791,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } - if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM @@ -967,6 +811,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ADD, MVT::v8i16, Legal); setOperationAction(ISD::ADD, MVT::v4i32, Legal); setOperationAction(ISD::ADD, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); @@ -1016,6 +861,7 @@ void X86TargetLowering::resetOperationActions() { continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, 
VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } @@ -1039,6 +885,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); @@ -1094,39 +942,20 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } - if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); - - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FRINT, MVT::v4f32, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); - setOperationAction(ISD::FRINT, MVT::v2f64, Legal); - setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) { + for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { + setOperationAction(ISD::FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::FCEIL, RoundedTy, Legal); + setOperationAction(ISD::FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::FRINT, RoundedTy, Legal); + setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + } // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); - setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); - setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); - setOperationAction(ISD::VSELECT, MVT::v4i32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v8i16, Custom); - // There is no BLENDI for byte vectors. We don't need to custom lower - // some vselects for now. + // We directly match byte blends in the backend as they match the VSELECT + // condition form. 
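The comment just above refers to the VSELECT condition form that the byte-blend instructions already implement: each result byte is chosen by the top bit of the corresponding mask byte, which is exactly a v16i8 VSELECT whose conditions are sign-extended i1s. A tiny plain-C++ model of that per-byte selection, using 4 lanes instead of 16 for brevity.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint8_t, 4> A{1, 2, 3, 4}, B{9, 9, 9, 9};
  std::array<uint8_t, 4> M{0x80, 0x00, 0xff, 0x01};   // only bit 7 of each mask byte matters
  std::array<uint8_t, 4> R{};
  for (int i = 0; i < 4; ++i)
    R[i] = (M[i] & 0x80) ? A[i] : B[i];               // per-byte blend
  assert((R == std::array<uint8_t, 4>{1, 9, 3, 9}));
  return 0;
}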
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in @@ -1137,6 +966,21 @@ void X86TargetLowering::resetOperationActions() { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); } + // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); + // i8 and i16 vectors are custom because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional @@ -1160,6 +1004,10 @@ void X86TargetLowering::resetOperationActions() { } if (Subtarget->hasSSE2()) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); + setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); @@ -1180,7 +1028,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SRA, MVT::v4i32, Custom); } - if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); addRegisterClass(MVT::v8i32, &X86::VR256RegClass); @@ -1252,11 +1100,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - setOperationAction(ISD::VSELECT, MVT::v4f64, Custom); - setOperationAction(ISD::VSELECT, MVT::v4i64, Custom); - setOperationAction(ISD::VSELECT, MVT::v8i32, Custom); - setOperationAction(ISD::VSELECT, MVT::v8f32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); @@ -1293,16 +1136,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Legal); setOperationAction(ISD::MUL, MVT::v16i16, Legal); - // Don't lower v32i8 because there is no 128-bit byte mul + setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. 
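For reference on the RoundedTy loop above (and the 512-bit FFLOOR/FCEIL/FTRUNC/FRINT/FNEARBYINT additions further down): the five rounding nodes marked Legal differ only in rounding direction, which the SSE4.1 round instructions encode in an immediate. A quick standalone illustration of how they disagree on a halfway value.

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  std::fesetround(FE_TONEAREST);   // the default mode; rint/nearbyint follow it
  const double x = -1.5;
  std::printf("floor=%g ceil=%g trunc=%g rint=%g nearbyint=%g\n",
              std::floor(x), std::ceil(x), std::trunc(x),
              std::rint(x), std::nearbyint(x));
  // Prints: floor=-2 ceil=-1 trunc=-1 rint=-2 nearbyint=-2
  // (ties-to-even sends -1.5 to -2).
  return 0;
}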
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); @@ -1316,6 +1156,21 @@ void X86TargetLowering::resetOperationActions() { // Custom CTPOP always performs better on natively supported v8i32 setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); + + // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1330,7 +1185,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); - // Don't lower v32i8 because there is no 128-bit byte mul + setOperationAction(ISD::MUL, MVT::v32i8, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be @@ -1360,6 +1215,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); @@ -1367,6 +1223,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } + if (Subtarget->hasInt256()) + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
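The promotion announced by the comment above works because bitwise AND, OR and XOR never cross lane boundaries, so a v32i8/v16i16/v8i32 operation can be bitcast to v4i64, performed there, and bitcast back without changing a single bit. A standalone demonstration of that lane-agnostic property, eight bytes against one 64-bit word.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t A[8] = {0xf0, 0x0f, 1, 2, 3, 4, 5, 6};
  uint8_t B[8] = {0xff, 0xf0, 7, 7, 7, 7, 7, 7};

  uint8_t PerByte[8];
  for (int i = 0; i < 8; ++i)
    PerByte[i] = A[i] & B[i];        // the "v8i8" AND

  uint64_t WA, WB;
  std::memcpy(&WA, A, 8);            // the "bitcast" to i64
  std::memcpy(&WB, B, 8);
  uint64_t Wide = WA & WB;           // the promoted AND

  assert(std::memcmp(&Wide, PerByte, 8) == 0);   // identical bits either way
  return 0;
}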
for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { MVT VT = (MVT::SimpleValueType)i; @@ -1388,7 +1248,7 @@ void X86TargetLowering::resetOperationActions() { } } - if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); @@ -1401,11 +1261,27 @@ void X86TargetLowering::resetOperationActions() { for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); + setOperationAction(ISD::SUB, MVT::i1, Custom); + setOperationAction(ISD::ADD, MVT::i1, Custom); + setOperationAction(ISD::MUL, MVT::i1, Custom); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); @@ -1450,28 +1326,49 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + if (Subtarget->hasDQI()) { + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + if (Subtarget->hasDQI()) { + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, 
Custom); + } + setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); + setOperationAction(ISD::FRINT, MVT::v16f32, Legal); + setOperationAction(ISD::FRINT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); @@ -1488,6 +1385,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); + setOperationAction(ISD::SELECT, MVT::v16i1, Custom); + setOperationAction(ISD::SELECT, MVT::v8i1, Custom); setOperationAction(ISD::ADD, MVT::v8i64, Legal); setOperationAction(ISD::ADD, MVT::v16i32, Legal); @@ -1517,10 +1416,23 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); } - + if (Subtarget->hasDQI()) { + setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i64, Legal); + } // Custom lower several nodes. for (MVT VT : MVT::vector_valuetypes()) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize == 1) { + setOperationAction(ISD::AND, VT, Legal); + setOperationAction(ISD::OR, VT, Legal); + setOperationAction(ISD::XOR, VT, Legal); + } + if (EltSize >= 32 && VT.getSizeInBits() <= 512) { + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); + } // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. 
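The MGATHER/MSCATTER custom lowering enabled just above is fed by the new SelectVectorAddr earlier in this diff, which fills in the five memory operands: the scale is the element size in bytes, a constant-zero base is replaced by the zero register with scale forced to 1 (the whole address then rides in the index), and address spaces 256/257 pick the GS/FS segments. A hedged plain-C++ sketch of those choices; VectorAddr and selectVectorAddr are illustrative stand-ins, not LLVM types.

#include <cassert>
#include <string>

struct VectorAddr {
  unsigned Scale;
  bool BaseIsZeroReg;
  std::string Segment;
};

static VectorAddr selectVectorAddr(unsigned ScalarSizeBits, bool BaseIsConstantZero,
                                   unsigned AddrSpace) {
  VectorAddr A;
  A.Scale = ScalarSizeBits / 8;          // e.g. an i64 gather uses scale 8
  A.BaseIsZeroReg = BaseIsConstantZero;  // the index carries the whole address
  if (BaseIsConstantZero)
    A.Scale = 1;
  A.Segment = AddrSpace == 256 ? "gs" : AddrSpace == 257 ? "fs" : "none";
  return A;
}

int main() {
  assert(selectVectorAddr(64, false, 0).Scale == 8);
  assert(selectVectorAddr(32, true, 0).Scale == 1);
  assert(selectVectorAddr(32, false, 256).Segment == "gs");
  return 0;
}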
if (VT.is128BitVector() || VT.is256BitVector()) { @@ -1533,7 +1445,7 @@ void X86TargetLowering::resetOperationActions() { if (!VT.is512BitVector()) continue; - if ( EltSize >= 32) { + if (EltSize >= 32) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -1557,7 +1469,7 @@ void X86TargetLowering::resetOperationActions() { } }// has AVX-512 - if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); @@ -1573,6 +1485,24 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SUB, MVT::v32i16, Legal); setOperationAction(ISD::SUB, MVT::v64i8, Legal); setOperationAction(ISD::MUL, MVT::v32i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + setOperationAction(ISD::SELECT, MVT::v32i1, Custom); + setOperationAction(ISD::SELECT, MVT::v64i1, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); + setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); + setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { const MVT VT = (MVT::SimpleValueType)i; @@ -1590,13 +1520,20 @@ void X86TargetLowering::resetOperationActions() { } } - if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) { addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); + setOperationAction(ISD::SELECT, MVT::v4i1, Custom); + setOperationAction(ISD::SELECT, MVT::v2i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -1604,13 +1541,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::AND, MVT::v4i32, Legal); setOperationAction(ISD::OR, MVT::v4i32, Legal); setOperationAction(ISD::XOR, MVT::v4i32, Legal); + setOperationAction(ISD::SRA, MVT::v2i64, Custom); + setOperationAction(ISD::SRA, MVT::v4i64, Custom); } - // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion - // of this type with custom code. 
- for (MVT VT : MVT::vector_valuetypes()) - setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); - // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); @@ -1667,6 +1601,7 @@ void X86TargetLowering::resetOperationActions() { // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); @@ -1687,16 +1622,14 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); - setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); - if (Subtarget->is64Bit()) - setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); - computeRegisterProperties(); + computeRegisterProperties(Subtarget->getRegisterInfo()); // On Darwin, -Os means optimize for size without hurting performance, // do not reduce the limit. @@ -1837,8 +1770,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, MachineFunction &MF) const { const Function *F = MF.getFunction(); if ((!IsMemset || ZeroMemset) && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat)) { + !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || ((DstAlign == 0 || DstAlign >= 16) && @@ -1898,6 +1830,10 @@ unsigned X86TargetLowering::getJumpTableEncoding() const { return TargetLowering::getJumpTableEncoding(); } +bool X86TargetLowering::useSoftFloat() const { + return Subtarget->useSoftFloat(); +} + const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, @@ -1933,14 +1869,14 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); } -// FIXME: Why this routine is here? Move to RegInfo! -std::pair<const TargetRegisterClass*, uint8_t> -X86TargetLowering::findRepresentativeClass(MVT VT) const{ +std::pair<const TargetRegisterClass *, uint8_t> +X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: - return TargetLowering::findRepresentativeClass(VT); + return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; @@ -2023,7 +1959,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, SmallVector<SDValue, 6> RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop - RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), + RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, MVT::i16)); // Copy the result values into the output registers. 
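The AExt change in the hunk below sign-extends i1 vector elements instead of any-extending them: a lane that is true has to become all-ones in the wider lane so the value still works as a full-lane mask (for bitwise blends, for instance), whereas a zero- or any-extend would leave only the low bit set. A standalone illustration of why the all-ones form matters.

#include <cassert>
#include <cstdint>

int main() {
  const int8_t LaneTrue = -1;                          // i1 "true" stored as a signed lane
  const uint32_t SExt = (uint32_t)(int32_t)LaneTrue;   // sign-extend: 0xffffffff
  const uint32_t ZExt = (uint8_t)LaneTrue;             // zero-extend: 0x000000ff

  assert(SExt == 0xffffffffu);              // usable as a full-lane mask
  assert(ZExt != 0xffffffffu);              // not a valid all-ones mask

  const uint32_t A = 0xdeadbeef, B = 0x12345678;
  assert(((A & SExt) | (B & ~SExt)) == A);  // blending with the mask selects A
  return 0;
}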
@@ -2038,8 +1974,12 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); - else if (VA.getLocInfo() == CCValAssign::AExt) - ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::AExt) { + if (ValVT.isVector() && ValVT.getScalarType() == MVT::i1) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + } else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); @@ -2094,19 +2034,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } - // The x86-64 ABIs require that for returning structs by value we copy + // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. - // Win32 requires us to put the sret argument to %eax as well. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. - if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { - MachineFunction &MF = DAG.getMachineFunction(); - X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - unsigned Reg = FuncInfo->getSRetReturnReg(); - assert(Reg && - "SRetReturnReg should have been set in LowerFormalArguments()."); - SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); + // + // Checking Function.hasStructRetAttr() here is insufficient because the IR + // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is + // false, then an sret argument may be implicitly inserted in the SelDAG. In + // either case FuncInfo->setSRetReturnReg() will have been called. + if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy()); unsigned RetValReg = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? @@ -2200,7 +2138,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Copy all of the result registers out of their specified physreg. for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; - EVT CopyVT = VA.getValVT(); + EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && @@ -2210,18 +2148,24 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. + bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && - isScalarFPTypeInSSEReg(VA.getValVT())) + isScalarFPTypeInSSEReg(VA.getValVT())) { CopyVT = MVT::f80; + RoundAfterCopy = (CopyVT != VA.getLocVT()); + } Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); SDValue Val = Chain.getValue(0); - if (CopyVT != VA.getValVT()) + if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. 
- DAG.getIntPtrConstant(1)); + DAG.getIntPtrConstant(1, dl)); + + if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1) + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); InFlag = Chain.getValue(2); InVals.push_back(Val); @@ -2281,10 +2225,11 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl) { - SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, + /*isTailCall*/false, MachinePointerInfo(), MachinePointerInfo()); } @@ -2337,7 +2282,10 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, // If value is passed by pointer we have address passed instead of the value // itself. - if (VA.getLocInfo() == CCValAssign::Indirect) + bool ExtendedInMem = VA.isExtInLoc() && + VA.getValVT().getScalarType() == MVT::i1; + + if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); @@ -2355,9 +2303,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - return DAG.getLoad(ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, false, 0); + return ExtendedInMem ? + DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } } @@ -2393,12 +2343,11 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, } const Function *Fn = MF.getFunction(); - bool NoImplicitFloatOps = Fn->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); - assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && + bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); + bool isSoftFloat = Subtarget->useSoftFloat(); + assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); - if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || - !Subtarget->hasSSE1()) + if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; @@ -2421,6 +2370,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); const Function* Fn = MF.getFunction(); if (Fn->hasExternalLinkage() && @@ -2505,7 +2455,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. 
- if (RegVT.isVector()) + if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); @@ -2523,24 +2473,21 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(ArgValue); } - if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) { - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - // The x86-64 ABIs require that for returning structs by value we copy - // the sret argument into %rax/%eax (depending on ABI) for the return. - // Win32 requires us to put the sret argument to %eax as well. - // Save the argument into a virtual register so that we can access it - // from the return points. - if (Ins[i].Flags.isSRet()) { - unsigned Reg = FuncInfo->getSRetReturnReg(); - if (!Reg) { - MVT PtrTy = getPointerTy(); - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); - FuncInfo->setSRetReturnReg(Reg); - } - SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); - break; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // All x86 ABIs require that for returning structs by value we copy the + // sret argument into %rax/%eax (depending on ABI) for the return. Save + // the argument into a virtual register so that we can access it from the + // return points. + if (Ins[i].Flags.isSRet()) { + unsigned Reg = FuncInfo->getSRetReturnReg(); + if (!Reg) { + MVT PtrTy = getPointerTy(); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + break; } } @@ -2560,10 +2507,16 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MFI->CreateFixedObject(1, StackSize, true)); } + MachineModuleInfo &MMI = MF.getMMI(); + const Function *WinEHParent = nullptr; + if (IsWin64 && MMI.hasWinEHFuncInfo(Fn)) + WinEHParent = MMI.getWinEHParent(Fn); + bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn; + bool IsWinEHParent = WinEHParent && WinEHParent == Fn; + // Figure out if XMM registers are in use. - assert(!(MF.getTarget().Options.UseSoftFloat && - Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat)) && + assert(!(Subtarget->useSoftFloat() && + Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we @@ -2572,10 +2525,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Find the first unallocated argument registers. 
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); - unsigned NumIntRegs = - CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size()); - unsigned NumXMMRegs = - CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size()); + unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); @@ -2599,7 +2550,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } if (IsWin64) { - const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; @@ -2625,7 +2575,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, - DAG.getIntPtrConstant(Offset)); + DAG.getIntPtrConstant(Offset, dl)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( @@ -2641,9 +2591,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex())); + FuncInfo->getRegSaveFrameIndex(), dl)); SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset())); + FuncInfo->getVarArgsFPOffset(), dl)); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, @@ -2652,6 +2602,27 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + } else if (IsWinEHOutlined) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject( + /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false)); + + MMI.getWinEHFuncInfo(Fn) + .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] = + FuncInfo->getRegSaveFrameIndex(); + + // Store the second integer parameter (rdx) into rsp+16 relative to the + // stack pointer at the entry of the function. 
+ SDValue RSFIN = + DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); + unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64); + Chain = DAG.getStore( + Val.getValue(1), dl, Val, RSFIN, + MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()), + /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0); } if (isVarArg && MFI->hasMustTailInVarArgFunc()) { @@ -2718,6 +2689,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setArgumentStackSize(StackSize); + if (IsWinEHParent) { + int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64); + MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI; + SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64); + Chain = DAG.getStore(Chain, dl, Neg2, StackSlot, + MachinePointerInfo::getFixedStack(UnwindHelpFI), + /*isVolatile=*/true, + /*isNonTemporal=*/false, /*Alignment=*/0); + } + return Chain; } @@ -2728,7 +2710,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); - SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); @@ -2874,7 +2856,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!IsSibcall) Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl); + Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); SDValue RetAddrFrIdx; // Load return address for tail calls. @@ -2888,8 +2870,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -2912,7 +2893,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: - if (RegVT.is128BitVector()) { + if (Arg.getValueType().isVector() && + Arg.getValueType().getScalarType() == MVT::i1) + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. 
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); @@ -3002,12 +2986,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), - DAG.getConstant(NumXMMRegs, MVT::i8))); + DAG.getConstant(NumXMMRegs, dl, + MVT::i8))); } if (isVarArg && IsMustTail) { @@ -3051,7 +3036,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Flags.isByVal()) { // Copy relative to framepointer. - SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); + SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), @@ -3124,11 +3109,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; - } else if (Subtarget->isPICStyleRIPRel() && - isa<Function>(GV) && - cast<Function>(GV)->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::NonLazyBind)) { + } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) && + cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) { // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). @@ -3168,7 +3150,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlags); - } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { + } else if (Subtarget->isTarget64BitILP32() && + Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } @@ -3179,8 +3162,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, - DAG.getIntPtrConstant(NumBytesToPop, true), - DAG.getIntPtrConstant(0, true), InFlag, dl); + DAG.getIntPtrConstant(NumBytesToPop, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } @@ -3188,7 +3171,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(Callee); if (isTailCall) - Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); + Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. @@ -3197,8 +3180,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
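Most of the mechanical churn in the hunks above comes from one SelectionDAG API change: constant creation now takes the debug location explicitly. A minimal sketch of the new calling pattern, using variables (dl, DAG, Chain, LocMemOffset, NumBytesToPush) borrowed from the surrounding code:

    // 'dl' is the SDLoc of the node being lowered; every constant now carries
    // it so source locations survive legalization.
    SDValue Zero   = DAG.getConstant(0, dl, MVT::i32);        // was getConstant(0, MVT::i32)
    SDValue OffPtr = DAG.getIntPtrConstant(LocMemOffset, dl); // was getIntPtrConstant(LocMemOffset)
    Chain = DAG.getCALLSEQ_START(
        Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, /*isTarget=*/true), dl);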
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3212,6 +3195,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. + MF.getFrameInfo()->setHasTailCall(); return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } @@ -3237,8 +3221,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, - DAG.getIntPtrConstant(NumBytesToPop, true), - DAG.getIntPtrConstant(NumBytesForCalleeToPop, + DAG.getIntPtrConstant(NumBytesToPop, dl, true), + DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl); InFlag = Chain.getValue(1); @@ -3286,11 +3270,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - const TargetMachine &TM = MF.getTarget(); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); - const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; @@ -3327,7 +3308,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return false; } else { unsigned Opcode = Def->getOpcode(); - if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && + if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || + Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); @@ -3392,6 +3374,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); + // Win64 functions have extra shadow space for argument homing. Don't do the + // sibcall if the caller and callee have mismatched expectations for this + // space. + if (IsCalleeWin64 != IsCallerWin64) + return false; + if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; @@ -3403,8 +3391,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3516,8 +3503,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // the caller's fixed stack objects. 
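Two more recurring replacements run through the call-lowering hunks above: function attributes are queried with hasFnAttribute, and register/frame/instruction info comes from the cached X86 Subtarget instead of being re-derived (and cast) from the DAG. A condensed sketch of the pattern, with Fn, MF, CallConv and Subtarget as in the surrounding code:

    // One call replaces getAttributes().hasAttribute(AttributeSet::FunctionIndex, ...).
    bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);

    // Target info straight from the subtarget; no static_cast of DAG.getSubtarget().
    const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
    const TargetRegisterInfo *TRI  = Subtarget->getRegisterInfo();
    // getCallPreservedMask now additionally takes the MachineFunction.
    const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);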
MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const X86InstrInfo *TII = - static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo()); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3614,17 +3600,6 @@ static bool isTargetShuffle(unsigned Opcode) { } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, - SDValue V1, SelectionDAG &DAG) { - switch(Opc) { - default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: - case X86ISD::MOVDDUP: - return DAG.getNode(Opc, dl, VT, V1); - } -} - -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3634,21 +3609,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, case X86ISD::PSHUFLW: case X86ISD::VPERMILPI: case X86ISD::VPERMI: - return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); - } -} - -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, - SDValue V1, SDValue V2, unsigned TargetMask, - SelectionDAG &DAG) { - switch(Opc) { - default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::PALIGNR: - case X86ISD::VALIGN: - case X86ISD::SHUFP: - case X86ISD::VPERM2X128: - return DAG.getNode(Opc, dl, VT, V1, V2, - DAG.getConstant(TargetMask, MVT::i8)); + return DAG.getNode(Opc, dl, VT, V1, + DAG.getConstant(TargetMask, dl, MVT::i8)); } } @@ -3671,8 +3633,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -3759,13 +3720,13 @@ static bool isX86CCUnsigned(unsigned X86CC) { /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 /// specific condition code, returning the condition code and the LHS/RHS of the /// comparison to make. -static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, +static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. - RHS = DAG.getConstant(0, RHS.getValueType()); + RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { @@ -3774,7 +3735,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, } if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 - RHS = DAG.getConstant(0, RHS.getValueType()); + RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } } @@ -3939,849 +3900,6 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } -/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFD. That is, it doesn't reference the other -/// operand - by default will match for first operand. 
-static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT, - bool TestSecondOperand = false) { - if (VT != MVT::v4f32 && VT != MVT::v4i32 && - VT != MVT::v2f64 && VT != MVT::v2i64) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - unsigned Lo = TestSecondOperand ? NumElems : 0; - unsigned Hi = Lo + NumElems; - - for (unsigned i = 0; i < NumElems; ++i) - if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi)) - return false; - - return true; -} - -/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { - if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) - return false; - - // Lower quadword copied in order or undef. - if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) - return false; - - // Upper quadword shuffled. - for (unsigned i = 4; i != 8; ++i) - if (!isUndefOrInRange(Mask[i], 4, 8)) - return false; - - if (VT == MVT::v16i16) { - // Lower quadword copied in order or undef. - if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) - return false; - - // Upper quadword shuffled. - for (unsigned i = 12; i != 16; ++i) - if (!isUndefOrInRange(Mask[i], 12, 16)) - return false; - } - - return true; -} - -/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { - if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) - return false; - - // Upper quadword copied in order. - if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) - return false; - - // Lower quadword shuffled. - for (unsigned i = 0; i != 4; ++i) - if (!isUndefOrInRange(Mask[i], 0, 4)) - return false; - - if (VT == MVT::v16i16) { - // Upper quadword copied in order. - if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) - return false; - - // Lower quadword shuffled. - for (unsigned i = 8; i != 12; ++i) - if (!isUndefOrInRange(Mask[i], 8, 12)) - return false; - } - - return true; -} - -/// \brief Return true if the mask specifies a shuffle of elements that is -/// suitable for input to intralane (palignr) or interlane (valign) vector -/// right-shift. -static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - // Do not handle 64-bit element shuffles with palignr. - if (NumLaneElts == 2) - return false; - - for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { - unsigned i; - for (i = 0; i != NumLaneElts; ++i) { - if (Mask[i+l] >= 0) - break; - } - - // Lane is all undef, go to next lane - if (i == NumLaneElts) - continue; - - int Start = Mask[i+l]; - - // Make sure its in this lane in one of the sources - if (!isUndefOrInRange(Start, l, l+NumLaneElts) && - !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) - return false; - - // If not lane 0, then we must match lane 0 - if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) - return false; - - // Correct second source to be contiguous with first source - if (Start >= (int)NumElts) - Start -= NumElts - NumLaneElts; - - // Make sure we're shifting in the right direction. - if (Start <= (int)(i+l)) - return false; - - Start -= i; - - // Check the rest of the elements to see if they are consecutive. 
- for (++i; i != NumLaneElts; ++i) { - int Idx = Mask[i+l]; - - // Make sure its in this lane - if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && - !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) - return false; - - // If not lane 0, then we must match lane 0 - if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) - return false; - - if (Idx >= (int)NumElts) - Idx -= NumElts - NumLaneElts; - - if (!isUndefOrEqual(Idx, Start+i)) - return false; - - } - } - - return true; -} - -/// \brief Return true if the node specifies a shuffle of elements that is -/// suitable for input to PALIGNR. -static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || - (VT.is256BitVector() && !Subtarget->hasInt256()) || - VT.is512BitVector()) - // FIXME: Add AVX512BW. - return false; - - return isAlignrMask(Mask, VT, false); -} - -/// \brief Return true if the node specifies a shuffle of elements that is -/// suitable for input to VALIGN. -static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - // FIXME: Add AVX512VL. - if (!VT.is512BitVector() || !Subtarget->hasAVX512()) - return false; - return isAlignrMask(Mask, VT, true); -} - -/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming -/// the two vector operands have swapped position. -static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, - unsigned NumElems) { - for (unsigned i = 0; i != NumElems; ++i) { - int idx = Mask[i]; - if (idx < 0) - continue; - else if (idx < (int)NumElems) - Mask[i] = idx + NumElems; - else - Mask[i] = idx - NumElems; - } -} - -/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 128/256-bit -/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be -/// reverse of what x86 shuffles want. -static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) { - - unsigned NumElems = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElems = NumElems/NumLanes; - - if (NumLaneElems != 2 && NumLaneElems != 4) - return false; - - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - bool symetricMaskRequired = - (VT.getSizeInBits() >= 256) && (EltSize == 32); - - // VSHUFPSY divides the resulting vector into 4 chunks. - // The sources are also splitted into 4 chunks, and each destination - // chunk must come from a different source chunk. - // - // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 - // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 - // - // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, - // Y3..Y0, Y3..Y0, X3..X0, X3..X0 - // - // VSHUFPDY divides the resulting vector into 4 chunks. - // The sources are also splitted into 4 chunks, and each destination - // chunk must come from a different source chunk. - // - // SRC1 => X3 X2 X1 X0 - // SRC2 => Y3 Y2 Y1 Y0 - // - // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 - // - SmallVector<int, 4> MaskVal(NumLaneElems, -1); - unsigned HalfLaneElems = NumLaneElems/2; - for (unsigned l = 0; l != NumElems; l += NumLaneElems) { - for (unsigned i = 0; i != NumLaneElems; ++i) { - int Idx = Mask[i+l]; - unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0); - if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems)) - return false; - // For VSHUFPSY, the mask of the second half must be the same as the - // first but with the appropriate offsets. 
This works in the same way as - // VPERMILPS works with masks. - if (!symetricMaskRequired || Idx < 0) - continue; - if (MaskVal[i] < 0) { - MaskVal[i] = Idx - l; - continue; - } - if ((signed)(Idx - l) != MaskVal[i]) - return false; - } - } - - return true; -} - -/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVHLPS. -static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 4) - return false; - - // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 - return isUndefOrEqual(Mask[0], 6) && - isUndefOrEqual(Mask[1], 7) && - isUndefOrEqual(Mask[2], 2) && - isUndefOrEqual(Mask[3], 3); -} - -/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form -/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, -/// <2, 3, 2, 3> -static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 4) - return false; - - return isUndefOrEqual(Mask[0], 2) && - isUndefOrEqual(Mask[1], 3) && - isUndefOrEqual(Mask[2], 2) && - isUndefOrEqual(Mask[3], 3); -} - -/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. -static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - - for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i + NumElems)) - return false; - - for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - - return true; -} - -/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVLHPS. -static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - - for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - - for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Mask[i + e], i + NumElems)) - return false; - - return true; -} - -/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to INSERTPS. -/// i. e: If all but one element come from the same vector. -static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) { - // TODO: Deal with AVX's VINSERTPS - if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32)) - return false; - - unsigned CorrectPosV1 = 0; - unsigned CorrectPosV2 = 0; - for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { - if (Mask[i] == -1) { - ++CorrectPosV1; - ++CorrectPosV2; - continue; - } - - if (Mask[i] == i) - ++CorrectPosV1; - else if (Mask[i] == i + 4) - ++CorrectPosV2; - } - - if (CorrectPosV1 == 3 || CorrectPosV2 == 3) - // We have 3 elements (undefs count as elements from any vector) from one - // vector, and one from another. - return true; - - return false; -} - -// -// Some special combinations that can be optimized. 
-// -static -SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - SDLoc dl(SVOp); - - if (VT != MVT::v8i32 && VT != MVT::v8f32) - return SDValue(); - - ArrayRef<int> Mask = SVOp->getMask(); - - // These are the special masks that may be optimized. - static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; - static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; - bool MatchEvenMask = true; - bool MatchOddMask = true; - for (int i=0; i<8; ++i) { - if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) - MatchEvenMask = false; - if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) - MatchOddMask = false; - } - - if (!MatchEvenMask && !MatchOddMask) - return SDValue(); - - SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); - - SDValue Op0 = SVOp->getOperand(0); - SDValue Op1 = SVOp->getOperand(1); - - if (MatchEvenMask) { - // Shift the second operand right to 32 bits. - static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; - Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); - } else { - // Shift the first operand left to 32 bits. - static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; - Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); - } - static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; - return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); -} - -/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to UNPCKL. -static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, - bool HasInt256, bool V2IsSplat = false) { - - assert(VT.getSizeInBits() >= 128 && - "Unsupported vector type for unpckl"); - - unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) - return false; - - assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && - "Unsupported vector type for unpckh"); - - // AVX defines UNPCK* to operate independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l+i]; - int BitI1 = Mask[l+i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (!isUndefOrEqual(BitI1, NumElts)) - return false; - } else { - if (!isUndefOrEqual(BitI1, j + NumElts)) - return false; - } - } - } - - return true; -} - -/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to UNPCKH. -static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, - bool HasInt256, bool V2IsSplat = false) { - assert(VT.getSizeInBits() >= 128 && - "Unsupported vector type for unpckh"); - - unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) - return false; - - assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && - "Unsupported vector type for unpckh"); - - // AVX defines UNPCK* to operate independently on 128-bit lanes. 
- unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l+i]; - int BitI1 = Mask[l+i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (isUndefOrEqual(BitI1, NumElts)) - return false; - } else { - if (!isUndefOrEqual(BitI1, j+NumElts)) - return false; - } - } - } - return true; -} - -/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form -/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, -/// <0, 0, 1, 1> -static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { - unsigned NumElts = VT.getVectorNumElements(); - bool Is256BitVec = VT.is256BitVector(); - - if (VT.is512BitVector()) - return false; - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Unsupported vector type for unpckh"); - - if (Is256BitVec && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) - return false; - - // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern - // FIXME: Need a better way to get rid of this, there's no latency difference - // between UNPCKLPD and MOVDDUP, the later should always be checked first and - // the former later. We should also remove the "_undef" special mask. - if (NumElts == 4 && Is256BitVec) - return false; - - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l+i]; - int BitI1 = Mask[l+i+1]; - - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; - } - } - - return true; -} - -/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form -/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, -/// <2, 2, 3, 3> -static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { - unsigned NumElts = VT.getVectorNumElements(); - - if (VT.is512BitVector()) - return false; - - assert((VT.is128BitVector() || VT.is256BitVector()) && - "Unsupported vector type for unpckh"); - - if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) - return false; - - // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate - // independently on 128-bit lanes. 
- unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - for (unsigned l = 0; l != NumElts; l += NumLaneElts) { - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l+i]; - int BitI1 = Mask[l+i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; - } - } - return true; -} - -// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or -// (src1[0], src0[1]), manipulation with 256-bit sub-vectors -static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) { - if (!VT.is512BitVector()) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfSize = NumElts/2; - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) { - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) { - *Imm = 1; - return true; - } - } - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) { - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) { - *Imm = 0; - return true; - } - } - return false; -} - -/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVSS, -/// MOVSD, and MOVD, i.e. setting the lowest element. -static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { - if (VT.getVectorElementType().getSizeInBits() < 32) - return false; - if (!VT.is128BitVector()) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - - if (!isUndefOrEqual(Mask[0], NumElts)) - return false; - - for (unsigned i = 1; i != NumElts; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - - return true; -} - -/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered -/// as permutations between 128-bit chunks or halves. As an example: this -/// shuffle bellow: -/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> -/// The first half comes from the second half of V1 and the second half from the -/// the second half of V2. -static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { - if (!HasFp256 || !VT.is256BitVector()) - return false; - - // The shuffle result is divided into half A and half B. In total the two - // sources have 4 halves, namely: C, D, E, F. The final values of A and - // B must come from C, D, E or F. - unsigned HalfSize = VT.getVectorNumElements()/2; - bool MatchA = false, MatchB = false; - - // Check if A comes from one of C, D, E, F. - for (unsigned Half = 0; Half != 4; ++Half) { - if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { - MatchA = true; - break; - } - } - - // Check if B comes from one of C, D, E, F. - for (unsigned Half = 0; Half != 4; ++Half) { - if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { - MatchB = true; - break; - } - } - - return MatchA && MatchB; -} - -/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. 
-static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { - MVT VT = SVOp->getSimpleValueType(0); - - unsigned HalfSize = VT.getVectorNumElements()/2; - - unsigned FstHalf = 0, SndHalf = 0; - for (unsigned i = 0; i < HalfSize; ++i) { - if (SVOp->getMaskElt(i) > 0) { - FstHalf = SVOp->getMaskElt(i)/HalfSize; - break; - } - } - for (unsigned i = HalfSize; i < HalfSize*2; ++i) { - if (SVOp->getMaskElt(i) > 0) { - SndHalf = SVOp->getMaskElt(i)/HalfSize; - break; - } - } - - return (FstHalf | (SndHalf << 4)); -} - -// Symetric in-lane mask. Each lane has 4 elements (for imm8) -static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (EltSize < 32) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - Imm8 = 0; - if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { - for (unsigned i = 0; i != NumElts; ++i) { - if (Mask[i] < 0) - continue; - Imm8 |= Mask[i] << (i*2); - } - return true; - } - - unsigned LaneSize = 4; - SmallVector<int, 4> MaskVal(LaneSize, -1); - - for (unsigned l = 0; l != NumElts; l += LaneSize) { - for (unsigned i = 0; i != LaneSize; ++i) { - if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) - return false; - if (Mask[i+l] < 0) - continue; - if (MaskVal[i] < 0) { - MaskVal[i] = Mask[i+l] - l; - Imm8 |= MaskVal[i] << (i*2); - continue; - } - if (Mask[i+l] != (signed)(MaskVal[i]+l)) - return false; - } - } - return true; -} - -/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. -/// Note that VPERMIL mask matching is different depending whether theunderlying -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point -/// to the same elements of the low, but to the higher half of the source. -/// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. -static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - if (VT.getSizeInBits() < 256 || EltSize < 32) - return false; - bool symetricMaskRequired = (EltSize == 32); - unsigned NumElts = VT.getVectorNumElements(); - - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned LaneSize = NumElts/NumLanes; - // 2 or 4 elements in one lane - - SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1); - for (unsigned l = 0; l != NumElts; l += LaneSize) { - for (unsigned i = 0; i != LaneSize; ++i) { - if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) - return false; - if (symetricMaskRequired) { - if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) { - ExpectedMaskVal[i] = Mask[i+l] - l; - continue; - } - if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l)) - return false; - } - } - } - return true; -} - -/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse -/// of what x86 movss want. X86 movs requires the lowest element to be lowest -/// element of vector 2 and the other elements to come from vector 1 in order. 
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT, - bool V2IsSplat = false, bool V2IsUndef = false) { - if (!VT.is128BitVector()) - return false; - - unsigned NumOps = VT.getVectorNumElements(); - if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) - return false; - - if (!isUndefOrEqual(Mask[0], 0)) - return false; - - for (unsigned i = 1; i != NumOps; ++i) - if (!(isUndefOrEqual(Mask[i], i+NumOps) || - (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || - (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) - return false; - - return true; -} - -/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. -/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> -static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if ((VT.is128BitVector() && NumElems != 4) || - (VT.is256BitVector() && NumElems != 8) || - (VT.is512BitVector() && NumElems != 16)) - return false; - - // "i+1" is the value the indexed mask element must have - for (unsigned i = 0; i != NumElems; i += 2) - if (!isUndefOrEqual(Mask[i], i+1) || - !isUndefOrEqual(Mask[i+1], i+1)) - return false; - - return true; -} - -/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. -/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> -static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3()) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if ((VT.is128BitVector() && NumElems != 4) || - (VT.is256BitVector() && NumElems != 8) || - (VT.is512BitVector() && NumElems != 16)) - return false; - - // "i" is the value the indexed mask element must have - for (unsigned i = 0; i != NumElems; i += 2) - if (!isUndefOrEqual(Mask[i], i) || - !isUndefOrEqual(Mask[i+1], i)) - return false; - - return true; -} - -/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 256-bit -/// version of MOVDDUP. -static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) { - if (!HasFp256 || !VT.is256BitVector()) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - if (NumElts != 4) - return false; - - for (unsigned i = 0; i != NumElts/2; ++i) - if (!isUndefOrEqual(Mask[i], 0)) - return false; - for (unsigned i = NumElts/2; i != NumElts; ++i) - if (!isUndefOrEqual(Mask[i], NumElts/2)) - return false; - return true; -} - -/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to 128-bit -/// version of MOVDDUP. 
-static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - unsigned e = VT.getVectorNumElements() / 2; - for (unsigned i = 0; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - for (unsigned i = 0; i != e; ++i) - if (!isUndefOrEqual(Mask[e+i], i)) - return false; - return true; -} - /// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for instruction that extract 128 or 256 bit vectors @@ -4835,125 +3953,6 @@ bool X86::isVEXTRACT256Index(SDNode *N) { return isVEXTRACTIndex(N, 256); } -/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. -/// Handles 128-bit and 256-bit. -static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getSimpleValueType(0); - - assert((VT.getSizeInBits() >= 128) && - "Unsupported vector type for PSHUF/SHUFP"); - - // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate - // independently on 128-bit lanes. - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) && - "Only supports 2, 4 or 8 elements per lane"); - - unsigned Shift = (NumLaneElts >= 4) ? 1 : 0; - unsigned Mask = 0; - for (unsigned i = 0; i != NumElts; ++i) { - int Elt = N->getMaskElt(i); - if (Elt < 0) continue; - Elt &= NumLaneElts - 1; - unsigned ShAmt = (i << Shift) % 8; - Mask |= Elt << ShAmt; - } - - return Mask; -} - -/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. -static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getSimpleValueType(0); - - assert((VT == MVT::v8i16 || VT == MVT::v16i16) && - "Unsupported vector type for PSHUFHW"); - - unsigned NumElts = VT.getVectorNumElements(); - - unsigned Mask = 0; - for (unsigned l = 0; l != NumElts; l += 8) { - // 8 nodes per lane, but we only care about the last 4. - for (unsigned i = 0; i < 4; ++i) { - int Elt = N->getMaskElt(l+i+4); - if (Elt < 0) continue; - Elt &= 0x3; // only 2-bits. - Mask |= Elt << (i * 2); - } - } - - return Mask; -} - -/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. -static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { - MVT VT = N->getSimpleValueType(0); - - assert((VT == MVT::v8i16 || VT == MVT::v16i16) && - "Unsupported vector type for PSHUFHW"); - - unsigned NumElts = VT.getVectorNumElements(); - - unsigned Mask = 0; - for (unsigned l = 0; l != NumElts; l += 8) { - // 8 nodes per lane, but we only care about the first 4. - for (unsigned i = 0; i < 4; ++i) { - int Elt = N->getMaskElt(l+i); - if (Elt < 0) continue; - Elt &= 0x3; // only 2-bits - Mask |= Elt << (i * 2); - } - } - - return Mask; -} - -/// \brief Return the appropriate immediate to shuffle the specified -/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with -/// VALIGN (if Interlane is true) instructions. -static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp, - bool InterLane) { - MVT VT = SVOp->getSimpleValueType(0); - unsigned EltSize = InterLane ? 
1 : - VT.getVectorElementType().getSizeInBits() >> 3; - - unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128; - unsigned NumLaneElts = NumElts/NumLanes; - - int Val = 0; - unsigned i; - for (i = 0; i != NumElts; ++i) { - Val = SVOp->getMaskElt(i); - if (Val >= 0) - break; - } - if (Val >= (int)NumElts) - Val -= NumElts - NumLaneElts; - - assert(Val - i > 0 && "PALIGNR imm should be positive"); - return (Val - i) * EltSize; -} - -/// \brief Return the appropriate immediate to shuffle the specified -/// VECTOR_SHUFFLE mask with the PALIGNR instruction. -static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { - return getShuffleAlignrImmediate(SVOp, false); -} - -/// \brief Return the appropriate immediate to shuffle the specified -/// VECTOR_SHUFFLE mask with the VALIGN instruction. -static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) { - return getShuffleAlignrImmediate(SVOp, true); -} - - static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -5028,119 +4027,6 @@ bool X86::isZeroNode(SDValue Elt) { return false; } -/// ShouldXformToMOVHLPS - Return true if the node should be transformed to -/// match movhlps. The lower half elements should come from upper half of -/// V1 (and in order), and the upper half elements should come from the upper -/// half of V2 (and in order). -static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - if (VT.getVectorNumElements() != 4) - return false; - for (unsigned i = 0, e = 2; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i+2)) - return false; - for (unsigned i = 2; i != 4; ++i) - if (!isUndefOrEqual(Mask[i], i+4)) - return false; - return true; -} - -/// isScalarLoadToVector - Returns true if the node is a scalar load that -/// is promoted to a vector. It also returns the LoadSDNode by reference if -/// required. -static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) { - if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) - return false; - N = N->getOperand(0).getNode(); - if (!ISD::isNON_EXTLoad(N)) - return false; - if (LD) - *LD = cast<LoadSDNode>(N); - return true; -} - -// Test whether the given value is a vector value which will be legalized -// into a load. -static bool WillBeConstantPoolLoad(SDNode *N) { - if (N->getOpcode() != ISD::BUILD_VECTOR) - return false; - - // Check for any non-constant elements. - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - switch (N->getOperand(i).getNode()->getOpcode()) { - case ISD::UNDEF: - case ISD::ConstantFP: - case ISD::Constant: - break; - default: - return false; - } - - // Vectors of all-zeros and all-ones are materialized with special - // instructions rather than being loaded. - return !ISD::isBuildVectorAllZeros(N) && - !ISD::isBuildVectorAllOnes(N); -} - -/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to -/// match movlp{s|d}. The lower half elements should come from lower half of -/// V1 (and in order), and the upper half elements should come from the upper -/// half of V2 (and in order). And since V1 will become the source of the -/// MOVLP, it must be either a vector load or a scalar load to vector. 
-static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, - ArrayRef<int> Mask, MVT VT) { - if (!VT.is128BitVector()) - return false; - - if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) - return false; - // Is V2 is a vector load, don't do this transformation. We will try to use - // load folding shufps op. - if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) - return false; - - unsigned NumElems = VT.getVectorNumElements(); - - if (NumElems != 2 && NumElems != 4) - return false; - for (unsigned i = 0, e = NumElems/2; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i)) - return false; - for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) - if (!isUndefOrEqual(Mask[i], i+NumElems)) - return false; - return true; -} - -/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved -/// to an zero vector. -/// FIXME: move to dag combiner / method on ShuffleVectorSDNode -static bool isZeroShuffle(ShuffleVectorSDNode *N) { - SDValue V1 = N->getOperand(0); - SDValue V2 = N->getOperand(1); - unsigned NumElems = N->getValueType(0).getVectorNumElements(); - for (unsigned i = 0; i != NumElems; ++i) { - int Idx = N->getMaskElt(i); - if (Idx >= (int)NumElems) { - unsigned Opc = V2.getOpcode(); - if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) - continue; - if (Opc != ISD::BUILD_VECTOR || - !X86::isZeroNode(V2.getOperand(Idx-NumElems))) - return false; - } else if (Idx >= 0) { - unsigned Opc = V1.getOpcode(); - if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) - continue; - if (Opc != ISD::BUILD_VECTOR || - !X86::isZeroNode(V1.getOperand(Idx))) - return false; - } - } - return true; -} - /// getZeroVector - Returns a vector of specified type with all zero elements. /// static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, @@ -5152,33 +4038,37 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Vec; if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 - SDValue Cst = DAG.getConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 - SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); + SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 - SDValue Cst = DAG.getConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. 
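The long run of deletions above drops the legacy shuffle-mask classifiers (isPSHUFDMask, isSHUFPMask, isUNPCKL/HMask, isMOVL*, isMOVDDUP* and friends) together with their immediate calculators, leaving only generic helpers such as isSequentialOrUndefInRange. The removed predicates all shared one shape: walk the mask and accept each element that is undef or exactly the slot the instruction pattern requires. A condensed, purely illustrative sketch of that shared idiom (isIdentityLowHalf is a made-up name, not a function from the file; isUndefOrEqual is the helper the removed code used throughout):

    // Accept a low half that copies elements 0..N/2-1 in place, tolerating undefs.
    static bool isIdentityLowHalf(ArrayRef<int> Mask, unsigned NumElems) {
      for (unsigned i = 0; i != NumElems / 2; ++i)
        if (!isUndefOrEqual(Mask[i], i))
          return false;
      return true;
    }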
- SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); + SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 - SDValue Cst = DAG.getConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { - assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); - SDValue Cst = DAG.getConstant(0, MVT::i1); - SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + + assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) + && "Unexpected vector type"); + assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) + && "Unexpected vector type"); + SDValue Cst = DAG.getConstant(0, dl, MVT::i1); + SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); @@ -5186,6 +4076,162 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } +static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl, + unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + unsigned Factor = VT.getSizeInBits()/vectorWidth; + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, + VT.getVectorNumElements()/Factor); + + // Extract from UNDEF is UNDEF. + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getUNDEF(ResultVT); + + // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) + * ElemsPerChunk); + + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, + makeArrayRef(Vec->op_begin() + NormalizedIdxVal, + ElemsPerChunk)); + + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); +} + +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 +/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 +/// instructions or a simple subregister reference. Idx is an index in the +/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert((Vec.getValueType().is256BitVector() || + Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); +} + +/// Generate a DAG to grab 256-bits from a 512-bit vector. 
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); +} + +static SDValue InsertSubVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl, unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); + + // Insert the relevant vectorWidth bits. + unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) + * ElemsPerChunk); + + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); +} + +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or +/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering INSERT_VECTOR_ELT operations easier. +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); + + // For insertion into the zero index (low half) of a 256-bit vector, it is + // more efficient to generate a blend with immediate instead of an insert*128. + // We are still creating an INSERT_SUBVECTOR below with an undef node to + // extend the subvector to the size of the result vector. Make sure that + // we are not recursing on that node by checking for undef here. + if (IdxVal == 0 && Result.getValueType().is256BitVector() && + Result.getOpcode() != ISD::UNDEF) { + EVT ResultVT = Result.getValueType(); + SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(ResultVT); + SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, + Vec, ZeroIndex); + + // The blend instruction, and therefore its mask, depend on the data type. + MVT ScalarType = ResultVT.getScalarType().getSimpleVT(); + if (ScalarType.isFloatingPoint()) { + // Choose either vblendps (float) or vblendpd (double). + unsigned ScalarSize = ScalarType.getSizeInBits(); + assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); + unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; + SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); + } + + const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + + // AVX2 is needed for 256-bit integer blend support. + // Integers must be cast to 32-bit because there is only vpblendd; + // vpblendw can't be used for this because it has a handicapped mask. + + // If we don't have AVX2, then cast to float. Using a wrong domain blend + // is still more efficient than using the wrong domain vinsertf128 that + // will be created by InsertSubVector(). + MVT CastVT = Subtarget.hasAVX2() ? 
MVT::v8i32 : MVT::v8f32; + + SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); + Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256); + Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); + return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256); + } + + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); +} + +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. +static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); +} + +static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert256BitVector(V, V2, NumElems/2, DAG, dl); +} + /// getOnesVector - Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. @@ -5194,7 +4240,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); - SDValue Cst = DAG.getConstant(~0U, MVT::i32); + SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); SDValue Vec; if (VT.is256BitVector()) { if (HasInt256) { // AVX2 @@ -5212,16 +4258,6 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } -/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements -/// that point to V2 points to its first element. -static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) { - for (unsigned i = 0; i != NumElems; ++i) { - if (Mask[i] > (int)NumElems) { - Mask[i] = NumElems; - } - } -} - /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, @@ -5258,92 +4294,6 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by -// a generic shuffle instruction because the target has no such instructions. -// Generate shuffles which repeat i16 and i8 several times until they can be -// represented by v4f32 and then be manipulated by target suported shuffles. 
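// Illustrative sketch (not part of the patch): the Insert128BitVector change
// above turns an insert into the low half of a 256-bit vector into a blend
// whose immediate has one bit per element, so selecting the low 128 bits sets
// 128 / (scalar size) low bits. Standalone model, illustrative names only.
#include <cassert>

static unsigned lowHalfBlendMask(unsigned ScalarSizeInBits) {
  unsigned EltsInLow128 = 128 / ScalarSizeInBits;
  return (1u << EltsInLow128) - 1;
}

int main() {
  assert(lowHalfBlendMask(64) == 0x03); // vblendpd on v4f64
  assert(lowHalfBlendMask(32) == 0x0f); // vblendps / vpblendd on 8 x 32-bit
  return 0;
}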
-static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { - MVT VT = V.getSimpleValueType(); - int NumElems = VT.getVectorNumElements(); - SDLoc dl(V); - - while (NumElems > 4) { - if (EltNo < NumElems/2) { - V = getUnpackl(DAG, dl, VT, V, V); - } else { - V = getUnpackh(DAG, dl, VT, V, V); - EltNo -= NumElems/2; - } - NumElems >>= 1; - } - return V; -} - -/// getLegalSplat - Generate a legal splat with supported x86 shuffles -static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { - MVT VT = V.getSimpleValueType(); - SDLoc dl(V); - - if (VT.is128BitVector()) { - V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); - int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; - V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), - &SplatMask[0]); - } else if (VT.is256BitVector()) { - // To use VPERMILPS to splat scalars, the second half of indicies must - // refer to the higher part, which is a duplication of the lower one, - // because VPERMILPS can only handle in-lane permutations. - int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, - EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; - - V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); - V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), - &SplatMask[0]); - } else - llvm_unreachable("Vector size not supported"); - - return DAG.getNode(ISD::BITCAST, dl, VT, V); -} - -/// PromoteSplat - Splat is promoted to target supported vector shuffles. -static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { - MVT SrcVT = SV->getSimpleValueType(0); - SDValue V1 = SV->getOperand(0); - SDLoc dl(SV); - - int EltNo = SV->getSplatIndex(); - int NumElems = SrcVT.getVectorNumElements(); - bool Is256BitVec = SrcVT.is256BitVector(); - - assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) && - "Unknown how to promote splat for type"); - - // Extract the 128-bit part containing the splat element and update - // the splat element index when it refers to the higher register. - if (Is256BitVec) { - V1 = Extract128BitVector(V1, EltNo, DAG, dl); - if (EltNo >= NumElems/2) - EltNo -= NumElems/2; - } - - // All i16 and i8 vector types can't be used directly by a generic shuffle - // instruction because the target has no such instruction. Generate shuffles - // which repeat i16 and i8 several times until they fit in i32, and then can - // be manipulated by target suported shuffles. - MVT EltVT = SrcVT.getVectorElementType(); - if (EltVT == MVT::i8 || EltVT == MVT::i16) - V1 = PromoteSplati8i16(V1, DAG, EltNo); - - // Recreate the 256-bit vector and place the same 128-bit vector - // into the low and high part. This is necessary because we want - // to use VPERM* to shuffle the vectors - if (Is256BitVec) { - V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); - } - - return getLegalSplat(DAG, V1, EltNo); -} - /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified /// vector of zero or undef vector. 
This produces a shuffle where the low /// element of V2 is swizzled into the zero/undef vector, landing at element @@ -5467,7 +4417,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, return false; SDValue Ptr = MaskLoad->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper) + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); @@ -5489,16 +4440,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, IsUnary = true; break; case X86ISD::MOVSS: - case X86ISD::MOVSD: { - // The index 0 always comes from the first element of the second source, - // this is why MOVSS and MOVSD are used in the first place. The other - // elements come from the other positions of the first source vector - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) { - Mask.push_back(i); - } + case X86ISD::MOVSD: + DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); break; - } case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); @@ -5506,11 +4450,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); + IsUnary = true; break; case X86ISD::MOVSHDUP: DecodeMOVSHDUPMask(VT, Mask); + IsUnary = true; break; case X86ISD::MOVDDUP: + DecodeMOVDDUPMask(VT, Mask); + IsUnary = true; + break; case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: @@ -5594,148 +4543,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, return SDValue(); } -/// getNumOfConsecutiveZeros - Return the number of elements of a vector -/// shuffle operation which come from a consecutively from a zero. The -/// search can start in two different directions, from left or right. -/// We count undefs as zeros until PreferredNum is reached. -static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, - unsigned NumElems, bool ZerosFromLeft, - SelectionDAG &DAG, - unsigned PreferredNum = -1U) { - unsigned NumZeros = 0; - for (unsigned i = 0; i != NumElems; ++i) { - unsigned Index = ZerosFromLeft ? i : NumElems - i - 1; - SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); - if (!Elt.getNode()) - break; - - if (X86::isZeroNode(Elt)) - ++NumZeros; - else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. - NumZeros = std::min(NumZeros + 1, PreferredNum); - else - break; - } - - return NumZeros; -} - -/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) -/// correspond consecutively to elements from one of the vector operands, -/// starting from its index OpIdx. Also tell OpNum which source vector operand. -static -bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, - unsigned MaskI, unsigned MaskE, unsigned OpIdx, - unsigned NumElems, unsigned &OpNum) { - bool SeenV1 = false; - bool SeenV2 = false; - - for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { - int Idx = SVOp->getMaskElt(i); - // Ignore undef indicies - if (Idx < 0) - continue; - - if (Idx < (int)NumElems) - SeenV1 = true; - else - SeenV2 = true; - - // Only accept consecutive elements from the same vector - if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) - return false; - } - - OpNum = SeenV1 ? 0 : 1; - return true; -} - -/// isVectorShiftRight - Returns true if the shuffle can be implemented as a -/// logical left shift of a vector. 
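// Illustrative sketch (not part of the patch): the MOVSS/MOVSD case above now
// calls DecodeScalarMoveMask, but the mask it produces is the same one the
// removed loop built: lane 0 comes from the second source and the remaining
// lanes stay in place. Standalone model, illustrative names only.
#include <cassert>
#include <vector>

static std::vector<int> scalarMoveMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);           // element 0 of the second source
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);                // remaining lanes from the first source
  return Mask;
}

int main() {
  assert(scalarMoveMask(4) == (std::vector<int>{4, 1, 2, 3})); // MOVSS, v4f32
  assert(scalarMoveMask(2) == (std::vector<int>{2, 1}));       // MOVSD, v2f64
  return 0;
}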
-static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - unsigned NumElems = - SVOp->getSimpleValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros( - SVOp, NumElems, false /* check zeros from right */, DAG, - SVOp->getMaskElt(0)); - unsigned OpSrc; - - if (!NumZeros) - return false; - - // Considering the elements in the mask that are not consecutive zeros, - // check if they consecutively come from only one of the source vectors. - // - // V1 = {X, A, B, C} 0 - // \ \ \ / - // vector_shuffle V1, V2 <1, 2, 3, X> - // - if (!isShuffleMaskConsecutive(SVOp, - 0, // Mask Start Index - NumElems-NumZeros, // Mask End Index(exclusive) - NumZeros, // Where to start looking in the src vector - NumElems, // Number of elements in vector - OpSrc)) // Which source operand ? - return false; - - isLeft = false; - ShAmt = NumZeros; - ShVal = SVOp->getOperand(OpSrc); - return true; -} - -/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a -/// logical left shift of a vector. -static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - unsigned NumElems = - SVOp->getSimpleValueType(0).getVectorNumElements(); - unsigned NumZeros = getNumOfConsecutiveZeros( - SVOp, NumElems, true /* check zeros from left */, DAG, - NumElems - SVOp->getMaskElt(NumElems - 1) - 1); - unsigned OpSrc; - - if (!NumZeros) - return false; - - // Considering the elements in the mask that are not consecutive zeros, - // check if they consecutively come from only one of the source vectors. - // - // 0 { A, B, X, X } = V2 - // / \ / / - // vector_shuffle V1, V2 <X, X, 4, 5> - // - if (!isShuffleMaskConsecutive(SVOp, - NumZeros, // Mask Start Index - NumElems, // Mask End Index(exclusive) - 0, // Where to start looking in the src vector - NumElems, // Number of elements in vector - OpSrc)) // Which source operand ? - return false; - - isLeft = true; - ShAmt = NumZeros; - ShVal = SVOp->getOperand(OpSrc); - return true; -} - -/// isVectorShift - Returns true if the shuffle can be implemented as a -/// logical left or right shift of a vector. -static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, - bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { - // Although the logic below support any bitwidth size, there are no - // shift instructions which handle more than 128-bit vectors. - if (!SVOp->getSimpleValueType(0).is128BitVector()) - return false; - - if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || - isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) - return true; - - return false; -} - /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. /// static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, @@ -5749,6 +4556,29 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, SDLoc dl(Op); SDValue V; bool First = true; + + // SSE4.1 - use PINSRB to insert each byte directly. + if (Subtarget->hasSSE41()) { + for (unsigned i = 0; i < 16; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v16i8); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v16i8, V, Op.getOperand(i), + DAG.getIntPtrConstant(i, dl)); + } + } + + return V; + } + + // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. 
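// Illustrative sketch (not part of the patch): the pre-SSE4.1 loop that
// follows packs each pair of adjacent bytes into one 16-bit word (the odd
// byte in the high half, matching little-endian layout) so it can be inserted
// with PINSRW. Standalone model of that packing, illustrative names only.
#include <cassert>
#include <cstdint>

static uint16_t packBytePair(uint8_t EvenByte, uint8_t OddByte) {
  // Mirrors the zero-extend, shift-left-by-8 and OR sequence in that loop.
  return static_cast<uint16_t>(OddByte) << 8 | EvenByte;
}

int main() {
  // Bytes {0x34, 0x12} at indices {2*i, 2*i+1} become the word 0x1234.
  assert(packBytePair(0x34, 0x12) == 0x1234);
  return 0;
}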
for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { @@ -5769,7 +4599,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, - ThisElt, DAG.getConstant(8, MVT::i8)); + ThisElt, DAG.getConstant(8, dl, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else @@ -5777,7 +4607,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, if (ThisElt.getNode()) V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, - DAG.getIntPtrConstant(i/2)); + DAG.getIntPtrConstant(i/2, dl)); } } @@ -5809,7 +4639,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Op.getOperand(i), - DAG.getIntPtrConstant(i)); + DAG.getIntPtrConstant(i, dl)); } } @@ -5821,13 +4651,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { // Find all zeroable elements. - bool Zeroable[4]; + std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { SDValue Elt = Op->getOperand(i); Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); } - assert(std::count_if(&Zeroable[0], &Zeroable[4], - [](bool M) { return !M; }) > 1 && + assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either @@ -5913,31 +4742,29 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. - unsigned ZMask = 0; - for (int i = 0; i < 4; ++i) - if (Zeroable[i]) - ZMask |= 1 << i; + unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); - SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2, - DAG.getIntPtrConstant(InsertPSMask)); - return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result); + SDLoc DL(Op); + SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getIntPtrConstant(InsertPSMask, DL)); + return DAG.getNode(ISD::BITCAST, DL, VT, Result); } -/// getVShift - Return a vector logical shift node. -/// +/// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, SDLoc dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); - EVT ShVT = MVT::v2i64; + MVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? 
X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); + MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType()); + assert(NumBits % 8 == 0 && "Only support byte sized shifts"); + SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(Opc, dl, ShVT, SrcOp, - DAG.getConstant(NumBits, - TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); + DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue @@ -5992,9 +4819,11 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~(RequiredAlign-1); - if (StartOffset) - Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), - Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); + if (StartOffset) { + SDLoc DL(Ptr); + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(StartOffset, DL, Ptr.getValueType())); + } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); @@ -6004,9 +4833,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); - SmallVector<int, 8> Mask; - for (unsigned i = 0; i != NumElems; ++i) - Mask.push_back(EltNo); + SmallVector<int, 8> Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } @@ -6014,19 +4841,18 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { return SDValue(); } -/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a -/// vector of type 'VT', see if the elements can be replaced by a single large -/// load which has the same value as a build_vector whose operands are 'elts'. +/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the +/// elements can be replaced by a single large load which has the same value as +/// a build_vector or insert_subvector whose loaded operands are 'Elts'. /// /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a /// /// FIXME: we'd also like to handle the case where the last elements are zero /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. -static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, +static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { - EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); LoadSDNode *LDBase = nullptr; @@ -6037,7 +4863,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, // non-consecutive, bail out. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Elts[i]; - + // Look through a bitcast. + if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST) + Elt = Elt.getOperand(0); if (!Elt.getNode() || (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) return SDValue(); @@ -6052,7 +4880,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, continue; LoadSDNode *LD = cast<LoadSDNode>(Elt); - if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + EVT LdVT = Elt.getValueType(); + // Each loaded element must be the correct fractional portion of the + // requested vector load. 
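// Illustrative sketch (not part of the patch): getVShift above now passes
// NumBits / 8 because the whole-vector shift nodes (PSLLDQ / PSRLDQ style)
// take a *byte* count. A standalone byte-array model of that behaviour,
// illustrative names only.
#include <array>
#include <cassert>
#include <cstdint>

using V128 = std::array<uint8_t, 16>;

static V128 shiftBytes(const V128 &Src, unsigned Bytes, bool Left) {
  V128 Dst{};                          // shifted-in bytes are zero
  for (unsigned i = 0; i < 16; ++i) {
    int From = Left ? int(i) - int(Bytes) : int(i) + int(Bytes);
    if (From >= 0 && From < 16)
      Dst[i] = Src[From];
  }
  return Dst;
}

int main() {
  V128 V{};
  V[0] = 0xAA;
  // A 32-bit (4-byte) logical left shift moves byte 0 to byte 4.
  assert(shiftBytes(V, /*Bytes=*/32 / 8, /*Left=*/true)[4] == 0xAA);
  return 0;
}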
+ if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems) + return SDValue(); + if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i)) return SDValue(); LastLoadedElt = i; } @@ -6061,6 +4894,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. if (LastLoadedElt == NumElems - 1) { + assert(LDBase && "Did not find base load for merging consecutive loads"); + EVT EltVT = LDBase->getValueType(0); + // Ensure that the input vector size for the merged loads matches the + // cumulative size of the input elements. + if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) + return SDValue(); if (isAfterLegalize && !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) @@ -6087,6 +4926,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, //TODO: The code below fires only for for loading the low v2i32 / v2f32 //of a v4i32 / v4f32. It's probably worth generalizing. + EVT EltVT = VT.getVectorElementType(); if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); @@ -6212,8 +5052,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. const Function *F = DAG.getMachineFunction().getFunction(); - bool OptForSize = F->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -6377,95 +5216,117 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { unsigned Idx = InsertIndices[i]; NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), - DAG.getIntPtrConstant(Idx)); + DAG.getIntPtrConstant(Idx, DL)); } return NV; } +static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) { + assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && + Op.getScalarValueSizeInBits() == 1 && + "Can not convert non-constant vector"); + uint64_t Immediate = 0; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() != ISD::UNDEF) + Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + } + SDLoc dl(Op); + MVT VT = + MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); + return DAG.getConstant(Immediate, dl, VT); +} // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
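// Illustrative sketch (not part of the patch): ConvertI1VectorToInterger above
// packs a constant vXi1 build_vector into a single integer, one bit per
// element, widening the integer type to at least 8 bits. Standalone model of
// the bit packing, illustrative names only; -1 stands in for an undef lane.
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t packI1Constants(const std::vector<int> &Elts) {
  uint64_t Immediate = 0;
  for (unsigned Idx = 0; Idx < Elts.size(); ++Idx)
    if (Elts[Idx] >= 0)                  // undef lanes leave their bit clear
      Immediate |= uint64_t(Elts[Idx] & 1) << Idx;
  return Immediate;
}

int main() {
  // <1, 0, 1, 1, undef, 0, 0, 0> -> 0b00001101
  assert(packI1Constants({1, 0, 1, 1, -1, 0, 0, 0}) == 0x0D);
  return 0;
}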
SDValue X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); - assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && + assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) { - SDValue Cst = DAG.getTargetConstant(0, MVT::i1); + SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1); SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorAllOnes(Op.getNode())) { - SDValue Cst = DAG.getTargetConstant(1, MVT::i1); + SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1); SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } - bool AllContants = true; + if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + SDValue Imm = ConvertI1VectorToInterger(Op, DAG); + if (Imm.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getNode(ISD::BITCAST, dl, VT, Imm); + SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); + } + + // Vector has one or more non-const elements uint64_t Immediate = 0; - int NonConstIdx = -1; + SmallVector<unsigned, 16> NonConstIdx; bool IsSplat = true; - unsigned NumNonConsts = 0; - unsigned NumConsts = 0; + bool HasConstElts = false; + int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) continue; - if (!isa<ConstantSDNode>(In)) { - AllContants = false; - NonConstIdx = idx; - NumNonConsts++; - } else { - NumConsts++; - if (cast<ConstantSDNode>(In)->getZExtValue()) - Immediate |= (1ULL << idx); + if (!isa<ConstantSDNode>(In)) + NonConstIdx.push_back(idx); + else { + Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + HasConstElts = true; } - if (In != Op.getOperand(0)) + if (SplatIdx == -1) + SplatIdx = idx; + else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } - if (AllContants) { - SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, - DAG.getConstant(Immediate, MVT::i16)); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, - DAG.getIntPtrConstant(0)); + // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" + if (IsSplat) + return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); + + // insert elements one by one + SDValue DstVec; + SDValue Imm; + if (Immediate) { + MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); + Imm = DAG.getConstant(Immediate, dl, ImmVT); + } + else if (HasConstElts) + Imm = DAG.getConstant(0, dl, VT); + else + Imm = DAG.getUNDEF(VT); + if (Imm.getValueSizeInBits() == VT.getSizeInBits()) + DstVec = DAG.getNode(ISD::BITCAST, dl, VT, Imm); + else { + SDValue ExtVec = DAG.getNode(ISD::BITCAST, dl, MVT::v8i1, Imm); + DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); } - if (NumNonConsts == 1 && NonConstIdx != 0) { - SDValue DstVec; - if (NumConsts) { - SDValue VecAsImm = DAG.getConstant(Immediate, - MVT::getIntegerVT(VT.getSizeInBits())); - DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); - } - else - DstVec = DAG.getUNDEF(VT); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, - Op.getOperand(NonConstIdx), - 
DAG.getIntPtrConstant(NonConstIdx)); - } - if (!IsSplat && (NonConstIdx != 0)) - llvm_unreachable("Unsupported BUILD_VECTOR operation"); - MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; - SDValue Select; - if (IsSplat) - Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), - DAG.getConstant(-1, SelectVT), - DAG.getConstant(0, SelectVT)); - else - Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), - DAG.getConstant((Immediate | 1), SelectVT), - DAG.getConstant(Immediate, SelectVT)); - return DAG.getNode(ISD::BITCAST, dl, VT, Select); + for (unsigned i = 0; i < NonConstIdx.size(); ++i) { + unsigned InsertIdx = NonConstIdx[i]; + DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + Op.getOperand(InsertIdx), + DAG.getIntPtrConstant(InsertIdx, dl)); + } + return DstVec; } /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. /// -/// This is a helper function of PerformBUILD_VECTORCombine. +/// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// horizontal operation. Parameter \p Opcode defines the kind of horizontal /// operation to match. @@ -6528,11 +5389,17 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { - if (V0.getOpcode() == ISD::UNDEF) + if (V0.getOpcode() == ISD::UNDEF) { V0 = Op0.getOperand(0); + if (V0.getValueType() != VT) + return false; + } } else { - if (V1.getOpcode() == ISD::UNDEF) + if (V1.getOpcode() == ISD::UNDEF) { V1 = Op0.getOperand(0); + if (V1.getValueType() != VT) + return false; + } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } @@ -6556,7 +5423,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// -/// This is a helper function of PerformBUILD_VECTORCombine. +/// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two @@ -6622,12 +5489,16 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } -/// \brief Try to fold a build_vector that performs an 'addsub' into the -/// sequence of 'vadd + vsub + blendi'. -static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - SDLoc DL(BV); +/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB +/// node. +static SDValue LowerToAddSub(const BuildVectorSDNode *BV, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { EVT VT = BV->getValueType(0); + if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + return SDValue(); + + SDLoc DL(BV); unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); @@ -6644,7 +5515,7 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, bool AddFound = false; bool SubFound = false; - for (unsigned i = 0, e = NumElts; i != e; i++) { + for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. 
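// Illustrative sketch (not part of the patch): isHorizontalBinOp above matches
// a build_vector whose lanes are each the sum (or difference) of an adjacent
// pair of source elements, the pattern that maps to HADD/HSUB. A standalone
// model of 128-bit haddps-style semantics, illustrative names only.
#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

static V4 horizontalAdd(const V4 &A, const V4 &B) {
  // Low two lanes come from pairs of A, high two lanes from pairs of B.
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  V4 R = horizontalAdd({1, 2, 3, 4}, {10, 20, 30, 40});
  assert(R[0] == 3 && R[1] == 7 && R[2] == 30 && R[3] == 70);
  return 0;
}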
@@ -6682,10 +5553,16 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, SubFound = true; // Update InVec0 and InVec1. - if (InVec0.getOpcode() == ISD::UNDEF) + if (InVec0.getOpcode() == ISD::UNDEF) { InVec0 = Op0.getOperand(0); - if (InVec1.getOpcode() == ISD::UNDEF) + if (InVec0.getValueType() != VT) + return SDValue(); + } + if (InVec1.getOpcode() == ISD::UNDEF) { InVec1 = Op1.getOperand(0); + if (InVec1.getValueType() != VT) + return SDValue(); + } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. @@ -6715,23 +5592,12 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - SDLoc DL(N); - EVT VT = N->getValueType(0); +/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. +static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + EVT VT = BV->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); - BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N); - SDValue InVec0, InVec1; - - // Try to match an ADDSUB. - if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) { - SDValue Value = matchAddSub(BV, DAG, Subtarget); - if (Value.getNode()) - return Value; - } - - // Try to match horizontal ADD/SUB. unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; unsigned Half = NumElts/2; @@ -6750,6 +5616,8 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) return SDValue(); + SDLoc DL(BV); + SDValue InVec0, InVec1; if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { // Try to match an SSE3 float HADD/HSUB. if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) @@ -6894,8 +5762,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); } - SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); - if (Broadcast.getNode()) + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); + if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) + return AddSub; + if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) + return HorizontalOp; + if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); @@ -6941,32 +5813,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); EVT VecVT = MVT::v4i32; - unsigned VecElts = 4; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); - - // If using the new shuffle lowering, just directly insert this. - if (ExperimentalVectorShuffleLowering) - return DAG.getNode( - ISD::BITCAST, dl, VT, - getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); - - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); - - // Now we have our 32-bit value zero extended in the low element of - // a vector. If Idx != 0, swizzle it into place. 
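// Illustrative sketch (not part of the patch): LowerToAddSub above recognizes
// a build_vector where even lanes subtract and odd lanes add the matching
// elements of two inputs, which is what the addsubps/addsubpd-style ADDSUB
// node computes. Standalone model, illustrative names only.
#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

static V4 addSub(const V4 &A, const V4 &B) {
  return {A[0] - B[0], A[1] + B[1], A[2] - B[2], A[3] + B[3]};
}

int main() {
  V4 R = addSub({5, 5, 5, 5}, {1, 2, 3, 4});
  assert(R[0] == 4 && R[1] == 7 && R[2] == 2 && R[3] == 9);
  return 0;
}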
- if (Idx != 0) { - SmallVector<int, 4> Mask; - Mask.push_back(Idx); - for (unsigned i = 1; i != VecElts; ++i) - Mask.push_back(i); - Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), - &Mask[0]); - } - return DAG.getNode(ISD::BITCAST, dl, VT, Item); + return DAG.getNode( + ISD::BITCAST, dl, VT, + getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); } } @@ -6980,25 +5834,36 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { - if (VT.is256BitVector() || VT.is512BitVector()) { + if (VT.is512BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, - Item, DAG.getIntPtrConstant(0)); + Item, DAG.getIntPtrConstant(0, dl)); } - assert(VT.is128BitVector() && "Expected an SSE value type!"); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } + // We can't directly insert an i8 or i16 into a vector, so zero extend + // it to i32 first. if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); if (VT.is256BitVector()) { - SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); - Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); + if (Subtarget->hasAVX()) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } else { + // Without AVX, we need to extend to a 128-bit vector and then + // insert into the 256-bit vector. + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); + } } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getNode(ISD::BITCAST, dl, VT, Item); @@ -7026,17 +5891,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - - // If using the new shuffle lowering, just directly insert this. - if (ExperimentalVectorShuffleLowering) - return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); - - // Turn it into a shuffle of zero and zero-extended scalar to vector. - Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); - SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i != NumElems; ++i) - MaskVec.push_back(i == Idx ? 0 : 1); - return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } @@ -7064,9 +5919,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // elements, otherwise build the individual 128-bit pieces and use // shuffles to put them in place. 
if (VT.is256BitVector() || VT.is512BitVector()) { - SmallVector<SDValue, 64> V; - for (unsigned i = 0; i != NumElems; ++i) - V.push_back(Op.getOperand(i)); + SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems); // Check for a build vector of consecutive loads. if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) @@ -7099,24 +5952,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } // If element VT is < 32 bits, convert it to inserts into a zero vector. - if (EVTBits == 8 && NumElems == 16) { - SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this); - if (V.getNode()) return V; - } + if (EVTBits == 8 && NumElems == 16) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, + Subtarget, *this)) + return V; - if (EVTBits == 16 && NumElems == 8) { - SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this); - if (V.getNode()) return V; - } + if (EVTBits == 16 && NumElems == 8) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, + Subtarget, *this)) + return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS - if (EVTBits == 32 && NumElems == 4) { - SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this); - if (V.getNode()) + if (EVTBits == 32 && NumElems == 4) + if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) return V; - } // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector<SDValue, 8> V(NumElems); @@ -7164,17 +6013,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { V[i] = Op.getOperand(i); // Check for elements which are consecutive loads. - SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false); - if (LD.getNode()) + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; // Check for a build vector from mostly shuffle plus few inserting. - SDValue Sh = buildFromShuffleMostly(Op, DAG); - if (Sh.getNode()) + if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. 
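// Illustrative sketch (not part of the patch): the insertps path mentioned
// above uses the same immediate layout as the InsertPSMask built earlier in
// this patch (EltMaskIdx << 6 | EltIdx << 4 | ZMask): bits 7:6 select the
// source element, bits 5:4 the destination lane, and bits 3:0 zero out
// destination lanes. Standalone encoder, illustrative names only.
#include <cassert>

static unsigned insertPSImm(unsigned SrcElt, unsigned DstLane, unsigned ZMask) {
  assert(SrcElt < 4 && DstLane < 4 && ZMask < 16 && "out-of-range fields");
  return SrcElt << 6 | DstLane << 4 | ZMask;
}

int main() {
  // Take element 2 of the source, place it in lane 1, zero lane 3.
  assert(insertPSImm(2, 1, 0x8) == 0x98);
  return 0;
}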
- if (getSubtarget()->hasSSE41()) { + if (Subtarget->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -7184,7 +6031,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, - Op.getOperand(i), DAG.getIntPtrConstant(i)); + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } @@ -7236,7 +6083,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); - if(ResVT.is256BitVector()) + if (ResVT.is256BitVector()) return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { @@ -7250,8 +6097,64 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } -static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType(); +static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG & DAG) { + SDLoc dl(Op); + MVT ResVT = Op.getSimpleValueType(); + unsigned NumOfOperands = Op.getNumOperands(); + + assert(isPowerOf2_32(NumOfOperands) && + "Unexpected number of operands in CONCAT_VECTORS"); + + if (NumOfOperands > 2) { + MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), + ResVT.getVectorNumElements()/2); + SmallVector<SDValue, 2> Ops; + for (unsigned i = 0; i < NumOfOperands/2; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + Ops.clear(); + for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); + } + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); + + if (IsZeroV1 && IsZeroV2) + return getZeroVector(ResVT, Subtarget, DAG, dl); + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(ResVT); + unsigned NumElems = ResVT.getVectorNumElements(); + SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); + + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx); + V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits); + if (IsZeroV1) + return V2; + + V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + // Zero the upper bits of V1 + V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits); + V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits); + if (IsZeroV2) + return V1; + return DAG.getNode(ISD::OR, dl, ResVT, V1, V2); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getVectorElementType() == MVT::i1) + return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); + assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); @@ -7354,38 +6257,40 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, return true; } -// Hide this symbol with an anonymous 
namespace instead of 'static' so that MSVC -// 2013 will allow us to use it as a non-type template parameter. -namespace { - -/// \brief Implementation of the \c isShuffleEquivalent variadic functor. -/// -/// See its documentation for details. -bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) { - if (Mask.size() != Args.size()) - return false; - for (int i = 0, e = Mask.size(); i < e; ++i) { - assert(*Args[i] >= 0 && "Arguments must be positive integers!"); - if (Mask[i] != -1 && Mask[i] != *Args[i]) - return false; - } - return true; -} - -} // namespace - /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// -/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... } +/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. -static const VariadicFunction1< - bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {}; +static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, + ArrayRef<int> ExpectedMask) { + if (Mask.size() != ExpectedMask.size()) + return false; + + int Size = Mask.size(); + + // If the values are build vectors, we can look through them to find + // equivalent inputs that make the shuffles equivalent. + auto *BV1 = dyn_cast<BuildVectorSDNode>(V1); + auto *BV2 = dyn_cast<BuildVectorSDNode>(V2); + + for (int i = 0; i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) { + auto *MaskBV = Mask[i] < Size ? BV1 : BV2; + auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; + if (!MaskBV || !ExpectedBV || + MaskBV->getOperand(Mask[i] % Size) != + ExpectedBV->getOperand(ExpectedMask[i] % Size)) + return false; + } + + return true; +} /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// @@ -7395,7 +6300,7 @@ static const VariadicFunction1< /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. -static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, SelectionDAG &DAG) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); @@ -7408,7 +6313,39 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; - return DAG.getConstant(Imm, MVT::i8); + return DAG.getConstant(Imm, DL, MVT::i8); +} + +/// \brief Try to emit a blend instruction for a shuffle using bit math. +/// +/// This is used as a fallback approach when first class blend instructions are +/// unavailable. Currently it is only suitable for integer vectors, but could +/// be generalized for floating point vectors if desirable. 
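// Illustrative sketch (not part of the patch): getV4X86ShuffleImm8ForMask
// above encodes a 4-lane mask into the 2-bits-per-lane immediate used by
// pshufd/shufps-style instructions, mapping undef (-1) lanes to their identity
// index. Standalone model, illustrative names only.
#include <array>
#include <cassert>

static unsigned v4ShuffleImm8(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = Mask[i] == -1 ? i : Mask[i];   // undef keeps the input lane
    Imm |= unsigned(M) << (2 * i);
  }
  return Imm;
}

int main() {
  assert(v4ShuffleImm8({3, 2, 1, 0}) == 0x1B);   // full reverse
  assert(v4ShuffleImm8({0, -1, 2, -1}) == 0xE4); // undefs fall back to identity
  return 0;
}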
+static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.isInteger() && "Only supports integer vector types!"); + MVT EltVT = VT.getScalarType(); + int NumEltBits = EltVT.getSizeInBits(); + SDValue Zero = DAG.getConstant(0, DL, EltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + EltVT); + SmallVector<SDValue, 16> MaskOps; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size) + return SDValue(); // Shuffled input! + MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); + } + + SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps); + V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); + // We have to cast V2 around. + MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + V2 = DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::ANDNP, DL, MaskVT, + DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask), + DAG.getNode(ISD::BITCAST, DL, MaskVT, V2))); + return DAG.getNode(ISD::OR, DL, VT, V1, V2); } /// \brief Try to emit a blend instruction for a shuffle. @@ -7421,7 +6358,6 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= Size) { @@ -7439,7 +6375,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, case MVT::v4f64: case MVT::v8f32: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, - DAG.getConstant(BlendMask, MVT::i8)); + DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: @@ -7463,7 +6399,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, - DAG.getConstant(BlendMask, MVT::i8))); + DAG.getConstant(BlendMask, DL, MVT::i8))); } // FALLTHROUGH case MVT::v8i16: { @@ -7480,7 +6416,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, - DAG.getConstant(BlendMask, MVT::i8))); + DAG.getConstant(BlendMask, DL, MVT::i8))); } case MVT::v16i16: { @@ -7494,15 +6430,21 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, if (RepeatedMask[i] >= 16) BlendMask |= 1u << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(BlendMask, MVT::i8)); + DAG.getConstant(BlendMask, DL, MVT::i8)); } } // FALLTHROUGH + case MVT::v16i8: case MVT::v32i8: { - assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && + "256-bit byte-blends require AVX2 support!"); + // Scale the blend by the number of bytes per element. - int Scale = VT.getScalarSizeInBits() / 8; - assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); + int Scale = VT.getScalarSizeInBits() / 8; + + // This form of blend is always done on bytes. Compute the byte vector + // type. + MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. 
We tell the code @@ -7515,19 +6457,20 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. - SDValue VSELECTMask[32]; + SmallVector<SDValue, 32> VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) - VSELECTMask[Scale * i + j] = + VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) - : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8); + : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, + MVT::i8)); - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); return DAG.getNode( ISD::BITCAST, DL, VT, - DAG.getNode(ISD::VSELECT, DL, MVT::v32i8, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask), + DAG.getNode(ISD::VSELECT, DL, BlendVT, + DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask), V1, V2)); } @@ -7536,12 +6479,45 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, } } -/// \brief Generic routine to lower a shuffle and blend as a decomposed set of -/// unblended shuffles followed by an unshuffled blend. +/// \brief Try to lower as a blend of elements from two inputs followed by +/// a single-input permutation. +/// +/// This matches the pattern where we can blend elements from two inputs and +/// then reduce the shuffle to a single-input permutation. +static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // We build up the blend mask while checking whether a blend is a viable way + // to reduce the shuffle. + SmallVector<int, 32> BlendMask(Mask.size(), -1); + SmallVector<int, 32> PermuteMask(Mask.size(), -1); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] < 0) + continue; + + assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); + + if (BlendMask[Mask[i] % Size] == -1) + BlendMask[Mask[i] % Size] = Mask[i]; + else if (BlendMask[Mask[i] % Size] != Mask[i]) + return SDValue(); // Can't blend in the needed input! + + PermuteMask[i] = Mask[i] % Size; + } + + SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); +} + +/// \brief Generic routine to decompose a shuffle and blend into indepndent +/// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend -/// operations. +/// operations. It will try to pick the best arrangement of shuffles and +/// blends. static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, @@ -7561,6 +6537,16 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, BlendMask[i] = i + Size; } + // Try to lower with the simpler initial blend strategy unless one of the + // input shuffles would be a no-op. We prefer to shuffle inputs as the + // shuffle may be able to fold with a load or other benefit. However, when + // we'll have to do 2x as many shuffles in order to achieve this, blending + // first is a better strategy. 
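// Illustrative sketch (not part of the patch): lowerVectorShuffleAsBlendAndPermute
// above splits a two-input shuffle mask into a per-position blend mask plus a
// single-input permute mask, bailing out when both inputs need the same
// position. Standalone model of the decomposition, illustrative names only.
#include <cassert>
#include <vector>

static bool splitBlendPermute(const std::vector<int> &Mask,
                              std::vector<int> &BlendMask,
                              std::vector<int> &PermuteMask) {
  int Size = Mask.size();
  BlendMask.assign(Size, -1);
  PermuteMask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;
    if (BlendMask[Slot] != -1 && BlendMask[Slot] != Mask[i])
      return false;                    // both inputs want this position
    BlendMask[Slot] = Mask[i];
    PermuteMask[i] = Slot;
  }
  return true;
}

int main() {
  std::vector<int> Blend, Permute;
  // <1, 4, 3, 6> becomes blend <4, 1, 6, 3> followed by permute <1, 0, 3, 2>.
  assert(splitBlendPermute({1, 4, 3, 6}, Blend, Permute));
  assert(Blend == (std::vector<int>{4, 1, 6, 3}));
  assert(Permute == (std::vector<int>{1, 0, 3, 2}));
  return 0;
}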
+ if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) + if (SDValue BlendPerm = + lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + return BlendPerm; + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); @@ -7582,8 +6568,6 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -/// -/// Note that this only handles 128-bit vector widths currently. static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, @@ -7591,6 +6575,10 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + int NumElts = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; + int NumLaneElts = NumElts / NumLanes; + // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] @@ -7600,44 +6588,52 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] == -1) - continue; - assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!"); + for (int l = 0; l < NumElts; l += NumLaneElts) { + for (int i = 0; i < NumLaneElts; ++i) { + if (Mask[l + i] == -1) + continue; + assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!"); - // Based on the mod-Size value of this mask element determine where - // a rotated vector would have started. - int StartIdx = i - (Mask[i] % Size); - if (StartIdx == 0) - // The identity rotation isn't interesting, stop. - return SDValue(); + // Get the mod-Size index and lane correct it. + int LaneIdx = (Mask[l + i] % NumElts) - l; + // Make sure it was in this lane. + if (LaneIdx < 0 || LaneIdx >= NumLaneElts) + return SDValue(); - // If we found the tail of a vector the rotation must be the missing - // front. If we found the head of a vector, it must be how much of the head. - int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx; + // Determine where a rotated vector would have started. + int StartIdx = i - LaneIdx; + if (StartIdx == 0) + // The identity rotation isn't interesting, stop. + return SDValue(); - if (Rotation == 0) - Rotation = CandidateRotation; - else if (Rotation != CandidateRotation) - // The rotations don't match, so we can't match this mask. - return SDValue(); + // If we found the tail of a vector the rotation must be the missing + // front. If we found the head of a vector, it must be how much of the + // head. + int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx; - // Compute which value this mask is pointing at. - SDValue MaskV = Mask[i] < Size ? V1 : V2; - - // Compute which of the two target values this index should be assigned to. - // This reflects whether the high elements are remaining or the low elements - // are remaining. - SDValue &TargetV = StartIdx < 0 ? Hi : Lo; - - // Either set up this value if we've not encountered it before, or check - // that it remains consistent. 
- if (!TargetV) - TargetV = MaskV; - else if (TargetV != MaskV) - // This may be a rotation, but it pulls from the inputs in some - // unsupported interleaving. - return SDValue(); + if (Rotation == 0) + Rotation = CandidateRotation; + else if (Rotation != CandidateRotation) + // The rotations don't match, so we can't match this mask. + return SDValue(); + + // Compute which value this mask is pointing at. + SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2; + + // Compute which of the two target values this index should be assigned + // to. This reflects whether the high elements are remaining or the low + // elements are remaining. + SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + + // Either set up this value if we've not encountered it before, or check + // that it remains consistent. + if (!TargetV) + TargetV = MaskV; + else if (TargetV != MaskV) + // This may be a rotation, but it pulls from the inputs in some + // unsupported interleaving. + return SDValue(); + } } // Check that we successfully analyzed the mask, and normalize the results. @@ -7648,26 +6644,28 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, else if (!Hi) Hi = Lo; - assert(VT.getSizeInBits() == 128 && - "Rotate-based lowering only supports 128-bit lowering!"); - assert(Mask.size() <= 16 && - "Can shuffle at most 16 bytes in a 128-bit vector!"); - // The actual rotate instruction rotates bytes, so we need to scale the - // rotation based on how many bytes are in the vector. - int Scale = 16 / Mask.size(); + // rotation based on how many bytes are in the vector lane. + int Scale = 16 / NumLaneElts; - // SSSE3 targets can use the palignr instruction + // SSSE3 targets can use the palignr instruction. if (Subtarget->hasSSSE3()) { - // Cast the inputs to v16i8 to match PALIGNR. - Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo); - Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi); + // Cast the inputs to i8 vector of correct length to match PALIGNR. 
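// Illustrative sketch (not part of the patch): the SSSE3 path that follows
// emits PALIGNR(Hi, Lo, Rotation * Scale); its effect on bytes is to take the
// 32-byte concatenation [Lo bytes 0..15, Hi bytes 16..31] and keep bytes
// imm..imm+15. Standalone byte-level model, illustrative names only.
#include <array>
#include <cassert>
#include <cstdint>

using V128 = std::array<uint8_t, 16>;

static V128 palignr(const V128 &Hi, const V128 &Lo, unsigned Imm) {
  V128 Dst{};
  for (unsigned i = 0; i < 16; ++i) {
    unsigned From = i + Imm;
    if (From < 16)
      Dst[i] = Lo[From];
    else if (From < 32)
      Dst[i] = Hi[From - 16];
    // bytes past the concatenation are zero
  }
  return Dst;
}

int main() {
  V128 Lo{}, Hi{};
  Lo[15] = 1;
  Hi[0] = 2;
  V128 R = palignr(Hi, Lo, /*Imm=*/4);
  assert(R[11] == 1 && R[12] == 2);  // Lo[15] lands at 11, Hi[0] at 12
  return 0;
}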
+ MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); + Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo); + Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi); return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo, - DAG.getConstant(Rotation * Scale, MVT::i8))); + DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo, + DAG.getConstant(Rotation * Scale, DL, + MVT::i8))); } + assert(VT.getSizeInBits() == 128 && + "Rotate-based lowering only supports 128-bit lowering!"); + assert(Mask.size() <= 16 && + "Can shuffle at most 16 bytes in a 128-bit vector!"); + // Default SSE2 implementation int LoByteShift = 16 - Rotation * Scale; int HiByteShift = Rotation * Scale; @@ -7677,9 +6675,9 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, - DAG.getConstant(8 * LoByteShift, MVT::i8)); + DAG.getConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, - DAG.getConstant(8 * HiByteShift, MVT::i8)); + DAG.getConstant(HiByteShift, DL, MVT::i8)); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } @@ -7696,6 +6694,11 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1, SDValue V2) { SmallBitVector Zeroable(Mask.size(), false); + while (V1.getOpcode() == ISD::BITCAST) + V1 = V1->getOperand(0); + while (V2.getOpcode() == ISD::BITCAST) + V2 = V2->getOperand(0); + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); @@ -7707,10 +6710,10 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, continue; } - // If this is an index into a build_vector node, dig out the input value and - // use it. + // If this is an index into a build_vector node (which has the same number + // of elements), dig out the input value and use it. SDValue V = M < Size ? V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR) + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) continue; SDValue Input = V.getOperand(M % Size); @@ -7723,78 +6726,135 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, return Zeroable; } -/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros). -/// -/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2 -/// byte-shift instructions. The mask must consist of a shifted sequential -/// shuffle from one of the input vectors and zeroable elements for the -/// remaining 'shifted in' elements. +/// \brief Try to emit a bitmask instruction for a shuffle. /// -/// Note that this only handles 128-bit vector widths currently. -static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { - assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); +/// This handles cases where we can model a blend exactly as a bitmask due to +/// one of the inputs being zeroable. 
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + MVT EltVT = VT.getScalarType(); + int NumEltBits = EltVT.getSizeInBits(); + MVT IntEltVT = MVT::getIntegerVT(NumEltBits); + SDValue Zero = DAG.getConstant(0, DL, IntEltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + IntEltVT); + if (EltVT.isFloatingPoint()) { + Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero); + AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Zeroable[i]) + continue; + if (Mask[i] % Size != i) + return SDValue(); // Not a blend. + if (!V) + V = Mask[i] < Size ? V1 : V2; + else if (V != (Mask[i] < Size ? V1 : V2)) + return SDValue(); // Can only let one input through the mask. + + VMaskOps[i] = AllOnes; + } + if (!V) + return SDValue(); // No non-zeroable elements! + + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + V = DAG.getNode(VT.isFloatingPoint() + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, + DL, VT, V, VMask); + return V; +} +/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). +/// +/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and +/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function +/// matches elements from one of the input vectors shuffled to the left or +/// right with zeroable elements 'shifted in'. It handles both the strictly +/// bit-wise element shifts and the byte shift across an entire 128-bit double +/// quad word lane. +/// +/// PSHL : (little-endian) left bit shift. +/// [ zz, 0, zz, 2 ] +/// [ -1, 4, zz, -1 ] +/// PSRL : (little-endian) right bit shift. +/// [ 1, zz, 3, zz] +/// [ -1, -1, 7, zz] +/// PSLLDQ : (little-endian) left byte shift +/// [ zz, 0, 1, 2, 3, 4, 5, 6] +/// [ zz, zz, -1, -1, 2, 3, 4, -1] +/// [ zz, zz, zz, zz, zz, zz, -1, 1] +/// PSRLDQ : (little-endian) right byte shift +/// [ 5, 6, 7, zz, zz, zz, zz, zz] +/// [ -1, 5, 6, 7, zz, zz, zz, zz] +/// [ 1, 2, -1, -1, -1, -1, zz, zz] +static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Size = Mask.size(); - int Scale = 16 / Size; + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); - for (int Shift = 1; Shift < Size; Shift++) { - int ByteShift = Shift * Scale; - - // PSRLDQ : (little-endian) right byte shift - // [ 5, 6, 7, zz, zz, zz, zz, zz] - // [ -1, 5, 6, 7, zz, zz, zz, zz] - // [ 1, 2, -1, -1, -1, -1, zz, zz] - bool ZeroableRight = true; - for (int i = Size - Shift; i < Size; i++) { - ZeroableRight &= Zeroable[i]; - } - - if (ZeroableRight) { - bool ValidShiftRight1 = - isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift); - bool ValidShiftRight2 = - isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift); - - if (ValidShiftRight1 || ValidShiftRight2) { - // Cast the inputs to v2i64 to match PSRLDQ. - SDValue &TargetV = ValidShiftRight1 ? 
V1 : V2; - SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); - SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V, - DAG.getConstant(ByteShift * 8, MVT::i8)); - return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); - } - } + auto CheckZeros = [&](int Shift, int Scale, bool Left) { + for (int i = 0; i < Size; i += Scale) + for (int j = 0; j < Shift; ++j) + if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) + return false; - // PSLLDQ : (little-endian) left byte shift - // [ zz, 0, 1, 2, 3, 4, 5, 6] - // [ zz, zz, -1, -1, 2, 3, 4, -1] - // [ zz, zz, zz, zz, zz, zz, -1, 1] - bool ZeroableLeft = true; - for (int i = 0; i < Shift; i++) { - ZeroableLeft &= Zeroable[i]; - } - - if (ZeroableLeft) { - bool ValidShiftLeft1 = - isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0); - bool ValidShiftLeft2 = - isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size); - - if (ValidShiftLeft1 || ValidShiftLeft2) { - // Cast the inputs to v2i64 to match PSLLDQ. - SDValue &TargetV = ValidShiftLeft1 ? V1 : V2; - SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); - SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V, - DAG.getConstant(ByteShift * 8, MVT::i8)); - return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); - } + return true; + }; + + auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { + for (int i = 0; i != Size; i += Scale) { + unsigned Pos = Left ? i + Shift : i; + unsigned Low = Left ? i : i + Shift; + unsigned Len = Scale - Shift; + if (!isSequentialOrUndefInRange(Mask, Pos, Len, + Low + (V == V1 ? 0 : Size))) + return SDValue(); } - } + int ShiftEltBits = VT.getScalarSizeInBits() * Scale; + bool ByteShift = ShiftEltBits > 64; + unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) + : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); + int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); + + // Normalize the scale for byte shifts to still produce an i64 element + // type. + Scale = ByteShift ? Scale / 2 : Scale; + + // We need to round trip through the appropriate type for the shift. + MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); + MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); + assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && + "Illegal integer vector type"); + V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V); + + V = DAG.getNode(OpCode, DL, ShiftVT, V, + DAG.getConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, V); + }; + + // SSE/AVX supports logical shifts up to 64-bit integers - so we can just + // keep doubling the size of the integer elements up to that. We can + // then shift the elements of the integer vector by whole multiples of + // their width within the elements of the larger integer vector. Test each + // multiple to see if we can find a match with the moved element indices + // and that the shifted in elements are all zeroable. + for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) + for (int Shift = 1; Shift != Scale; ++Shift) + for (bool Left : {true, false}) + if (CheckZeros(Shift, Scale, Left)) + for (SDValue V : {V1, V2}) + if (SDValue Match = MatchShift(Shift, Scale, Left, V)) + return Match; + + // no match return SDValue(); } @@ -7804,10 +6864,11 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, /// stride, produce either a zero or any extension based on the available /// features of the subtarget. 
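To make the CheckZeros/MatchShift pairing above more concrete, here is a hedged single-input, element-granularity sketch (plain C++; matchElementShift and its explicit Zeroable vector are illustrative names, and the per-Scale element widening of the real code is omitted):

#include <cstdio>
#include <vector>

// Illustrative only: returns a positive amount for a zero-filling right shift,
// a negative amount for a left shift, or 0 if no such shift reproduces Mask.
static int matchElementShift(const std::vector<int> &Mask,
                             const std::vector<bool> &Zeroable) {
  const int Size = static_cast<int>(Mask.size());
  for (int Shift = 1; Shift < Size; ++Shift) {
    // Right shift: low elements are Mask[i] == i + Shift, top Shift lanes zero.
    bool Right = true;
    for (int i = 0; i < Size - Shift; ++i)
      Right = Right && (Mask[i] < 0 || Mask[i] == i + Shift);
    for (int i = Size - Shift; i < Size; ++i)
      Right = Right && Zeroable[i];
    if (Right)
      return Shift;

    // Left shift: bottom Shift lanes zero, the rest are Mask[i] == i - Shift.
    bool Left = true;
    for (int i = 0; i < Shift; ++i)
      Left = Left && Zeroable[i];
    for (int i = Shift; i < Size; ++i)
      Left = Left && (Mask[i] < 0 || Mask[i] == i - Shift);
    if (Left)
      return -Shift;
  }
  return 0;
}

int main() {
  // [ 5, 6, 7, zz, zz, zz, zz, zz ] from the PSRLDQ comment above.
  std::vector<int> Mask = {5, 6, 7, -1, -1, -1, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, true, true, true, true, true};
  std::printf("shift = %d\n", matchElementShift(Mask, Zeroable));  // prints "shift = 5"
}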
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( - SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV, + SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); - int EltBits = VT.getSizeInBits() / NumElements; + int NumElements = VT.getVectorNumElements(); + int EltBits = VT.getScalarSizeInBits(); assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); @@ -7815,10 +6876,8 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget->hasSSE41()) { - MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); - InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV)); } @@ -7831,19 +6890,19 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {0, -1, 0, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)); + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFHWMask[4] = {1, -1, -1, -1}; return DAG.getNode( ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV), - getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG))); + getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG))); } // If this would require more than 2 unpack instructions to expand, use @@ -7854,7 +6913,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) PSHUFBMask[i] = - DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8); + DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8); InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, @@ -7876,7 +6935,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( return DAG.getNode(ISD::BITCAST, DL, VT, InputV); } -/// \brief Try to lower a vector shuffle as a zero extension on any micrarch. +/// \brief Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't @@ -7894,7 +6953,10 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); - int NumElements = Mask.size(); + int NumElements = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() <= 32 && + "Exceeds 32-bit integer zero extension limit"); + assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. 
@@ -7905,11 +6967,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (Mask[i] == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { - // Each of the extend elements needs to be zeroable. + // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); - // We no lorger are in the anyext case. + // We no longer are in the anyext case. AnyExt = false; continue; } @@ -7923,7 +6985,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( return SDValue(); // Flip-flopping inputs. if (Mask[i] % NumElements != i / Scale) - return SDValue(); // Non-consecutive strided elemenst. + return SDValue(); // Non-consecutive strided elements. } // If we fail to find an input, we have a zero-shuffle which should always @@ -7933,7 +6995,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG); + DL, VT, Scale, AnyExt, InputV, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -7945,11 +7007,34 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && - "The input vector size must be divisble by the extended size."); + "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } + // General extends failed, but 128-bit vectors may be able to use MOVQ. + if (Bits != 128) + return SDValue(); + + // Returns one of the source operands if the shuffle can be reduced to a + // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. + auto CanZExtLowHalf = [&]() { + for (int i = NumElements / 2; i != NumElements; ++i) + if (!Zeroable[i]) + return SDValue(); + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) + return V1; + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) + return V2; + return SDValue(); + }; + + if (SDValue V = CanZExtLowHalf()) { + V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); + return DAG.getNode(ISD::BITCAST, DL, VT, V); + } + // No viable ext lowering found. return SDValue(); } @@ -7970,8 +7055,13 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx, return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || - (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) - return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx)); + (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { + // Ensure the scalar operand is the same size as the destination. + // FIXME: Add support for scalar truncation where possible. + SDValue S = V.getOperand(Idx); + if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) + return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S); + } return SDValue(); } @@ -7992,7 +7082,7 @@ static bool isShuffleFoldableLoad(SDValue V) { /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. 
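The per-scale zero-extension check exercised above can also be modeled in isolation. A hedged sketch (plain C++ with a std::vector mask, -1 for undef, an explicit Zeroable vector, and no which-input consistency check, unlike the real lambda):

#include <cstdio>
#include <vector>

// Illustrative only: smallest power-of-two Scale for which Mask looks like a
// zero/any-extension of the strided low elements of one input.
static int matchZExtScale(const std::vector<int> &Mask,
                          const std::vector<bool> &Zeroable) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int Scale = 2; Scale <= NumElts; Scale *= 2) {
    bool Ok = true;
    for (int i = 0; i < NumElts && Ok; ++i) {
      if (Mask[i] < 0)
        continue;                               // Undef fits anywhere.
      if (i % Scale != 0)
        Ok = Zeroable[i];                       // Stride gaps must be zeroable.
      else
        Ok = (Mask[i] % NumElts) == i / Scale;  // Strided low elements in order.
    }
    if (Ok)
      return Scale;
  }
  return 0;
}

int main() {
  // A v8i16-style mask extending the low four elements: 0 zz 1 zz 2 zz 3 zz.
  std::vector<int> Mask = {0, -1, 1, -1, 2, -1, 3, -1};
  std::vector<bool> Zeroable = {false, true, false, true, false, true, false, true};
  std::printf("scale = %d\n", matchZExtScale(Mask, Zeroable));  // prints "scale = 2"
}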
static SDValue lowerVectorShuffleAsElementInsertion( - MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); MVT ExtVT = VT; @@ -8059,6 +7149,10 @@ static SDValue lowerVectorShuffleAsElementInsertion( ExtVT, V1, V2); } + // This lowering only works for the low element with floating point vectors. + if (VT.isFloatingPoint() && V2Index != 0) + return SDValue(); + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); @@ -8077,7 +7171,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v2i64, V2, DAG.getConstant( - V2Index * EltVT.getSizeInBits(), + V2Index * EltVT.getSizeInBits()/8, DL, DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); } @@ -8090,7 +7184,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. -static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, +static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { @@ -8111,8 +7205,8 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, "a sorted mask where the broadcast " "comes from V1."); - // Go up the chain of (vector) values to try and find a scalar load that - // we can combine with the broadcast. + // Go up the chain of (vector) values to find a scalar load that we can + // combine with the broadcast. for (;;) { switch (V.getOpcode()) { case ISD::CONCAT_VECTORS: { @@ -8149,12 +7243,12 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); - // If the scalar isn't a load we can't broadcast from it in AVX1, only with - // AVX2. + // If the scalar isn't a load, we can't broadcast from it in AVX1. + // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { - // We can't broadcast from a vector register w/o AVX2, and we can only + // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } @@ -8183,7 +7277,7 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, int V2DstIndex = -1; bool V1UsedInPlace = false; - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; @@ -8237,7 +7331,122 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, // Insert the V2 element into the desired position. SDLoc DL(Op); return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, MVT::i8)); + DAG.getConstant(InsertPSMask, DL, MVT::i8)); +} + +/// \brief Try to lower a shuffle as a permute of the inputs followed by an +/// UNPCK instruction. 
+/// +/// This specifically targets cases where we end up with alternating between +/// the two inputs, and so can permute them into something that feeds a single +/// UNPCK instruction. Note that this routine only targets integer vectors +/// because for floating point vectors we have a generalized SHUFPS lowering +/// strategy that handles everything that doesn't *exactly* match an unpack, +/// making this clever lowering unnecessary. +static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!VT.isFloatingPoint() && + "This routine only supports integer vectors."); + assert(!isSingleInputShuffleMask(Mask) && + "This routine should only be used when blending two inputs."); + assert(Mask.size() >= 2 && "Single element masks are invalid."); + + int Size = Mask.size(); + + int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { + return M >= 0 && M % Size < Size / 2; + }); + int NumHiInputs = std::count_if( + Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); + + bool UnpackLo = NumLoInputs >= NumHiInputs; + + auto TryUnpack = [&](MVT UnpackVT, int Scale) { + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + // Each element of the unpack contains Scale elements from this mask. + int UnpackIdx = i / Scale; + + // We only handle the case where V1 feeds the first slots of the unpack. + // We rely on canonicalization to ensure this is the case. + if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) + return SDValue(); + + // Setup the mask for this input. The indexing is tricky as we have to + // handle the unpack stride. + SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; + VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = + Mask[i] % Size; + } + + // If we will have to shuffle both inputs to use the unpack, check whether + // we can just unpack first and shuffle the result. If so, skip this unpack. + if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && + !isNoopShuffleMask(V2Mask)) + return SDValue(); + + // Shuffle the inputs into place. + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + + // Cast the inputs to the type we will use to unpack them. + V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2); + + // Unpack the inputs and cast the result back to the desired type. + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, + DL, UnpackVT, V1, V2)); + }; + + // We try each unpack from the largest to the smallest to try and find one + // that fits this mask. + int OrigNumElements = VT.getVectorNumElements(); + int OrigScalarSize = VT.getScalarSizeInBits(); + for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { + int Scale = ScalarSize / OrigScalarSize; + int NumElements = OrigNumElements / Scale; + MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); + if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) + return Unpack; + } + + // If none of the unpack-rooted lowerings worked (or were profitable) try an + // initial unpack. 
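Before the fallback below, a scalar model of the permute-then-unpack idea may help when reviewing the V1Mask/V2Mask arithmetic above. This sketch is illustrative only (names made up, Scale fixed to 1, UNPCKL only): it pre-shuffles each input the same way the routine does and confirms that one interleaving unpack then reproduces the requested mask.

#include <cstdio>
#include <vector>

using Vec = std::vector<int>;

// Scalar shuffle: negative mask entries produce a don't-care (-1) value.
static Vec shuffle(const Vec &A, const Vec &B, const std::vector<int> &M) {
  Vec R;
  for (int Idx : M)
    R.push_back(Idx < 0 ? -1
                        : (Idx < (int)A.size() ? A[Idx] : B[Idx - (int)A.size()]));
  return R;
}

// Scalar model of a 4-element UNPCKL: interleave the low halves.
static Vec unpacklo(const Vec &A, const Vec &B) {
  return {A[0], B[0], A[1], B[1]};
}

int main() {
  Vec V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  std::vector<int> Mask = {1, 7, 3, 5};        // alternates V1 and V2 elements

  // Pre-shuffles built as the V1Mask/V2Mask construction above does: even
  // result slots are fed by V1, odd slots by V2, both packed into the low half.
  std::vector<int> V1Mask = {1, 3, -1, -1};
  std::vector<int> V2Mask = {3, 1, -1, -1};    // 7 - 4 = 3, 5 - 4 = 1

  Vec R = unpacklo(shuffle(V1, V1, V1Mask), shuffle(V2, V2, V2Mask));
  Vec Want = shuffle(V1, V2, Mask);
  std::printf("unpack = %d %d %d %d, want = %d %d %d %d\n",
              R[0], R[1], R[2], R[3], Want[0], Want[1], Want[2], Want[3]);
}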
+ if (NumLoInputs == 0 || NumHiInputs == 0) { + assert((NumLoInputs > 0 || NumHiInputs > 0) && + "We have to have *some* inputs!"); + int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; + + // FIXME: We could consider the total complexity of the permute of each + // possible unpacking. Or at the least we should consider how many + // half-crossings are created. + // FIXME: We could consider commuting the unpacks. + + SmallVector<int, 32> PermMask; + PermMask.assign(Size, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); + + PermMask[i] = + 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); + } + return DAG.getVectorShuffle( + VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, + DL, VT, V1, V2), + DAG.getUNDEF(VT), PermMask); + } + + return SDValue(); } /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. @@ -8259,6 +7468,11 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { + // Use low duplicate instructions for masks that match their pattern. + if (Subtarget->hasSSE3()) + if (isShuffleEquivalent(V1, V2, Mask, {0, 0})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1); + // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); @@ -8267,38 +7481,33 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, - DAG.getConstant(SHUFPDMask, MVT::i8)); + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } - return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, - DAG.getConstant(SHUFPDMask, MVT::i8)); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 2)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 3)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); - // If we have a single input, insert that into V1 if we can do so cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG)) + DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG)) + DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) return Insertion; } // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. 
- if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || + isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. @@ -8312,9 +7521,15 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Subtarget, DAG)) return Blend; + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); - return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, - DAG.getConstant(SHUFPDMask, MVT::i8)); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } /// \brief Handle lowering of 2-lane 64-bit integer shuffles. @@ -8336,7 +7551,7 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -8349,41 +7564,64 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getNode( ISD::BITCAST, DL, MVT::v2i64, - DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, - getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); - } + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); + } + assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); + assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); + assert(Mask[0] < 2 && "We sort V1 to be the first input."); + assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + + // If we have a blend of two PACKUS operations an the blend aligns with the + // low and half halves, we can just merge the PACKUS operations. This is + // particularly important as it lets us merge shuffles that this routine itself + // creates. + auto GetPackNode = [](SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v2i64, V1, V2, Mask, DAG)) + return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); + }; + if (SDValue V1Pack = GetPackNode(V1)) + if (SDValue V2Pack = GetPackNode(V2)) + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, + Mask[0] == 0 ? V1Pack.getOperand(0) + : V1Pack.getOperand(1), + Mask[1] == 2 ? V2Pack.getOperand(0) + : V2Pack.getOperand(1))); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG)) return Shift; - // If we have a single input from V2 insert that into V1 if we can do so - // cheaply. 
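For reference while reviewing the v2f64 path above, the two-bit SHUFPD immediate it builds for the blended case can be sketched on its own (hypothetical helper name; bit 0 picks the element of the first source, bit 1 the element of the second source, matching the formula in the code):

#include <cstdio>

// Illustrative only: Mask[0] selects from V1 (0 or 1), Mask[1] from V2 (2 or 3).
static unsigned shufpdImm(int M0, int M1) {
  return unsigned(M0 == 1) | (unsigned((M1 - 2) == 1) << 1);
}

int main() {
  std::printf("{0,3} -> 0x%x\n", shufpdImm(0, 3));  // 0x2
  std::printf("{1,2} -> 0x%x\n", shufpdImm(1, 2));  // 0x1
}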
- if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG)) - return Insertion; - // Try inverting the insertion since for v2 masks it is easy to do and we - // can't reliably sort the mask one way or the other. - int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), - Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG)) - return Insertion; - } + // When loading a scalar and then shuffling it into a vector we can often do + // the insertion cheaply. + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 2)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 3)) + if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); - if (Subtarget->hasSSE41()) - if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, - Subtarget, DAG)) - return Blend; - // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) @@ -8391,6 +7629,12 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, + Mask, DAG); + // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't @@ -8401,6 +7645,24 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } +/// \brief Test whether this can be lowered with a single SHUFPS instruction. +/// +/// This is used to disable more specialized lowerings when the shufps lowering +/// will happen to be efficient. +static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { + // This routine only handles 128-bit shufps. + assert(Mask.size() == 4 && "Unsupported mask size!"); + + // To lower with a single SHUFPS we need to have the low half and high half + // each requiring a single input. 
+ if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4)) + return false; + if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4)) + return false; + + return true; +} + /// \brief Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. @@ -8437,7 +7699,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, - getV4X86ShuffleImm8ForMask(BlendMask, DAG)); + getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. @@ -8476,7 +7738,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, - getV4X86ShuffleImm8ForMask(BlendMask, DAG)); + getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. @@ -8488,7 +7750,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, - getV4X86ShuffleImm8ForMask(NewMask, DAG)); + getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// \brief Lower 4-lane 32-bit floating point shuffles. @@ -8512,36 +7774,38 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, Mask, Subtarget, DAG)) return Broadcast; + // Use even/odd duplicate instructions for masks that match their pattern. + if (Subtarget->hasSSE3()) { + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); + if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); + } + if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); - // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. 
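A standalone copy of the isSingleSHUFPSMask predicate introduced above, with two sample masks, shows what the v4f32 path that follows treats as already cheap enough for a single SHUFPS (plain C++ test harness, not LLVM code):

#include <cstdio>

// Copy of the predicate above: each half of the 4-element mask may draw from
// only one of the two inputs; undef (-1) entries are ignored.
static bool isSingleSHUFPSMask(const int M[4]) {
  if (M[0] != -1 && M[1] != -1 && (M[0] < 4) != (M[1] < 4))
    return false;
  if (M[2] != -1 && M[3] != -1 && (M[2] < 4) != (M[3] < 4))
    return false;
  return true;
}

int main() {
  int A[4] = {0, 1, 6, 7};   // low half from V1, high half from V2
  int B[4] = {0, 5, 2, 7};   // both halves mix inputs
  std::printf("%d %d\n", isSingleSHUFPSMask(A), isSingleSHUFPSMask(B));  // 1 0
}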
if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2, + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return V; @@ -8553,8 +7817,23 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) return V; + + if (!isSingleSHUFPSMask(Mask)) + if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( + DL, MVT::v4f32, V1, V2, Mask, DAG)) + return BlendPerm; } + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1); + // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } @@ -8586,7 +7865,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -8597,37 +7876,48 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; - if (isShuffleEquivalent(Mask, 0, 0, 1, 1)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) Mask = UnpackLoMask; - else if (isShuffleEquivalent(Mask, 2, 2, 3, 3)) + else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v4i32, V1, V2, Mask, DAG)) + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return V; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); - - if (Subtarget->hasSSE41()) + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. 
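As an aside, the getV4X86ShuffleImm8ForMask calls used throughout these paths boil down to the architectural two-bits-per-element immediate encoding of PSHUFD/PSHUFLW/PSHUFHW/SHUFPS. A hypothetical sketch for fully-specified masks (the helper name is made up and undef handling, which the real helper also covers, is omitted):

#include <cstdio>

// Illustrative only: pack a defined 4-element mask into an 8-bit immediate,
// two bits per destination element.
static unsigned pshufdImm8(const int M[4]) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= static_cast<unsigned>(M[i]) << (2 * i);
  return Imm;
}

int main() {
  int UnpackLoMask[4] = {0, 0, 1, 1};   // the v4i32 mask used just above
  std::printf("imm = 0x%02x\n", pshufdImm8(UnpackLoMask));  // 0x50
}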
+ bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Blend; + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1); + // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) @@ -8635,6 +7925,17 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, + Mask, DAG); + + // Try to lower by permuting the inputs into an unpack instruction. + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Unpack; + // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we @@ -8658,10 +7959,18 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. -static SDValue lowerV8I16SingleInputVectorShuffle( - SDLoc DL, SDValue V, MutableArrayRef<int> Mask, +/// +/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each +/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to +/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 +/// vector, form the analogous 128-bit 8-element Mask. +static SDValue lowerV8I16GeneralSingleInputVectorShuffle( + SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); + assert(VT.getScalarType() == MVT::i16 && "Bad input type!"); + MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + + assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); MutableArrayRef<int> LoMask = Mask.slice(0, 4); MutableArrayRef<int> HiMask = Mask.slice(4, 4); @@ -8686,27 +7995,6 @@ static SDValue lowerV8I16SingleInputVectorShuffle( MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); - // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V, - Mask, Subtarget, DAG)) - return Broadcast; - - // Try to use byte shift instructions. 
- if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v8i16, V, V, Mask, DAG)) - return Shift; - - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); - if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); - - // Try to use byte rotation instructions. - if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v8i16, V, V, Mask, Subtarget, DAG)) - return Rotate; - // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through @@ -8811,7 +8099,7 @@ static SDValue lowerV8I16SingleInputVectorShuffle( std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::v8i16, V, - getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG)); + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M != -1 && M == FixIdx) @@ -8835,10 +8123,11 @@ static SDValue lowerV8I16SingleInputVectorShuffle( int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + V = DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, + DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, + DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) @@ -8849,8 +8138,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. - return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), - Mask); + return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, + DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); @@ -9073,16 +8362,17 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) - V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, - getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, + getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) - V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, - getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, + getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, - DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + V = DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, + DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, + DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. 
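Since the fix-up logic above leans heavily on PSHUFLW and PSHUFHW each touching only one half of a v8i16, a small scalar model of that composition may be useful while reviewing (illustrative code, not LLVM APIs):

#include <array>
#include <cstdio>

using V8 = std::array<int, 8>;

// Scalar PSHUFLW: permute the low four words, pass the high four through.
static V8 pshuflw(V8 V, const int M[4]) {
  V8 R = V;
  for (int i = 0; i < 4; ++i)
    R[i] = V[M[i]];
  return R;
}

// Scalar PSHUFHW: permute the high four words, pass the low four through.
static V8 pshufhw(V8 V, const int M[4]) {
  V8 R = V;
  for (int i = 0; i < 4; ++i)
    R[4 + i] = V[4 + M[i]];
  return R;
}

int main() {
  V8 V = {0, 1, 2, 3, 4, 5, 6, 7};
  int Lo[4] = {3, 2, 1, 0}, Hi[4] = {1, 0, 3, 2};
  V8 R = pshufhw(pshuflw(V, Lo), Hi);
  for (int E : R)
    std::printf("%d ", E);     // 3 2 1 0 5 4 7 6
  std::printf("\n");
}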
@@ -9095,172 +8385,70 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) - V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, - getV4X86ShuffleImm8ForMask(LoMask, DAG)); + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, + getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) - V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, - getV4X86ShuffleImm8ForMask(HiMask, DAG)); + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, + getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } -/// \brief Detect whether the mask pattern should be lowered through -/// interleaving. -/// -/// This essentially tests whether viewing the mask as an interleaving of two -/// sub-sequences reduces the cross-input traffic of a blend operation. If so, -/// lowering it through interleaving is a significantly better strategy. -static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) { - int NumEvenInputs[2] = {0, 0}; - int NumOddInputs[2] = {0, 0}; - int NumLoInputs[2] = {0, 0}; - int NumHiInputs[2] = {0, 0}; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] < 0) - continue; - - int InputIdx = Mask[i] >= Size; - - if (i < Size / 2) - ++NumLoInputs[InputIdx]; - else - ++NumHiInputs[InputIdx]; - - if ((i % 2) == 0) - ++NumEvenInputs[InputIdx]; - else - ++NumOddInputs[InputIdx]; - } - - // The minimum number of cross-input results for both the interleaved and - // split cases. If interleaving results in fewer cross-input results, return - // true. - int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], - NumEvenInputs[0] + NumOddInputs[1]); - int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], - NumLoInputs[0] + NumHiInputs[1]); - return InterleavedCrosses < SplitCrosses; -} - -/// \brief Blend two v8i16 vectors using a naive unpack strategy. -/// -/// This strategy only works when the inputs from each vector fit into a single -/// half of that vector, and generally there are not so many inputs as to leave -/// the in-place shuffles required highly constrained (and thus expensive). It -/// shifts all the inputs into a single side of both input vectors and then -/// uses an unpack to interleave these inputs in a single vector. At that -/// point, we will fall back on the generic single input shuffle lowering. 
-static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, - SDValue V2, - MutableArrayRef<int> Mask, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); - assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); - SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs; - for (int i = 0; i < 8; ++i) - if (Mask[i] >= 0 && Mask[i] < 4) - LoV1Inputs.push_back(i); - else if (Mask[i] >= 4 && Mask[i] < 8) - HiV1Inputs.push_back(i); - else if (Mask[i] >= 8 && Mask[i] < 12) - LoV2Inputs.push_back(i); - else if (Mask[i] >= 12) - HiV2Inputs.push_back(i); - - int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size(); - int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size(); - (void)NumV1Inputs; - (void)NumV2Inputs; - assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported"); - assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported"); - assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs"); - - bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >= - HiV1Inputs.size() + HiV2Inputs.size(); - - auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs, - ArrayRef<int> HiInputs, bool MoveToLo, - int MaskOffset) { - ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs; - ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs; - if (BadInputs.empty()) - return V; - - int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int MoveOffset = MoveToLo ? 0 : 4; +/// \brief Helper to form a PSHUFB-based shuffle+blend. +static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG, bool &V1InUse, + bool &V2InUse) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V1Mask[16]; + SDValue V2Mask[16]; + V1InUse = false; + V2InUse = false; - if (GoodInputs.empty()) { - for (int BadInput : BadInputs) { - MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset; - Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset; - } + int Size = Mask.size(); + int Scale = 16 / Size; + for (int i = 0; i < 16; ++i) { + if (Mask[i / Scale] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { - if (GoodInputs.size() == 2) { - // If the low inputs are spread across two dwords, pack them into - // a single dword. - MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset; - MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset; - Mask[GoodInputs[0]] = MoveOffset + MaskOffset; - Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset; - } else { - // Otherwise pin the good inputs. - for (int GoodInput : GoodInputs) - MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; - } - - if (BadInputs.size() == 2) { - // If we have two bad inputs then there may be either one or two good - // inputs fixed in place. Find a fixed input, and then find the *other* - // two adjacent indices by using modular arithmetic. 
- int GoodMaskIdx = - std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), - [](int M) { return M >= 0; }) - - std::begin(MoveMask); - int MoveMaskIdx = - ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset; - assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); - assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); - MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; - MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset; - Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; - Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset; - } else { - assert(BadInputs.size() == 1 && "All sizes handled"); - int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset, - std::end(MoveMask), -1) - - std::begin(MoveMask); - MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; - Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; - } - } - - return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), - MoveMask); - }; - V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, - /*MaskOffset*/ 0); - V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, - /*MaskOffset*/ 8); - - // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes - // cross-half traffic in the final shuffle. - - // Munge the mask to be a single-input mask after the unpack merges the - // results. - for (int &M : Mask) - if (M != -1) - M = 2 * (M % 4) + (M / 8); + const int ZeroMask = 0x80; + int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale + : ZeroMask; + int V2Idx = Mask[i / Scale] < Size + ? ZeroMask + : (Mask[i / Scale] - Size) * Scale + i % Scale; + if (Zeroable[i / Scale]) + V1Idx = V2Idx = ZeroMask; + V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); + V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); + V1InUse |= (ZeroMask != V1Idx); + V2InUse |= (ZeroMask != V2Idx); + } + } + + if (V1InUse) + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + if (V2InUse) + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + + // If we need shuffled inputs from both, blend the two. + SDValue V; + if (V1InUse && V2InUse) + V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + else + V = V1InUse ? V1 : V2; - return DAG.getVectorShuffle( - MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, - DL, MVT::v8i16, V1, V2), - DAG.getUNDEF(MVT::v8i16), Mask); + // Cast the result back to the correct type. + return DAG.getNode(ISD::BITCAST, DL, VT, V); } /// \brief Generic lowering of 8-lane i16 shuffles. @@ -9297,85 +8485,95 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return ZExt; auto isV1 = [](int M) { return M >= 0 && M < 8; }; + (void)isV1; auto isV2 = [](int M) { return M >= 8; }; - int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); - if (NumV2Inputs == 0) - return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); + if (NumV2Inputs == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1, + Mask, Subtarget, DAG)) + return Broadcast; - assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " - "to be V1-input shuffles."); + // Try to use shift instructions. 
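A scalar model of the PSHUFB shuffle-plus-blend helper added above may help when checking the control-byte construction (hypothetical code: a control byte of 0x80 zeroes a lane, otherwise its low four bits index the source, and the two partial results are OR'ed together):

#include <array>
#include <cstdio>

using V16 = std::array<unsigned char, 16>;

// Scalar PSHUFB: bit 7 of the control byte writes zero, otherwise the low
// four bits index the source vector.
static V16 pshufb(const V16 &V, const V16 &Ctl) {
  V16 R{};
  for (int i = 0; i < 16; ++i)
    R[i] = (Ctl[i] & 0x80) ? 0 : V[Ctl[i] & 0x0f];
  return R;
}

int main() {
  V16 A{}, B{}, CtlA{}, CtlB{};
  for (int i = 0; i < 16; ++i) {
    A[i] = static_cast<unsigned char>('a' + i);
    B[i] = static_cast<unsigned char>('A' + i);
    // Interleave the low eight bytes of A and B: A feeds even slots, B odd.
    CtlA[i] = (i % 2 == 0) ? static_cast<unsigned char>(i / 2) : 0x80;
    CtlB[i] = (i % 2 == 1) ? static_cast<unsigned char>(i / 2) : 0x80;
  }
  V16 SA = pshufb(A, CtlA), SB = pshufb(B, CtlB);
  for (int i = 0; i < 16; ++i)
    std::printf("%c", SA[i] | SB[i]);   // aAbBcCdDeEfFgGhH
  std::printf("\n");
}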
+ if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1); + if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1); - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v8i16, V1, V2, Mask, DAG)) + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, + Mask, Subtarget, DAG)) + return Rotate; + + return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask, + Subtarget, DAG); + } + + assert(std::any_of(Mask.begin(), Mask.end(), isV1) && + "All single-input shuffles should be canonicalized to be V1-input " + "shuffles."); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2, + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return V; - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); - if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); - - if (Subtarget->hasSSE41()) + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Blend; + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; - if (NumV1Inputs + NumV2Inputs <= 4) - return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); - - // Check whether an interleaving lowering is likely to be more efficient. - // This isn't perfect but it is a strong heuristic that tends to work well on - // the kinds of shuffles that show up in practice. - // - // FIXME: Handle 1x, 2x, and 4x interleaving. - if (shouldLowerAsInterleaving(Mask)) { - // FIXME: Figure out whether we should pack these into the low or high - // halves. 
- - int EMask[8], OMask[8]; - for (int i = 0; i < 4; ++i) { - EMask[i] = Mask[2*i]; - OMask[i] = Mask[2*i + 1]; - EMask[i + 4] = -1; - OMask[i + 4] = -1; - } + if (SDValue BitBlend = + lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return BitBlend; - SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); - SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Unpack; - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); + // If we can't directly blend but can use PSHUFB, that will be better as it + // can both shuffle and set up the inefficient blend. + if (!IsBlendSupported && Subtarget->hasSSSE3()) { + bool V1InUse, V2InUse; + return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG, + V1InUse, V2InUse); } - int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - - for (int i = 0; i < 4; ++i) { - LoBlendMask[i] = Mask[i]; - HiBlendMask[i] = Mask[i + 4]; - } - - SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); - SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); - LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); - HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); - - return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); + // We can always bit-blend if we have to so the fallback strategy is to + // decompose into single-input permutes and blends. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + Mask, DAG); } /// \brief Check whether a compaction lowering can be done by dropping even @@ -9461,40 +8659,31 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> OrigMask = SVOp->getMask(); - assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Try to use byte shift instructions. - if (SDValue Shift = lowerVectorShuffleAsByteShift( - DL, MVT::v16i8, V1, V2, OrigMask, DAG)) + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( - DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( - DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return ZExt; - int MaskStorage[16] = { - OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], - OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], - OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], - OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; - MutableArrayRef<int> Mask(MaskStorage); - MutableArrayRef<int> LoMask = Mask.slice(0, 8); - MutableArrayRef<int> HiMask = Mask.slice(8, 8); - int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. 
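The v16i8 path above tries lowerVectorShuffleAsByteRotate before anything more expensive. The sketch below models only the simplest shape of that match: a single-input byte mask is a rotation if every defined lane agrees on one offset, and a PALIGNR of the input against itself with that offset as the immediate then reproduces it. This is an approximation; the in-tree helper also handles two-input rotations and non-SSSE3 fallbacks, which are not modelled here, and matchByteRotation is an invented name.

  #include <vector>

  // Returns the rotation amount, or -1 if the mask is not a single-input
  // byte rotation. -1 lanes are undef and impose no constraint.
  static int matchByteRotation(const std::vector<int> &Mask) {
    const int N = 16;
    int Start = -1;
    for (int i = 0; i < N; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;                 // undef lane
      if (M >= N)
        return -1;                // references the second input: out of scope
      int Candidate = (M - i + N) % N;
      if (Start < 0)
        Start = Candidate;
      else if (Start != Candidate)
        return -1;                // lanes disagree, not a rotation
    }
    return Start < 0 ? 0 : Start; // an all-undef mask rotates by 0
  }

  int main() {
    // result[i] = input[(i + 5) % 16], i.e. a rotation by five bytes.
    std::vector<int> Mask(16);
    for (int i = 0; i < 16; ++i)
      Mask[i] = (i + 5) % 16;
    return matchByteRotation(Mask) == 5 ? 0 : 1;
  }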
if (NumV2Elements == 0) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -9591,36 +8780,17 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; } - // Check whether an interleaving lowering is likely to be more efficient. - // This isn't perfect but it is a strong heuristic that tends to work well on - // the kinds of shuffles that show up in practice. - // - // FIXME: We need to handle other interleaving widths (i16, i32, ...). - if (shouldLowerAsInterleaving(Mask)) { - int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { - return (M >= 0 && M < 8) || (M >= 16 && M < 24); - }); - int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { - return (M >= 8 && M < 16) || M >= 24; - }); - int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1}; - int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1}; - bool UnpackLo = NumLoHalf >= NumHiHalf; - MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8); - MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8); - for (int i = 0; i < 8; ++i) { - TargetEMask[i] = Mask[2 * i]; - TargetOMask[i] = Mask[2 * i + 1]; - } - - SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); - SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); - - return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, - MVT::v16i8, Evens, Odds); - } + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {// Low half. + 0, 16, 1, 17, 2, 18, 3, 19, + // High half. + 4, 20, 5, 21, 6, 22, 7, 23})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {// Low half. + 8, 24, 9, 25, 10, 26, 11, 27, + // High half. + 12, 28, 13, 29, 14, 30, 15, 31})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2); // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any @@ -9636,52 +8806,47 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget->hasSSSE3()) { - SDValue V1Mask[16]; - SDValue V2Mask[16]; bool V1InUse = false; bool V2InUse = false; - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - for (int i = 0; i < 16; ++i) { - if (Mask[i] == -1) { - V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); - } else { - const int ZeroMask = 0x80; - int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask); - int V2Idx = (Mask[i] < 16 ? 
ZeroMask : Mask[i] - 16); - if (Zeroable[i]) - V1Idx = V2Idx = ZeroMask; - V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8); - V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8); - V1InUse |= (ZeroMask != V1Idx); - V2InUse |= (ZeroMask != V2Idx); - } - } + SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask, + DAG, V1InUse, V2InUse); - if (V1InUse) - V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); - if (V2InUse) - V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + // If both V1 and V2 are in use and we can use a direct blend or an unpack, + // do so. This avoids using them to handle blends-with-zero which is + // important as a single pshufb is significantly faster for that. + if (V1InUse && V2InUse) { + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) + return Blend; - // If we need shuffled inputs from both, blend the two. - if (V1InUse && V2InUse) - return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); - if (V1InUse) - return V1; // Single inputs are easy. - if (V2InUse) - return V2; // Single inputs are easy. - // Shuffling to a zeroable vector. - return getZeroVector(MVT::v16i8, Subtarget, DAG, DL); + // We can use an unpack to do the blending rather than an or in some + // cases. Even though the or may be (very minorly) more efficient, we + // preference this lowering because there are common cases where part of + // the complexity of the shuffles goes away when we do the final blend as + // an unpack. + // FIXME: It might be worth trying to detect if the unpack-feeding + // shuffles will both be pshufb, in which case we shouldn't bother with + // this. + if (SDValue Unpack = + lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Unpack; + } + + return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2, + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; + if (SDValue BitBlend = + lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return BitBlend; + // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. @@ -9703,7 +8868,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; SDValue ByteClearMask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, - DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1])); + DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); @@ -9720,72 +8885,58 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Result; } - int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + // Handle multi-input cases by blending single-input shuffles. 
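The compaction path above (the ByteClearMask AND followed by repeated PACKUS) only applies to masks that take every Nth element for some even N, as the earlier \brief says. A simplified, single-input version of that test is sketched below; the in-tree check also allows the surviving elements to come from a second input, which is not modelled here, and matchEvenElementDrop is an invented name.

  #include <vector>

  // Returns K (1 or 2) if Mask takes every 2^K-th byte of the input, i.e.
  // every defined lane i satisfies Mask[i] == i << K; otherwise returns 0.
  // Lanes whose source would fall past the input must be undef (-1).
  static int matchEvenElementDrop(const std::vector<int> &Mask) {
    for (int K = 1; K <= 2; ++K) {
      bool Matches = true;
      for (int i = 0; i < 16 && Matches; ++i)
        if (Mask[i] >= 0 && Mask[i] != (i << K))
          Matches = false;
      if (Matches)
        return K;   // lower with an 0x00FF-style clear plus K PACKUS steps
    }
    return 0;
  }

  int main() {
    std::vector<int> Mask(16, -1);
    for (int i = 0; i < 8; ++i)
      Mask[i] = 2 * i;            // keep every second byte, rest undef
    return matchEvenElementDrop(Mask) == 1 ? 0 : 1;
  }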
+ if (NumV2Elements > 0) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, + Mask, DAG); - auto buildBlendMasks = [](MutableArrayRef<int> HalfMask, - MutableArrayRef<int> V1HalfBlendMask, - MutableArrayRef<int> V2HalfBlendMask) { - for (int i = 0; i < 8; ++i) - if (HalfMask[i] >= 0 && HalfMask[i] < 16) { - V1HalfBlendMask[i] = HalfMask[i]; - HalfMask[i] = i; - } else if (HalfMask[i] >= 16) { - V2HalfBlendMask[i] = HalfMask[i] - 16; - HalfMask[i] = i + 8; - } - }; - buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); - buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); + // The fallback path for single-input shuffles widens this into two v8i16 + // vectors with unpacks, shuffles those, and then pulls them back together + // with a pack. + SDValue V = V1; - SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; ++i) + if (Mask[i] >= 0) + (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; - auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask, - MutableArrayRef<int> HiBlendMask) { - SDValue V1, V2; - // Check if any of the odd lanes in the v16i8 are used. If not, we can mask - // them out and avoid using UNPCK{L,H} to extract the elements of V as - // i16s. - if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), - [](int M) { return M >= 0 && M % 2 == 1; }) && - std::none_of(HiBlendMask.begin(), HiBlendMask.end(), - [](int M) { return M >= 0 && M % 2 == 1; })) { - // Use a mask to drop the high bytes. - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); - V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, - DAG.getConstant(0x00FF, MVT::v8i16)); - - // This will be a single vector shuffle instead of a blend so nuke V2. - V2 = DAG.getUNDEF(MVT::v8i16); - - // Squash the masks to point directly into V1. - for (int &M : LoBlendMask) - if (M >= 0) - M /= 2; - for (int &M : HiBlendMask) - if (M >= 0) - M /= 2; - } else { - // Otherwise just unpack the low half of V into V1 and the high half into - // V2 so that we can blend them as i16s. - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, - DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); - } + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); - SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); - SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); - return std::make_pair(BlendedLo, BlendedHi); - }; - SDValue V1Lo, V1Hi, V2Lo, V2Hi; - std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask); - std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask); + SDValue VLoHalf, VHiHalf; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. + if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, + DAG.getConstant(0x00FF, DL, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke VHiHalf. 
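The single-input fallback above splits the 16-byte mask into two 8-lane halves and then decides between two ways of widening to v8i16: if no lane ever selects an odd byte, the bytes already sit in the low half of each i16 lane, so an AND with 0x00FF plus halving the indices is enough; otherwise the input is unpacked against zero first. A small model of that decision follows, with invented names (WidenPlan, planWiden):

  #include <algorithm>
  #include <array>

  struct WidenPlan {
    std::array<int, 8> Lo, Hi;  // v8i16 shuffle masks for the two halves
    bool OnlyEvenBytes;         // true: AND with 0x00FF; false: unpack vs zero
  };

  static WidenPlan planWiden(const std::array<int, 16> &Mask) {
    WidenPlan P;
    P.Lo.fill(-1);
    P.Hi.fill(-1);
    for (int i = 0; i < 16; ++i)
      if (Mask[i] >= 0)
        (i < 8 ? P.Lo[i] : P.Hi[i % 8]) = Mask[i];

    auto UsesOddByte = [](int M) { return M >= 0 && M % 2 == 1; };
    P.OnlyEvenBytes =
        std::none_of(P.Lo.begin(), P.Lo.end(), UsesOddByte) &&
        std::none_of(P.Hi.begin(), P.Hi.end(), UsesOddByte);
    if (P.OnlyEvenBytes)
      for (auto *Half : {&P.Lo, &P.Hi})
        for (int &M : *Half)
          if (M >= 0)
            M /= 2;             // point straight at the i16 lane
    return P;
  }

  int main() {
    std::array<int, 16> Mask;
    for (int i = 0; i < 16; ++i)
      Mask[i] = (i * 2) % 16;   // only even bytes: the cheap path applies
    return planWiden(Mask).OnlyEvenBytes ? 0 : 1;
  }

Both halves are then shuffled as v8i16 and recombined with PACKUS, exactly as the code above does.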
+ VHiHalf = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into VLoHalf. + for (int &M : LoBlendMask) + if (M >= 0) + M /= 2; + for (int &M : HiBlendMask) + if (M >= 0) + M /= 2; + } else { + // Otherwise just unpack the low half of V into VLoHalf and the high half into + // VHiHalf so that we can blend them as i16s. + VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, + DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + } - SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask); - SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask); + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } @@ -9871,7 +9022,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, return true; } -/// \brief Generic routine to split ector shuffle into half-sized shuffles. +/// \brief Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all @@ -9892,14 +9043,43 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, MVT ScalarVT = VT.getScalarType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); - SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, - DAG.getIntPtrConstant(SplitNumElements)); - SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, - DAG.getIntPtrConstant(0)); - SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, - DAG.getIntPtrConstant(SplitNumElements)); + // Rather than splitting build-vectors, just build two narrower build + // vectors. This helps shuffling with splats and zeros. + auto SplitVector = [&](SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V->getOperand(0); + + MVT OrigVT = V.getSimpleValueType(); + int OrigNumElements = OrigVT.getVectorNumElements(); + int OrigSplitNumElements = OrigNumElements / 2; + MVT OrigScalarVT = OrigVT.getScalarType(); + MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); + + SDValue LoV, HiV; + + auto *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV) { + LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, + DAG.getIntPtrConstant(0, DL)); + HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, + DAG.getIntPtrConstant(OrigSplitNumElements, DL)); + } else { + + SmallVector<SDValue, 16> LoOps, HiOps; + for (int i = 0; i < OrigSplitNumElements; ++i) { + LoOps.push_back(BV->getOperand(i)); + HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); + } + LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps); + HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps); + } + return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV), + DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV)); + }; + + SDValue LoV1, HiV1, LoV2, HiV2; + std::tie(LoV1, HiV1) = SplitVector(V1); + std::tie(LoV2, HiV2) = SplitVector(V2); // Now create two 4-way blends of these half-width vectors. 
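splitAndLowerVectorShuffle, extended above to split build-vector operands directly, ultimately reduces one wide two-input shuffle to operations on four half-width vectors. The index arithmetic behind that is small enough to show in isolation; HalfRef and splitIndex below are illustrative names, not the in-tree API.

  #include <cstdio>
  #include <vector>

  struct HalfRef {
    int Operand;  // 0 = LoV1, 1 = HiV1, 2 = LoV2, 3 = HiV2, -1 = undef
    int Index;    // element index within that half-width vector
  };

  // Map one source index of a two-input, NumElements-wide shuffle onto the
  // corresponding half-width operand and offset.
  static HalfRef splitIndex(int M, int NumElements) {
    if (M < 0)
      return {-1, -1};
    int Half = NumElements / 2;
    int Operand = (M >= NumElements ? 2 : 0) + ((M % NumElements) >= Half);
    return {Operand, (M % NumElements) % Half};
  }

  int main() {
    // A v8i32 mask drawing from both 256-bit inputs.
    std::vector<int> Mask = {0, 9, 2, 11, 4, 13, -1, 15};
    for (int i = 0; i < 8; ++i) {
      HalfRef R = splitIndex(Mask[i], 8);
      std::printf("lane %d -> operand %d, index %d\n", i, R.Operand, R.Index);
    }
    return 0;
  }

The real routine then builds one blend per output half from these references and concatenates the two results.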
auto HalfBlend = [&](ArrayRef<int> HalfMask) { @@ -10046,7 +9226,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be - // less expensive. The flags track wether the given lane contains an element + // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. bool LaneCrossing[2] = {false, false}; for (int i = 0, Size = Mask.size(); i < Size; ++i) @@ -10071,7 +9251,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, // allow folding it into a memory operand. unsigned PERMMask = 3 | 2 << 4; SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), - V1, DAG.getConstant(PERMMask, MVT::i8)); + V1, DAG.getConstant(PERMMask, DL, MVT::i8)); return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } @@ -10086,33 +9266,49 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + // TODO: If minimizing size and one of the inputs is a zero vector and the + // the zero vector has only one use, we could use a VPERM2X128 to save the + // instruction bytes needed to explicitly generate the zero vector. + // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Blend; - MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - // Check for patterns which can be matched with a single insert of a 128-bit - // subvector. - bool OnlyUsesV1 = isShuffleEquivalent(Mask, 0, 1, 0, 1); - if (OnlyUsesV1 || isShuffleEquivalent(Mask, 0, 1, 4, 5)) { - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); - } - if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, - DAG.getIntPtrConstant(2)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); - } - - // Otherwise form a 128-bit permutation. - // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. + bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // If either input operand is a zero vector, use VPERM2X128 because its mask + // allows us to replace the zero input with an implicit zero. + if (!IsV1Zero && !IsV2Zero) { + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); + if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0, DL)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + OnlyUsesV1 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + } + + // Otherwise form a 128-bit permutation. 
After accounting for undefs, + // convert the 64-bit shuffle mask selection values into 128-bit + // selection bits by dividing the indexes by 2 and shifting into positions + // defined by a vperm2*128 instruction's immediate control byte. + + // The immediate permute control byte looks like this: + // [1:0] - select 128 bits from sources for low half of destination + // [2] - ignore + // [3] - zero low half of destination + // [5:4] - select 128 bits from sources for high half of destination + // [6] - ignore + // [7] - zero high half of destination + int MaskLO = Mask[0]; if (MaskLO == SM_SentinelUndef) MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; @@ -10122,8 +9318,29 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; + + // If either input is a zero vector, replace it with an undef input. + // Shuffle mask values < 4 are selecting elements of V1. + // Shuffle mask values >= 4 are selecting elements of V2. + // Adjust each half of the permute mask by clearing the half that was + // selecting the zero vector and setting the zero mask bit. + if (IsV1Zero) { + V1 = DAG.getUNDEF(VT); + if (MaskLO < 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI < 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + if (IsV2Zero) { + V2 = DAG.getUNDEF(VT); + if (MaskLO >= 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI >= 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, - DAG.getConstant(PermMask, MVT::i8)); + DAG.getConstant(PermMask, DL, MVT::i8)); } /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then @@ -10245,23 +9462,27 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1, Mask, Subtarget, DAG)) return Broadcast; + // Use low duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowerid with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, - DAG.getConstant(VPERMILPMask, MVT::i8)); + DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. if (Subtarget->hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, @@ -10270,19 +9491,14 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. 
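The immediate construction spelled out in the comments above is worth seeing end to end. The sketch below reproduces it on plain integers: mask values 0-3 select 128-bit halves of V1, 4-7 halves of V2, -1 stands in for the undef sentinel, and bits 3 and 7 of the immediate zero a destination half when the corresponding input is a known zero vector. vperm2x128Imm is an invented name for illustration.

  #include <cstdio>

  static unsigned vperm2x128Imm(const int Mask[4], bool V1IsZero,
                                bool V2IsZero) {
    // Fall back to the other lane (or 0) when a 64-bit lane is undef.
    int MaskLO = Mask[0] >= 0 ? Mask[0] : (Mask[1] >= 0 ? Mask[1] : 0);
    int MaskHI = Mask[2] >= 0 ? Mask[2] : (Mask[3] >= 0 ? Mask[3] : 0);

    // [1:0] selects the source half for the low 128 bits, [5:4] for the high.
    unsigned PermMask = unsigned(MaskLO / 2) | unsigned(MaskHI / 2) << 4;

    // A zero input is replaced by the "zero this half" bits instead.
    if (V1IsZero) {
      if (MaskLO < 4) PermMask = (PermMask & 0xf0) | 0x08;
      if (MaskHI < 4) PermMask = (PermMask & 0x0f) | 0x80;
    }
    if (V2IsZero) {
      if (MaskLO >= 4) PermMask = (PermMask & 0xf0) | 0x08;
      if (MaskHI >= 4) PermMask = (PermMask & 0x0f) | 0x80;
    }
    return PermMask;
  }

  int main() {
    int Mask[4] = {2, 3, 4, 5};   // high half of V1, then low half of V2
    std::printf("imm = 0x%02x\n", vperm2x128Imm(Mask, false, false));
    return 0;
  }

For {2, 3, 4, 5} this yields 0x21, the familiar "high half of V1, then low half of V2" permute.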
- if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); - - // If we have a single input to the zero element, insert that into V1 if we - // can do so cheaply. - int NumV2Elements = - std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); - if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( - MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG)) - return Insertion; + if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) @@ -10296,7 +9512,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, - DAG.getConstant(SHUFPDMask, MVT::i8)); + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && (Mask[1] == -1 || Mask[1] < 2) && @@ -10305,7 +9521,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, - DAG.getConstant(SHUFPDMask, MVT::i8)); + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -10353,7 +9569,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -10372,21 +9588,30 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ISD::BITCAST, DL, MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), - getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } - - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); } // AVX2 provides a direct instruction for permuting a single input across // lanes. if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. 
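The two SHUFPD cases above encode their masks the same way, so a worked example helps: for a v4f64 mask shaped like {0|1, 4|5, 2|3, 6|7} (destination lanes alternate V1, V2 and never leave their 128-bit half), immediate bit i says "take the odd element of the source half feeding lane i". A one-line model, with -1 treated as undef and leaving the bit clear; shufpdImm is an invented name.

  #include <cstdio>

  static unsigned shufpdImm(const int Mask[4]) {
    return unsigned(Mask[0] == 1) | unsigned(Mask[1] == 5) << 1 |
           unsigned(Mask[2] == 3) << 2 | unsigned(Mask[3] == 7) << 3;
  }

  int main() {
    int Mask[4] = {1, 4, 3, 6};                          // odd, even, odd, even
    std::printf("shufpd imm = 0x%x\n", shufpdImm(Mask)); // prints 0x5
    return 0;
  }

The commuted variant in the code (V2 first) only changes which equalities are tested, not the encoding.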
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, @@ -10422,7 +9647,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -10432,15 +9657,26 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); + + // Use even/odd duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); + if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); + if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, - getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1); // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. We also need to squash the @@ -10457,7 +9693,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, SDValue VPermMask[8]; for (int i = 0; i < 8; ++i) VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) - : DAG.getConstant(Mask[i], MVT::i32); + : DAG.getConstant(Mask[i], DL, MVT::i32); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) return DAG.getNode( X86ISD::VPERMILPV, DL, MVT::v8f32, V1, @@ -10506,12 +9742,19 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. 
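Several of the 256-bit paths above key off is128BitLaneRepeatedShuffleMask. A simplified model of that test: every element must stay in its own 128-bit lane, and all lanes must use the same within-lane pattern; the surviving 4-element repeated mask is what then feeds the VPERMILPS / SHUFPS / PSHUFD style lowerings. In the sketch below, values >= LaneSize in the repeated mask denote the second operand, which is this sketch's own convention, and isLaneRepeatedMask is an invented name.

  #include <vector>

  static bool isLaneRepeatedMask(const std::vector<int> &Mask, int Size,
                                 int LaneSize, std::vector<int> &Repeated) {
    Repeated.assign(LaneSize, -1);
    for (int i = 0; i < Size; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;
      // The source lane must match the destination lane.
      if ((M % Size) / LaneSize != i / LaneSize)
        return false;
      // Reduce to a within-lane index, keeping the operand distinction.
      int Local = (M % LaneSize) + (M >= Size ? LaneSize : 0);
      if (Repeated[i % LaneSize] < 0)
        Repeated[i % LaneSize] = Local;
      else if (Repeated[i % LaneSize] != Local)
        return false;
    }
    return true;
  }

  int main() {
    std::vector<int> Repeated;
    // MOVSLDUP pattern for v8f32: duplicate the even element of each pair.
    bool OK = isLaneRepeatedMask({0, 0, 2, 2, 4, 4, 6, 6}, 8, 4, Repeated);
    return (OK && Repeated == std::vector<int>({0, 0, 2, 2})) ? 0 : 1;
  }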
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -10523,22 +9766,35 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (isSingleInputShuffleMask(Mask)) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, - getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1); + if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1); } + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return Shift; + + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (isSingleInputShuffleMask(Mask)) { SDValue VPermMask[8]; for (int i = 0; i < 8; ++i) VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) - : DAG.getConstant(Mask[i], MVT::i32); + : DAG.getConstant(Mask[i], DL, MVT::i32); return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8i32, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); @@ -10570,8 +9826,15 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -10580,19 +9843,29 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. 
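getV4X86ShuffleImm8ForMask, used throughout these paths, packs a 4-element repeated mask into the classic 2-bits-per-lane immediate shared by PSHUFD, VPERMILPS and VPERMQ: lane i of the result reads source lane Imm[2*i+1:2*i]. A stand-alone rendition follows; the & 3 reduction, the choice of 0 for undef lanes, and the name shuffleImm8 are this sketch's simplifications rather than the in-tree behaviour.

  #include <cstdio>

  static unsigned shuffleImm8(const int Mask[4]) {
    unsigned Imm = 0;
    for (int i = 0; i < 4; ++i) {
      int M = Mask[i] < 0 ? 0 : Mask[i] & 3;   // reduce to a lane-local index
      Imm |= unsigned(M) << (2 * i);
    }
    return Imm;
  }

  int main() {
    int Mask[4] = {2, 3, 0, 1};                         // swap the two halves
    std::printf("imm = 0x%02x\n", shuffleImm8(Mask));   // prints 0x4e
    return 0;
  }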
- if (isShuffleEquivalent(Mask, - // First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27)) + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane: + 0, 16, 1, 17, 2, 18, 3, 19, + // Second 128-bit lane: + 8, 24, 9, 25, 10, 26, 11, 27})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(Mask, - // First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31)) + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane: + 4, 20, 5, 21, 6, 22, 7, 23, + // Second 128-bit lane: + 12, 28, 13, 29, 14, 30, 15, 31})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i16 // element types. @@ -10600,6 +9873,15 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // As this is a single-input shuffle, the repeated mask should be + // a strictly valid v8i16 mask that we can pass through to the v8i16 + // lowering to handle even the v16 case. + return lowerV8I16GeneralSingleInputVectorShuffle( + DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); + } + SDValue PSHUFBMask[32]; for (int i = 0; i < 16; ++i) { if (Mask[i] == -1) { @@ -10609,8 +9891,8 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, int M = i < 8 ? Mask[i] : Mask[i] - 8; assert(M >= 0 && M < 8 && "Invalid single-input mask!"); - PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8); - PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8); + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8); + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8); } return DAG.getNode( ISD::BITCAST, DL, MVT::v16i16, @@ -10645,8 +9927,15 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1, + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, Mask, Subtarget, DAG)) return Broadcast; @@ -10658,20 +9947,30 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Note that these are repeated 128-bit lane unpacks, not unpacks across all // 256-bit lanes. 
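The single-input v16i16 VPSHUFB path above expands every 16-bit lane into two byte selectors, 2*M and 2*M+1. Because VPSHUFB shuffles each 128-bit lane independently, indices in the high lane are first reduced by 8, which is why the path is only reached once lane-crossing masks have been filtered out. A compact model (undef lanes kept as -1 here; buildV16I16PshufbMask is an invented name):

  #include <array>

  // Precondition: Mask does not cross 128-bit lanes, i.e. Mask[0..7] < 8 and
  // Mask[8..15] is either undef or in [8, 16).
  static std::array<int, 32>
  buildV16I16PshufbMask(const std::array<int, 16> &Mask) {
    std::array<int, 32> Bytes;
    for (int i = 0; i < 16; ++i) {
      if (Mask[i] < 0) {
        Bytes[2 * i] = Bytes[2 * i + 1] = -1;   // undef byte pair
        continue;
      }
      int M = i < 8 ? Mask[i] : Mask[i] - 8;    // make the index lane-local
      Bytes[2 * i] = 2 * M;
      Bytes[2 * i + 1] = 2 * M + 1;
    }
    return Bytes;
  }

  int main() {
    std::array<int, 16> Mask;
    for (int i = 0; i < 16; ++i)
      Mask[i] = (i & 8) | ((i + 1) & 7);        // rotate words within each lane
    std::array<int, 32> Bytes = buildV16I16PshufbMask(Mask);
    return (Bytes[0] == 2 && Bytes[1] == 3) ? 0 : 1;
  }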
if (isShuffleEquivalent( - Mask, - // First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) + V1, V2, Mask, + {// First 128-bit lane: + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + // Second 128-bit lane: + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); if (isShuffleEquivalent( - Mask, - // First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) + V1, V2, Mask, + {// First 128-bit lane: + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + // Second 128-bit lane: + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + if (isSingleInputShuffleMask(Mask)) { // There are no generalized cross-lane shuffle operations available on i8 // element types. @@ -10684,7 +9983,8 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, PSHUFBMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) - : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8); + : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL, + MVT::i8); return DAG.getNode( X86ISD::PSHUFB, DL, MVT::v32i8, V1, @@ -10713,6 +10013,18 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> Mask = SVOp->getMask(); + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. + int NumElts = VT.getVectorNumElements(); + int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) { + return M >= NumElts; + }); + + if (NumV2Elements == 1 && Mask[0] >= NumElts) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, VT, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // There is a really nice hard cut-over between AVX1 and AVX2 that means we can // check for those subtargets here and avoid much of the subtarget querying in // the per-vector-type lowering routines. With AVX1 we have essentially *zero* @@ -10765,9 +10077,9 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15)) + if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); // FIXME: Implement direct support for this type! @@ -10786,13 +10098,17 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Use dedicated unpack instructions for masks that match their pattern. 
- if (isShuffleEquivalent(Mask, - 0, 16, 1, 17, 4, 20, 5, 21, - 8, 24, 9, 25, 12, 28, 13, 29)) + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 0, 16, 1, 17, 4, 20, 5, 21, + // Second 128-bit lane. + 8, 24, 9, 25, 12, 28, 13, 29})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); - if (isShuffleEquivalent(Mask, - 2, 18, 3, 19, 6, 22, 7, 23, - 10, 26, 11, 27, 14, 30, 15, 31)) + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 2, 18, 3, 19, 6, 22, 7, 23, + // Second 128-bit lane. + 10, 26, 11, 27, 14, 30, 15, 31})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); // FIXME: Implement direct support for this type! @@ -10812,9 +10128,9 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14)) + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15)) + if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); // FIXME: Implement direct support for this type! @@ -10833,13 +10149,17 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, - 0, 16, 1, 17, 4, 20, 5, 21, - 8, 24, 9, 25, 12, 28, 13, 29)) + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 0, 16, 1, 17, 4, 20, 5, 21, + // Second 128-bit lane. + 8, 24, 9, 25, 12, 28, 13, 29})) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); - if (isShuffleEquivalent(Mask, - 2, 18, 3, 19, 6, 22, 7, 23, - 10, 26, 11, 27, 14, 30, 15, 31)) + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 2, 18, 3, 19, 6, 22, 7, 23, + // Second 128-bit lane. + 10, 26, 11, 27, 14, 30, 15, 31})) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); // FIXME: Implement direct support for this type! @@ -10893,8 +10213,8 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, "Cannot lower 512-bit vectors w/ basic ISA!"); // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1, - Mask, Subtarget, DAG)) + if (SDValue Broadcast = + lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have supprot for @@ -10970,6 +10290,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } + // We actually see shuffles that are entirely re-arrangements of a set of + // zero inputs. This mostly happens while decomposing complex shuffles into + // simple ones. Directly lower these as a buildvector of zeros. + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + if (Zeroable.all()) + return getZeroVector(VT, Subtarget, DAG, dl); + // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. 
We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 @@ -11057,1586 +10384,6 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, llvm_unreachable("Unimplemented!"); } - -//===----------------------------------------------------------------------===// -// Legacy vector shuffle lowering -// -// This code is the legacy code handling vector shuffles until the above -// replaces its functionality and performance. -//===----------------------------------------------------------------------===// - -static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41, - bool hasInt256, unsigned *MaskOut = nullptr) { - MVT EltVT = VT.getVectorElementType(); - - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return false; - - if (!hasSSE41 || EltVT == MVT::i8) - return false; - if (!hasInt256 && VT == MVT::v16i16) - return false; - - unsigned MaskValue = 0; - unsigned NumElems = VT.getVectorNumElements(); - // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. - unsigned NumLanes = (NumElems - 1) / 8 + 1; - unsigned NumElemsInLane = NumElems / NumLanes; - - // Blend for v16i16 should be symetric for the both lanes. - for (unsigned i = 0; i < NumElemsInLane; ++i) { - - int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1; - int EltIdx = MaskVals[i]; - - if ((EltIdx < 0 || EltIdx == (int)i) && - (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) - continue; - - if (((unsigned)EltIdx == (i + NumElems)) && - (SndLaneEltIdx < 0 || - (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) - MaskValue |= (1 << i); - else - return false; - } - - if (MaskOut) - *MaskOut = MaskValue; - return true; -} - -// Try to lower a shuffle node into a simple blend instruction. -// This function assumes isBlendMask returns true for this -// SuffleVectorSDNode -static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - unsigned MaskValue, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - MVT EltVT = VT.getVectorElementType(); - assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), - Subtarget->hasInt256() && "Trying to lower a " - "VECTOR_SHUFFLE to a Blend but " - "with the wrong mask")); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - unsigned NumElems = VT.getVectorNumElements(); - - // Convert i32 vectors to floating point if it is not AVX2. - // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. - MVT BlendVT = VT; - if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { - BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); - V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); - } - - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, - DAG.getConstant(MaskValue, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Ret); -} - -/// In vector type \p VT, return true if the element at index \p InputIdx -/// falls on a different 128-bit lane than \p OutputIdx. -static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx, - unsigned OutputIdx) { - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128; -} - -/// Generate a PSHUFB if possible. Selects elements from \p V1 according to -/// \p MaskVals. 
MaskVals[OutputIdx] = InputIdx specifies that we want to -/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p -/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a -/// zero. -static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl, - SelectionDAG &DAG) { - MVT VT = V1.getSimpleValueType(); - assert(VT.is128BitVector() || VT.is256BitVector()); - - MVT EltVT = VT.getVectorElementType(); - unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8; - unsigned NumElts = VT.getVectorNumElements(); - - SmallVector<SDValue, 32> PshufbMask; - for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) { - int InputIdx = MaskVals[OutputIdx]; - unsigned InputByteIdx; - - if (InputIdx < 0 || NumElts <= (unsigned)InputIdx) - InputByteIdx = 0x80; - else { - // Cross lane is not allowed. - if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx)) - return SDValue(); - InputByteIdx = InputIdx * EltSizeInBytes; - // Index is an byte offset within the 128-bit lane. - InputByteIdx &= 0xf; - } - - for (unsigned j = 0; j < EltSizeInBytes; ++j) { - PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8)); - if (InputByteIdx != 0x80) - ++InputByteIdx; - } - } - - MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size()); - if (ShufVT != VT) - V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1); - return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1, - DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask)); -} - -// v8i16 shuffles - Prefer shuffles in the following order: -// 1. [all] pshuflw, pshufhw, optional move -// 2. [ssse3] 1 x pshufb -// 3. [ssse3] 2 x pshufb + 1 x por -// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) -static SDValue -LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - SmallVector<int, 8> MaskVals; - - // Determine if more than 1 of the words in each of the low and high quadwords - // of the result come from the same quadword of one of the two inputs. Undef - // mask values count as coming from any quadword, for better codegen. - // - // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input - // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2. - unsigned LoQuad[] = { 0, 0, 0, 0 }; - unsigned HiQuad[] = { 0, 0, 0, 0 }; - // Indices of quads used. - std::bitset<4> InputQuads; - for (unsigned i = 0; i < 8; ++i) { - unsigned *Quad = i < 4 ? LoQuad : HiQuad; - int EltIdx = SVOp->getMaskElt(i); - MaskVals.push_back(EltIdx); - if (EltIdx < 0) { - ++Quad[0]; - ++Quad[1]; - ++Quad[2]; - ++Quad[3]; - continue; - } - ++Quad[EltIdx / 4]; - InputQuads.set(EltIdx / 4); - } - - int BestLoQuad = -1; - unsigned MaxQuad = 1; - for (unsigned i = 0; i < 4; ++i) { - if (LoQuad[i] > MaxQuad) { - BestLoQuad = i; - MaxQuad = LoQuad[i]; - } - } - - int BestHiQuad = -1; - MaxQuad = 1; - for (unsigned i = 0; i < 4; ++i) { - if (HiQuad[i] > MaxQuad) { - BestHiQuad = i; - MaxQuad = HiQuad[i]; - } - } - - // For SSSE3, If all 8 words of the result come from only 1 quadword of each - // of the two input vectors, shuffle them into one input vector so only a - // single pshufb instruction is necessary. If there are more than 2 input - // quads, disable the next transformation since it does not help SSSE3. 
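For reference while reading the removed code: the legacy v8i16 path's first step is the quadword scoring described in the comment above. Each half of the result votes for the input quadword (0-1 from V1, 2-3 from V2) that supplies most of its words, undef words count toward every quad, and a winner needs more than one vote. A stand-alone rendition of that heuristic, with bestQuad as an invented name:

  #include <array>

  static int bestQuad(const std::array<int, 8> &Mask, bool HighHalf) {
    std::array<int, 4> Count = {0, 0, 0, 0};
    for (int i = HighHalf ? 4 : 0, e = HighHalf ? 8 : 4; i < e; ++i) {
      if (Mask[i] < 0) {
        for (int &C : Count)
          ++C;                     // undef words feed every quad
        continue;
      }
      ++Count[Mask[i] / 4];
    }
    int Best = -1, Max = 1;        // require more than one word from the quad
    for (int q = 0; q < 4; ++q)
      if (Count[q] > Max) {
        Best = q;
        Max = Count[q];
      }
    return Best;
  }

  int main() {
    // Low half mostly from words 8-11 (first quadword of V2), high half from
    // words 4-7 (second quadword of V1).
    std::array<int, 8> Mask = {8, 9, -1, 10, 4, 5, 6, -1};
    return (bestQuad(Mask, false) == 2 && bestQuad(Mask, true) == 1) ? 0 : 1;
  }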
- bool V1Used = InputQuads[0] || InputQuads[1]; - bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3()) { - if (InputQuads.count() == 2 && V1Used && V2Used) { - BestLoQuad = InputQuads[0] ? 0 : 1; - BestHiQuad = InputQuads[2] ? 2 : 3; - } - if (InputQuads.count() > 2) { - BestLoQuad = -1; - BestHiQuad = -1; - } - } - - // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update - // the shuffle mask. If a quad is scored as -1, that means that it contains - // words from all 4 input quadwords. - SDValue NewV; - if (BestLoQuad >= 0 || BestHiQuad >= 0) { - int MaskV[] = { - BestLoQuad < 0 ? 0 : BestLoQuad, - BestHiQuad < 0 ? 1 : BestHiQuad - }; - NewV = DAG.getVectorShuffle(MVT::v2i64, dl, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); - NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); - - // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the - // source words for the shuffle, to aid later transformations. - bool AllWordsInNewV = true; - bool InOrder[2] = { true, true }; - for (unsigned i = 0; i != 8; ++i) { - int idx = MaskVals[i]; - if (idx != (int)i) - InOrder[i/4] = false; - if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) - continue; - AllWordsInNewV = false; - break; - } - - bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; - if (AllWordsInNewV) { - for (int i = 0; i != 8; ++i) { - int idx = MaskVals[i]; - if (idx < 0) - continue; - idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; - if ((idx != i) && idx < 4) - pshufhw = false; - if ((idx != i) && idx > 3) - pshuflw = false; - } - V1 = NewV; - V2Used = false; - BestLoQuad = 0; - BestHiQuad = 1; - } - - // If we've eliminated the use of V2, and the new mask is a pshuflw or - // pshufhw, that's as cheap as it gets. Return the new shuffle. - if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { - unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; - unsigned TargetMask = 0; - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, - DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); - TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): - getShufflePSHUFLWImmediate(SVOp); - V1 = NewV.getOperand(0); - return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); - } - } - - // Promote splats to a larger type which usually leads to more efficient code. - // FIXME: Is this true if pshufb is available? - if (SVOp->isSplat()) - return PromoteSplat(SVOp, DAG); - - // If we have SSSE3, and all words of the result are from 1 input vector, - // case 2 is generated, otherwise case 3 is generated. If no SSSE3 - // is present, fall back to case 4. - if (Subtarget->hasSSSE3()) { - SmallVector<SDValue,16> pshufbMask; - - // If we have elements from both input vectors, set the high bit of the - // shuffle mask element to zero out elements that come from V2 in the V1 - // mask, and elements that come from V1 in the V2 mask, so that the two - // results can be OR'd together. - bool TwoInputs = V1Used && V2Used; - V1 = getPSHUFB(MaskVals, V1, dl, DAG); - if (!TwoInputs) - return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); - - // Calculate the shuffle mask for the second input, shuffle it, and - // OR it with the first shuffled input. 
- CommuteVectorShuffleMask(MaskVals, 8); - V2 = getPSHUFB(MaskVals, V2, dl, DAG); - V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); - return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); - } - - // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, - // and update MaskVals with new element order. - std::bitset<8> InOrder; - if (BestLoQuad >= 0) { - int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; - for (int i = 0; i != 4; ++i) { - int idx = MaskVals[i]; - if (idx < 0) { - InOrder.set(i); - } else if ((idx / 4) == BestLoQuad) { - MaskV[i] = idx & 3; - InOrder.set(i); - } - } - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), - &MaskV[0]); - - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); - NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, - NewV.getOperand(0), - getShufflePSHUFLWImmediate(SVOp), DAG); - } - } - - // If BestHi >= 0, generate a pshufhw to put the high elements in order, - // and update MaskVals with the new element order. - if (BestHiQuad >= 0) { - int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; - for (unsigned i = 4; i != 8; ++i) { - int idx = MaskVals[i]; - if (idx < 0) { - InOrder.set(i); - } else if ((idx / 4) == BestHiQuad) { - MaskV[i] = (idx & 3) + 4; - InOrder.set(i); - } - } - NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), - &MaskV[0]); - - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); - NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, - NewV.getOperand(0), - getShufflePSHUFHWImmediate(SVOp), DAG); - } - } - - // In case BestHi & BestLo were both -1, which means each quadword has a word - // from each of the four input quadwords, calculate the InOrder bitvector now - // before falling through to the insert/extract cleanup. - if (BestLoQuad == -1 && BestHiQuad == -1) { - NewV = V1; - for (int i = 0; i != 8; ++i) - if (MaskVals[i] < 0 || MaskVals[i] == i) - InOrder.set(i); - } - - // The other elements are put in the right place using pextrw and pinsrw. - for (unsigned i = 0; i != 8; ++i) { - if (InOrder[i]) - continue; - int EltIdx = MaskVals[i]; - if (EltIdx < 0) - continue; - SDValue ExtOp = (EltIdx < 8) ? - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, - DAG.getIntPtrConstant(EltIdx)) : - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, - DAG.getIntPtrConstant(EltIdx - 8)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, - DAG.getIntPtrConstant(i)); - } - return NewV; -} - -/// \brief v16i16 shuffles -/// -/// FIXME: We only support generation of a single pshufb currently. We can -/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as -/// well (e.g 2 x pshufb + 1 x por). -static SDValue -LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - - if (V2.getOpcode() != ISD::UNDEF) - return SDValue(); - - SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); - return getPSHUFB(MaskVals, V1, dl, DAG); -} - -// v16i8 shuffles - Prefer shuffles in the following order: -// 1. [ssse3] 1 x pshufb -// 2. [ssse3] 2 x pshufb + 1 x por -// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw -static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, - const X86Subtarget* Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - ArrayRef<int> MaskVals = SVOp->getMask(); - - // Promote splats to a larger type which usually leads to more efficient code. - // FIXME: Is this true if pshufb is available? - if (SVOp->isSplat()) - return PromoteSplat(SVOp, DAG); - - // If we have SSSE3, case 1 is generated when all result bytes come from - // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is - // present, fall back to case 3. - - // If SSSE3, use 1 pshufb instruction per vector with elements in the result. - if (Subtarget->hasSSSE3()) { - SmallVector<SDValue,16> pshufbMask; - - // If all result elements are from one input vector, then only translate - // undef mask values to 0x80 (zero out result) in the pshufb mask. - // - // Otherwise, we have elements from both input vectors, and must zero out - // elements that come from V2 in the first mask, and V1 in the second mask - // so that we can OR them together. - for (unsigned i = 0; i != 16; ++i) { - int EltIdx = MaskVals[i]; - if (EltIdx < 0 || EltIdx >= 16) - EltIdx = 0x80; - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); - } - V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, - DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, pshufbMask)); - - // As PSHUFB will zero elements with negative indices, it's safe to ignore - // the 2nd operand if it's undefined or zero. - if (V2.getOpcode() == ISD::UNDEF || - ISD::isBuildVectorAllZeros(V2.getNode())) - return V1; - - // Calculate the shuffle mask for the second input, shuffle it, and - // OR it with the first shuffled input. - pshufbMask.clear(); - for (unsigned i = 0; i != 16; ++i) { - int EltIdx = MaskVals[i]; - EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); - } - V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, - DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::v16i8, pshufbMask)); - return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); - } - - // No SSSE3 - Calculate in place words and then fix all out of place words - // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from - // the 16 different words that comprise the two doublequadword input vectors. - V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); - SDValue NewV = V1; - for (int i = 0; i != 8; ++i) { - int Elt0 = MaskVals[i*2]; - int Elt1 = MaskVals[i*2+1]; - - // This word of the result is all undef, skip it. - if (Elt0 < 0 && Elt1 < 0) - continue; - - // This word of the result is already in the correct place, skip it. - if ((Elt0 == i*2) && (Elt1 == i*2+1)) - continue; - - SDValue Elt0Src = Elt0 < 16 ? V1 : V2; - SDValue Elt1Src = Elt1 < 16 ? V1 : V2; - SDValue InsElt; - - // If Elt0 and Elt1 are defined, are consecutive, and can be load - // using a single extract together, load it and store it. - if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { - InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, - DAG.getIntPtrConstant(Elt1 / 2)); - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, - DAG.getIntPtrConstant(i)); - continue; - } - - // If Elt1 is defined, extract it from the appropriate source. 
If the - // source byte is not also odd, shift the extracted word left 8 bits - // otherwise clear the bottom 8 bits if we need to do an or. - if (Elt1 >= 0) { - InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, - DAG.getIntPtrConstant(Elt1 / 2)); - if ((Elt1 & 1) == 0) - InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, - DAG.getConstant(8, - TLI.getShiftAmountTy(InsElt.getValueType()))); - else if (Elt0 >= 0) - InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, - DAG.getConstant(0xFF00, MVT::i16)); - } - // If Elt0 is defined, extract it from the appropriate source. If the - // source byte is not also even, shift the extracted word right 8 bits. If - // Elt1 was also defined, OR the extracted values together before - // inserting them in the result. - if (Elt0 >= 0) { - SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, - Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); - if ((Elt0 & 1) != 0) - InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, - DAG.getConstant(8, - TLI.getShiftAmountTy(InsElt0.getValueType()))); - else if (Elt1 >= 0) - InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, - DAG.getConstant(0x00FF, MVT::i16)); - InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) - : InsElt0; - } - NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, - DAG.getIntPtrConstant(i)); - } - return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); -} - -// v32i8 shuffles - Translate to VPSHUFB if possible. -static -SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); - - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - // VPSHUFB may be generated if - // (1) one of input vector is undefined or zeroinitializer. - // The mask value 0x80 puts 0 in the corresponding slot of the vector. - // And (2) the mask indexes don't cross the 128-bit lane. - if (VT != MVT::v32i8 || !Subtarget->hasInt256() || - (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) - return SDValue(); - - if (V1IsAllZero && !V2IsAllZero) { - CommuteVectorShuffleMask(MaskVals, 32); - V1 = V2; - } - return getPSHUFB(MaskVals, V1, dl, DAG); -} - -/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide -/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be -/// done when every pair / quad of shuffle mask elements point to elements in -/// the right sequence. e.g. 
-/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> -static -SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG) { - MVT VT = SVOp->getSimpleValueType(0); - SDLoc dl(SVOp); - unsigned NumElems = VT.getVectorNumElements(); - MVT NewVT; - unsigned Scale; - switch (VT.SimpleTy) { - default: llvm_unreachable("Unexpected!"); - case MVT::v2i64: - case MVT::v2f64: - return SDValue(SVOp, 0); - case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; - case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; - case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; - case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; - case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; - case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; - } - - SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i != NumElems; i += Scale) { - int StartIdx = -1; - for (unsigned j = 0; j != Scale; ++j) { - int EltIdx = SVOp->getMaskElt(i+j); - if (EltIdx < 0) - continue; - if (StartIdx < 0) - StartIdx = (EltIdx / Scale); - if (EltIdx != (int)(StartIdx*Scale + j)) - return SDValue(); - } - MaskVec.push_back(StartIdx); - } - - SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); - SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); - return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); -} - -/// getVZextMovL - Return a zero-extending vector move low node. -/// -static SDValue getVZextMovL(MVT VT, MVT OpVT, - SDValue SrcOp, SelectionDAG &DAG, - const X86Subtarget *Subtarget, SDLoc dl) { - if (VT == MVT::v2f64 || VT == MVT::v4f32) { - LoadSDNode *LD = nullptr; - if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) - LD = dyn_cast<LoadSDNode>(SrcOp); - if (!LD) { - // movssrr and movsdrr do not clear top bits. Try to use movd, movq - // instead. - MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; - if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && - SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && - SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && - SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { - // PR2108 - OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - OpVT, - SrcOp.getOperand(0) - .getOperand(0)))); - } - } - } - - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, - DAG.getNode(ISD::BITCAST, dl, - OpVT, SrcOp))); -} - -/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles -/// which could not be matched by any known target speficic shuffle -static SDValue -LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - - SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); - if (NewOp.getNode()) - return NewOp; - - MVT VT = SVOp->getSimpleValueType(0); - - unsigned NumElems = VT.getVectorNumElements(); - unsigned NumLaneElems = NumElems / 2; - - SDLoc dl(SVOp); - MVT EltVT = VT.getVectorElementType(); - MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); - SDValue Output[2]; - - SmallVector<int, 16> Mask; - for (unsigned l = 0; l < 2; ++l) { - // Build a shuffle mask for the output, discovering on the fly which - // input vectors to use as shuffle operands (recorded in InputUsed). - // If building a suitable shuffle vector proves too hard, then bail - // out with UseBuildVector set. - bool UseBuildVector = false; - int InputUsed[2] = { -1, -1 }; // Not yet discovered. 
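The loop that follows discovers, per 128-bit output lane, which of the four input halves are actually referenced and bails out to a BUILD_VECTOR when more than two are needed. A standalone sketch of that bookkeeping on a plain integer mask; the names are illustrative:

#include <cstdio>
#include <vector>

// Per-lane plan: which 128-bit input halves feed this lane and the remapped
// mask, or UseBuildVector when more than two halves are needed.
struct LanePlan {
  int InputUsed[2] = {-1, -1};
  std::vector<int> Mask;
  bool UseBuildVector = false;
};

static LanePlan planLane(const std::vector<int> &M, unsigned LaneStart,
                         unsigned NumLaneElems) {
  LanePlan P;
  for (unsigned i = 0; i != NumLaneElems; ++i) {
    int Idx = M[LaneStart + i];
    if (Idx < 0) { P.Mask.push_back(-1); continue; }
    int Input = Idx / (int)NumLaneElems;      // which 128-bit half Idx lives in
    Idx -= Input * (int)NumLaneElems;         // offset within that half
    unsigned OpNo = 0;
    for (; OpNo != 2; ++OpNo) {
      if (P.InputUsed[OpNo] == Input)
        break;                                // this half is already an operand
      if (P.InputUsed[OpNo] < 0) {
        P.InputUsed[OpNo] = Input;            // claim a free operand slot
        break;
      }
    }
    if (OpNo == 2) {                          // a third half: give up
      P.UseBuildVector = true;
      P.Mask.clear();
      return P;
    }
    P.Mask.push_back(Idx + (int)OpNo * (int)NumLaneElems);
  }
  return P;
}

int main() {
  // v8i32 shuffle mask; plan the low output lane (elements 0..3).
  LanePlan P = planLane({0, 9, 1, 8, 4, 5, 6, 7}, 0, 4);
  std::printf("inputs: %d %d  build_vector: %d\n", P.InputUsed[0],
              P.InputUsed[1], (int)P.UseBuildVector);
}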
- unsigned LaneStart = l * NumLaneElems; - for (unsigned i = 0; i != NumLaneElems; ++i) { - // The mask element. This indexes into the input. - int Idx = SVOp->getMaskElt(i+LaneStart); - if (Idx < 0) { - // the mask element does not index into any input vector. - Mask.push_back(-1); - continue; - } - - // The input vector this mask element indexes into. - int Input = Idx / NumLaneElems; - - // Turn the index into an offset from the start of the input vector. - Idx -= Input * NumLaneElems; - - // Find or create a shuffle vector operand to hold this input. - unsigned OpNo; - for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { - if (InputUsed[OpNo] == Input) - // This input vector is already an operand. - break; - if (InputUsed[OpNo] < 0) { - // Create a new operand for this input vector. - InputUsed[OpNo] = Input; - break; - } - } - - if (OpNo >= array_lengthof(InputUsed)) { - // More than two input vectors used! Give up on trying to create a - // shuffle vector. Insert all elements into a BUILD_VECTOR instead. - UseBuildVector = true; - break; - } - - // Add the mask index for the new shuffle vector. - Mask.push_back(Idx + OpNo * NumLaneElems); - } - - if (UseBuildVector) { - SmallVector<SDValue, 16> SVOps; - for (unsigned i = 0; i != NumLaneElems; ++i) { - // The mask element. This indexes into the input. - int Idx = SVOp->getMaskElt(i+LaneStart); - if (Idx < 0) { - SVOps.push_back(DAG.getUNDEF(EltVT)); - continue; - } - - // The input vector this mask element indexes into. - int Input = Idx / NumElems; - - // Turn the index into an offset from the start of the input vector. - Idx -= Input * NumElems; - - // Extract the vector element by hand. - SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, - SVOp->getOperand(Input), - DAG.getIntPtrConstant(Idx))); - } - - // Construct the output using a BUILD_VECTOR. - Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps); - } else if (InputUsed[0] < 0) { - // No input vectors were used! The result is undefined. - Output[l] = DAG.getUNDEF(NVT); - } else { - SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), - (InputUsed[0] % 2) * NumLaneElems, - DAG, dl); - // If only one input was used, use an undefined vector for the other. - SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : - Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), - (InputUsed[1] % 2) * NumLaneElems, DAG, dl); - // At least one input vector was used. Create a new shuffle vector. - Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); - } - - Mask.clear(); - } - - // Concatenate the result back - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); -} - -/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with -/// 4 elements, and match them with several different shuffle types. 
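In the Extract128BitVector calls above, the InputUsed values number the four 128-bit halves of the two source vectors, so InputUsed / 2 picks the operand and (InputUsed % 2) * NumLaneElems the first element of that half. A tiny sketch of that mapping, with an assumed lane width of four elements:

#include <cstdio>
#include <utility>

// Maps a flat half number 0..3 back to (operand, first element of the half).
static std::pair<int, unsigned> halfToOperand(int Input, unsigned NumLaneElems) {
  return {Input / 2, (unsigned)(Input % 2) * NumLaneElems};
}

int main() {
  for (int Input = 0; Input != 4; ++Input) {
    auto [Op, Start] = halfToOperand(Input, 4); // e.g. v8i32: 4 elements a lane
    std::printf("half %d -> operand V%d, elements [%u..%u)\n", Input, Op + 1,
                Start, Start + 4);
  }
}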
-static SDValue -LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - SDLoc dl(SVOp); - MVT VT = SVOp->getSimpleValueType(0); - - assert(VT.is128BitVector() && "Unsupported vector size"); - - std::pair<int, int> Locs[4]; - int Mask1[] = { -1, -1, -1, -1 }; - SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); - - unsigned NumHi = 0; - unsigned NumLo = 0; - for (unsigned i = 0; i != 4; ++i) { - int Idx = PermMask[i]; - if (Idx < 0) { - Locs[i] = std::make_pair(-1, -1); - } else { - assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); - if (Idx < 4) { - Locs[i] = std::make_pair(0, NumLo); - Mask1[NumLo] = Idx; - NumLo++; - } else { - Locs[i] = std::make_pair(1, NumHi); - if (2+NumHi < 4) - Mask1[2+NumHi] = Idx; - NumHi++; - } - } - } - - if (NumLo <= 2 && NumHi <= 2) { - // If no more than two elements come from either vector. This can be - // implemented with two shuffles. First shuffle gather the elements. - // The second shuffle, which takes the first shuffle as both of its - // vector operands, put the elements into the right order. - V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - - int Mask2[] = { -1, -1, -1, -1 }; - - for (unsigned i = 0; i != 4; ++i) - if (Locs[i].first != -1) { - unsigned Idx = (i < 2) ? 0 : 4; - Idx += Locs[i].first * 2 + Locs[i].second; - Mask2[i] = Idx; - } - - return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); - } - - if (NumLo == 3 || NumHi == 3) { - // Otherwise, we must have three elements from one vector, call it X, and - // one element from the other, call it Y. First, use a shufps to build an - // intermediate vector with the one element from Y and the element from X - // that will be in the same half in the final destination (the indexes don't - // matter). Then, use a shufps to build the final vector, taking the half - // containing the element from Y from the intermediate, and the other half - // from X. - if (NumHi == 3) { - // Normalize it so the 3 elements come from V1. - CommuteVectorShuffleMask(PermMask, 4); - std::swap(V1, V2); - } - - // Find the element from V2. - unsigned HiIndex; - for (HiIndex = 0; HiIndex < 3; ++HiIndex) { - int Val = PermMask[HiIndex]; - if (Val < 0) - continue; - if (Val >= 4) - break; - } - - Mask1[0] = PermMask[HiIndex]; - Mask1[1] = -1; - Mask1[2] = PermMask[HiIndex^1]; - Mask1[3] = -1; - V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - - if (HiIndex >= 2) { - Mask1[0] = PermMask[0]; - Mask1[1] = PermMask[1]; - Mask1[2] = HiIndex & 1 ? 6 : 4; - Mask1[3] = HiIndex & 1 ? 4 : 6; - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - } - - Mask1[0] = HiIndex & 1 ? 2 : 0; - Mask1[1] = HiIndex & 1 ? 0 : 2; - Mask1[2] = PermMask[2]; - Mask1[3] = PermMask[3]; - if (Mask1[2] >= 0) - Mask1[2] += 4; - if (Mask1[3] >= 0) - Mask1[3] += 4; - return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); - } - - // Break it into (shuffle shuffle_hi, shuffle_lo). 
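For the NumLo <= 2 && NumHi <= 2 case above, the first shuffle gathers the needed elements and the second reorders them. A standalone simulation of that two-step plan for the mask <0,4,1,5>; the intermediate masks shown are hand-derived by following the code above, and the 4-lane simulator is purely illustrative:

#include <array>
#include <cstdio>

// 4-lane shuffle simulator: indices 0..3 pick from A, 4..7 from B, -1 is
// undef (modelled as 0 here).
static std::array<int, 4> shuf(const std::array<int, 4> &A,
                               const std::array<int, 4> &B,
                               const std::array<int, 4> &M) {
  std::array<int, 4> R{};
  for (int i = 0; i != 4; ++i)
    R[i] = M[i] < 0 ? 0 : (M[i] < 4 ? A[M[i]] : B[M[i] - 4]);
  return R;
}

int main() {
  std::array<int, 4> V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  std::array<int, 4> Want = {0, 4, 1, 5};    // two elements from each source

  // Step 1 gathers: V1's elements land in the low half, V2's in the high half.
  std::array<int, 4> Mask1 = {0, 1, 4, 5};
  // Step 2 reorders within the gathered vector (both operands are the same
  // vector, so the +4 in the high entries just names its second copy).
  std::array<int, 4> Mask2 = {0, 2, 5, 7};

  auto T = shuf(V1, V2, Mask1);
  auto R = shuf(T, T, Mask2);
  std::printf("matches direct shuffle: %d\n", (int)(R == shuf(V1, V2, Want)));
}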
- int LoMask[] = { -1, -1, -1, -1 }; - int HiMask[] = { -1, -1, -1, -1 }; - - int *MaskPtr = LoMask; - unsigned MaskIdx = 0; - unsigned LoIdx = 0; - unsigned HiIdx = 2; - for (unsigned i = 0; i != 4; ++i) { - if (i == 2) { - MaskPtr = HiMask; - MaskIdx = 1; - LoIdx = 0; - HiIdx = 2; - } - int Idx = PermMask[i]; - if (Idx < 0) { - Locs[i] = std::make_pair(-1, -1); - } else if (Idx < 4) { - Locs[i] = std::make_pair(MaskIdx, LoIdx); - MaskPtr[LoIdx] = Idx; - LoIdx++; - } else { - Locs[i] = std::make_pair(MaskIdx, HiIdx); - MaskPtr[HiIdx] = Idx; - HiIdx++; - } - } - - SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); - SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); - int MaskOps[] = { -1, -1, -1, -1 }; - for (unsigned i = 0; i != 4; ++i) - if (Locs[i].first != -1) - MaskOps[i] = Locs[i].first * 4 + Locs[i].second; - return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); -} - -static bool MayFoldVectorLoad(SDValue V) { - while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) - V = V.getOperand(0); - if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && - V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) - // BUILD_VECTOR (load), undef - V = V.getOperand(0); - - return MayFoldLoad(V); -} - -static -SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - - // Canonizalize to v2f64. - V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); - return DAG.getNode(ISD::BITCAST, dl, VT, - getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, - V1, DAG)); -} - -static -SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, - bool HasSSE2) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - MVT VT = Op.getSimpleValueType(); - - assert(VT != MVT::v2i64 && "unsupported shuffle type"); - - if (HasSSE2 && VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); - - // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) - return DAG.getNode(ISD::BITCAST, dl, VT, - getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, - DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), - DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); -} - -static -SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - MVT VT = Op.getSimpleValueType(); - - assert((VT == MVT::v4i32 || VT == MVT::v4f32) && - "unsupported shuffle type"); - - if (V2.getOpcode() == ISD::UNDEF) - V2 = V1; - - // v4i32 or v4f32 - return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); -} - -static -SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - MVT VT = Op.getSimpleValueType(); - unsigned NumElems = VT.getVectorNumElements(); - - // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second - // operand of these instructions is only memory, so check if there's a - // potencial load folding here, otherwise use SHUFPS or MOVSD to match the - // same masks. - bool CanFoldLoad = false; - - // Trivial case, when V2 comes from a load. 
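MayFoldVectorLoad, used just below, peels single-use BITCAST, SCALAR_TO_VECTOR and BUILD_VECTOR(x, undef) wrappers before asking whether a plain load remains. A toy model with an invented node struct, only to show the shape of the walk:

#include <cstdio>

// Invented node representation; kinds and fields exist only for this sketch.
enum Kind { Load, BitCast, ScalarToVector, BuildVectorWithUndef, Other };
struct Node {
  Kind K;
  const Node *Op0 = nullptr;
  bool OneUse = true;
};

static bool mayFoldVectorLoad(const Node *V) {
  while (V->OneUse && V->K == BitCast)           // look through bitcasts
    V = V->Op0;
  if (V->OneUse && V->K == ScalarToVector)       // (scalar_to_vector x)
    V = V->Op0;
  if (V->OneUse && V->K == BuildVectorWithUndef) // (build_vector x, undef)
    V = V->Op0;
  return V->K == Load;                           // is a plain load left?
}

int main() {
  Node Ld{Load};
  Node S2V{ScalarToVector, &Ld};
  Node BC{BitCast, &S2V};
  std::printf("foldable: %d\n", (int)mayFoldVectorLoad(&BC));
}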
- if (MayFoldVectorLoad(V2)) - CanFoldLoad = true; - - // When V1 is a load, it can be folded later into a store in isel, example: - // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) - // turns into: - // (MOVLPSmr addr:$src1, VR128:$src2) - // So, recognize this potential and also use MOVLPS or MOVLPD - else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) - CanFoldLoad = true; - - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - if (CanFoldLoad) { - if (HasSSE2 && NumElems == 2) - return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); - - if (NumElems == 4) - // If we don't care about the second element, proceed to use movss. - if (SVOp->getMaskElt(1) != -1) - return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); - } - - // movl and movlp will both match v2i64, but v2i64 is never matched by - // movl earlier because we make it strict to avoid messing with the movlp load - // folding logic (see the code above getMOVLP call). Match it here then, - // this is horrible, but will stay like this until we move all shuffle - // matching to x86 specific nodes. Note that for the 1st condition all - // types are matched with movsd. - if (HasSSE2) { - // FIXME: isMOVLMask should be checked and matched before getMOVLP, - // as to remove this logic from here, as much as possible - if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) - return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); - return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); - } - - assert(VT != MVT::v4i32 && "unsupported shuffle type"); - - // Invert the operand order and use SHUFPS to match it. - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, - getShuffleSHUFImmediate(SVOp), DAG); -} - -static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, - SelectionDAG &DAG) { - SDLoc dl(Load); - MVT VT = Load->getSimpleValueType(0); - MVT EVT = VT.getVectorElementType(); - SDValue Addr = Load->getOperand(1); - SDValue NewAddr = DAG.getNode( - ISD::ADD, dl, Addr.getSimpleValueType(), Addr, - DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); - - SDValue NewLoad = - DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, - DAG.getMachineFunction().getMachineMemOperand( - Load->getMemOperand(), 0, EVT.getStoreSize())); - return NewLoad; -} - -// It is only safe to call this function if isINSERTPSMask is true for -// this shufflevector mask. -static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, - SelectionDAG &DAG) { - // Generate an insertps instruction when inserting an f32 from memory onto a - // v4f32 or when copying a member from one v4f32 to another. - // We also use it for transferring i32 from one register to another, - // since it simply copies the same bits. - // If we're transferring an i32 from memory to a specific element in a - // register, we output a generic DAG that will match the PINSRD - // instruction. 
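NarrowVectorLoadToElement above only adjusts the address: element Index of a vector load can be reloaded from the base address plus Index times the element store size. A trivial scalar sketch of that arithmetic, with integers standing in for SDValues:

#include <cstdint>
#include <cstdio>

// Address of one element of a wider vector load.
static uint64_t narrowedElementAddr(uint64_t Addr, unsigned Index,
                                    unsigned EltStoreSize) {
  return Addr + (uint64_t)Index * EltStoreSize;
}

int main() {
  // Element 2 of a v4f32 load at 0x1000 sits at 0x1008.
  std::printf("0x%llx\n",
              (unsigned long long)narrowedElementAddr(0x1000, 2, 4));
}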
- MVT VT = SVOp->getSimpleValueType(0); - MVT EVT = VT.getVectorElementType(); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); - auto Mask = SVOp->getMask(); - assert((VT == MVT::v4f32 || VT == MVT::v4i32) && - "unsupported vector type for insertps/pinsrd"); - - auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; }; - auto FromV2Predicate = [](const int &i) { return i >= 4; }; - int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate); - - SDValue From; - SDValue To; - unsigned DestIndex; - if (FromV1 == 1) { - From = V1; - To = V2; - DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) - - Mask.begin(); - - // If we have 1 element from each vector, we have to check if we're - // changing V1's element's place. If so, we're done. Otherwise, we - // should assume we're changing V2's element's place and behave - // accordingly. - int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate); - assert(DestIndex <= INT32_MAX && "truncated destination index"); - if (FromV1 == FromV2 && - static_cast<int>(DestIndex) == Mask[DestIndex] % 4) { - From = V2; - To = V1; - DestIndex = - std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); - } - } else { - assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 && - "More than one element from V1 and from V2, or no elements from one " - "of the vectors. This case should not have returned true from " - "isINSERTPSMask"); - From = V2; - To = V1; - DestIndex = - std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin(); - } - - // Get an index into the source vector in the range [0,4) (the mask is - // in the range [0,8) because it can address V1 and V2) - unsigned SrcIndex = Mask[DestIndex] % 4; - if (MayFoldLoad(From)) { - // Trivial case, when From comes from a load and is only used by the - // shuffle. Make it use insertps from the vector that we need from that - // load. - SDValue NewLoad = - NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG); - if (!NewLoad.getNode()) - return SDValue(); - - if (EVT == MVT::f32) { - // Create this as a scalar to vector to match the instruction pattern. - SDValue LoadScalarToVector = - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad); - SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector, - InsertpsMask); - } else { // EVT == MVT::i32 - // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT - // instruction, to match the PINSRD instruction, which loads an i32 to a - // certain vector element. - return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad, - DAG.getConstant(DestIndex, MVT::i32)); - } - } - - // Vector-element-to-vector - SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask); -} - -// Reduce a vector shuffle to zext. -static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - // PMOVZX is only available from SSE41. - if (!Subtarget->hasSSE41()) - return SDValue(); - - MVT VT = Op.getSimpleValueType(); - - // Only AVX2 support 256-bit vector integer extending. 
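A simplified standalone version of the mask analysis in getINSERTPS above, assuming the mask already passed isINSERTPSMask; it finds which element is inserted and where, but omits the tie-break for the one-element-from-each-vector case handled in the real code:

#include <algorithm>
#include <array>
#include <cstdio>

struct InsertPlan { bool FromV2; unsigned DestIndex; unsigned SrcIndex; };

// For a v4 mask over V1 (0..3) and V2 (4..7), find which element is inserted
// and into which result lane.
static InsertPlan analyzeInsertPS(const std::array<int, 4> &Mask) {
  auto fromV1 = [](int i) { return i > -1 && i < 4; };
  auto fromV2 = [](int i) { return i >= 4; };
  InsertPlan P;
  if (std::count_if(Mask.begin(), Mask.end(), fromV1) == 1) {
    P.FromV2 = false;                         // one V1 element goes into V2
    P.DestIndex =
        std::find_if(Mask.begin(), Mask.end(), fromV1) - Mask.begin();
  } else {
    P.FromV2 = true;                          // one V2 element goes into V1
    P.DestIndex =
        std::find_if(Mask.begin(), Mask.end(), fromV2) - Mask.begin();
  }
  P.SrcIndex = Mask[P.DestIndex] % 4;         // index within the source vector
  return P;
}

int main() {
  // <0, 1, 6, 3>: element 2 of V2 is inserted into lane 2 of the result.
  InsertPlan P = analyzeInsertPS({0, 1, 6, 3});
  std::printf("fromV2=%d dest=%u src=%u\n", (int)P.FromV2, P.DestIndex,
              P.SrcIndex);
}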
- if (!Subtarget->hasInt256() && VT.is256BitVector()) - return SDValue(); - - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - SDLoc DL(Op); - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - unsigned NumElems = VT.getVectorNumElements(); - - // Extending is an unary operation and the element type of the source vector - // won't be equal to or larger than i64. - if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || - VT.getVectorElementType() == MVT::i64) - return SDValue(); - - // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. - unsigned Shift = 1; // Start from 2, i.e. 1 << 1. - while ((1U << Shift) < NumElems) { - if (SVOp->getMaskElt(1U << Shift) == 1) - break; - Shift += 1; - // The maximal ratio is 8, i.e. from i8 to i64. - if (Shift > 3) - return SDValue(); - } - - // Check the shuffle mask. - unsigned Mask = (1U << Shift) - 1; - for (unsigned i = 0; i != NumElems; ++i) { - int EltIdx = SVOp->getMaskElt(i); - if ((i & Mask) != 0 && EltIdx != -1) - return SDValue(); - if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) - return SDValue(); - } - - unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; - MVT NeVT = MVT::getIntegerVT(NBits); - MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift); - - if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) - return SDValue(); - - return DAG.getNode(ISD::BITCAST, DL, VT, - DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); -} - -static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - MVT VT = Op.getSimpleValueType(); - SDLoc dl(Op); - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - - if (isZeroShuffle(SVOp)) - return getZeroVector(VT, Subtarget, DAG, dl); - - // Handle splat operations - if (SVOp->isSplat()) { - // Use vbroadcast whenever the splat comes from a foldable load - SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG); - if (Broadcast.getNode()) - return Broadcast; - } - - // Check integer expanding shuffles. - SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - - // If the shuffle can be profitably rewritten as a narrower shuffle, then - // do it! - if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || - VT == MVT::v32i8) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); - if (NewOp.getNode()) - return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); - } else if (VT.is128BitVector() && Subtarget->hasSSE2()) { - // FIXME: Figure out a cleaner way to do this. 
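The loop above recognizes shuffle masks that behave like a zero extension. A standalone restatement of that check which simply tries each candidate ratio instead of probing position 1 << Shift first; the helper name and driver are illustrative:

#include <cstdio>
#include <vector>

// Returns the expansion ratio (2, 4 or 8) if the mask behaves like a zero
// extension, i.e. every position that is a multiple of the ratio reads
// element i >> Shift and every position in between is undef; 0 otherwise.
static unsigned matchZExtShuffle(const std::vector<int> &M) {
  unsigned NumElems = M.size();
  for (unsigned Shift = 1; Shift <= 3 && (1u << Shift) < NumElems; ++Shift) {
    unsigned LowBits = (1u << Shift) - 1;
    bool Ok = true;
    for (unsigned i = 0; i != NumElems && Ok; ++i) {
      if ((i & LowBits) != 0)
        Ok = M[i] == -1;                      // padding lanes must be undef
      else
        Ok = M[i] == (int)(i >> Shift);       // data lanes walk 0, 1, 2, ...
    }
    if (Ok)
      return 1u << Shift;
  }
  return 0;
}

int main() {
  // A v8i16 mask with the shape of a zext to v4i32.
  std::printf("ratio: %u\n", matchZExtShuffle({0, -1, 1, -1, 2, -1, 3, -1}));
}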
- if (ISD::isBuildVectorAllZeros(V2.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); - if (NewOp.getNode()) { - MVT NewVT = NewOp.getSimpleValueType(); - if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), - NewVT, true, false)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, - dl); - } - } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); - if (NewOp.getNode()) { - MVT NewVT = NewOp.getSimpleValueType(); - if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) - return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, - dl); - } - } - } - return SDValue(); -} - -SDValue -X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - MVT VT = Op.getSimpleValueType(); - SDLoc dl(Op); - unsigned NumElems = VT.getVectorNumElements(); - bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - bool V1IsSplat = false; - bool V2IsSplat = false; - bool HasSSE2 = Subtarget->hasSSE2(); - bool HasFp256 = Subtarget->hasFp256(); - bool HasInt256 = Subtarget->hasInt256(); - MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); - - // Check if we should use the experimental vector shuffle lowering. If so, - // delegate completely to that code path. - if (ExperimentalVectorShuffleLowering) - return lowerVectorShuffle(Op, Subtarget, DAG); - - assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); - - if (V1IsUndef && V2IsUndef) - return DAG.getUNDEF(VT); - - // When we create a shuffle node we put the UNDEF node to second operand, - // but in some cases the first operand may be transformed to UNDEF. - // In this case we should just commute the node. - if (V1IsUndef) - return DAG.getCommutedVectorShuffle(*SVOp); - - // Vector shuffle lowering takes 3 steps: - // - // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable - // narrowing and commutation of operands should be handled. - // 2) Matching of shuffles with known shuffle masks to x86 target specific - // shuffle nodes. - // 3) Rewriting of unmatched masks into new generic shuffle operations, - // so the shuffle can be broken into other shuffles and the legalizer can - // try the lowering again. - // - // The general idea is that no vector_shuffle operation should be left to - // be matched during isel, all of them must be converted to a target specific - // node here. - - // Normalize the input vectors. Here splats, zeroed vectors, profitable - // narrowing and commutation of operands should be handled. The actual code - // doesn't include all of those, work in progress... - SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - - SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); - - // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and - // unpckh_undef). Only use pshufd if speed is more important than size. 
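The "profitably rewritten as a narrower shuffle" test used above (RewriteAsNarrowerShuffle) accepts a mask only if every block of Scale entries reads Scale consecutive elements starting at a multiple of Scale. A standalone sketch, using the <2,3,10,11,0,1,14,15> example from that function's own comment:

#include <cstdio>
#include <vector>

// Returns the narrowed mask on success and an empty vector otherwise; undef
// entries act as wildcards within a block.
static std::vector<int> narrowMask(const std::vector<int> &M, unsigned Scale) {
  std::vector<int> Narrow;
  for (unsigned i = 0; i != M.size(); i += Scale) {
    int StartIdx = -1;
    for (unsigned j = 0; j != Scale; ++j) {
      int EltIdx = M[i + j];
      if (EltIdx < 0)
        continue;                             // undef matches anything
      if (StartIdx < 0)
        StartIdx = EltIdx / (int)Scale;       // the wide element this block reads
      if (EltIdx != StartIdx * (int)Scale + (int)j)
        return {};                            // not consecutive: give up
    }
    Narrow.push_back(StartIdx);               // -1 if the whole block was undef
  }
  return Narrow;
}

int main() {
  // The v8i16 mask <2,3,10,11,0,1,14,15> narrows to the v4i32 mask <1,5,0,7>.
  for (int I : narrowMask({2, 3, 10, 11, 0, 1, 14, 15}, 2))
    std::printf("%d ", I);
  std::printf("\n");
}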
- if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - - if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && - V2IsUndef && MayFoldVectorLoad(V1)) - return getMOVDDup(Op, dl, V1, DAG); - - if (isMOVHLPS_v_undef_Mask(M, VT)) - return getMOVHighToLow(Op, dl, DAG); - - // Use to match splats - if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef && - (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - - if (isPSHUFDMask(M, VT)) { - // The actual implementation will match the mask in the if above and then - // during isel it can match several different instructions, not only pshufd - // as its name says, sad but true, emulate the behavior for now... - if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) - return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); - - unsigned TargetMask = getShuffleSHUFImmediate(SVOp); - - if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) - return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); - - if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask, - DAG); - - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, - TargetMask, DAG); - } - - if (isPALIGNRMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, - getShufflePALIGNRImmediate(SVOp), - DAG); - - if (isVALIGNMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2, - getShuffleVALIGNImmediate(SVOp), - DAG); - - // Check if this can be converted into a logical shift. - bool isLeft = false; - unsigned ShAmt = 0; - SDValue ShVal; - bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); - if (isShift && ShVal.hasOneUse()) { - // If the shifted value has multiple uses, it may be cheaper to use - // v_set0 + movlhps or movhlps, etc. - MVT EltVT = VT.getVectorElementType(); - ShAmt *= EltVT.getSizeInBits(); - return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); - } - - if (isMOVLMask(M, VT)) { - if (ISD::isBuildVectorAllZeros(V1.getNode())) - return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!isMOVLPMask(M, VT)) { - if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); - - if (VT == MVT::v4i32 || VT == MVT::v4f32) - return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); - } - } - - // FIXME: fold these into legal mask. - if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256)) - return getMOVLowToHigh(Op, dl, DAG, HasSSE2); - - if (isMOVHLPSMask(M, VT)) - return getMOVHighToLow(Op, dl, DAG); - - if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); - - if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); - - if (isMOVLPMask(M, VT)) - return getMOVLP(Op, dl, DAG, HasSSE2); - - if (ShouldXformToMOVHLPS(M, VT) || - ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) - return DAG.getCommutedVectorShuffle(*SVOp); - - if (isShift) { - // No better options. Use a vshldq / vsrldq. 
- MVT EltVT = VT.getVectorElementType(); - ShAmt *= EltVT.getSizeInBits(); - return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); - } - - bool Commuted = false; - // FIXME: This should also accept a bitcast of a splat? Be careful, not - // 1,1,1,1 -> v8i16 though. - BitVector UndefElements; - if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode())) - if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) - V1IsSplat = true; - if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode())) - if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) - V2IsSplat = true; - - // Canonicalize the splat or undef, if present, to be on the RHS. - if (!V2IsUndef && V1IsSplat && !V2IsSplat) { - CommuteVectorShuffleMask(M, NumElems); - std::swap(V1, V2); - std::swap(V1IsSplat, V2IsSplat); - Commuted = true; - } - - if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { - // Shuffling low element of v1 into undef, just return v1. - if (V2IsUndef) - return V1; - // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which - // the instruction selector will not match, so get a canonical MOVL with - // swapped operands to undo the commute. - return getMOVL(DAG, dl, VT, V2, V1); - } - - if (isUNPCKLMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - - if (isUNPCKHMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - - if (V2IsSplat) { - // Normalize mask so all entries that point to V2 points to its first - // element then try to match unpck{h|l} again. If match, return a - // new vector_shuffle with the corrected mask.p - SmallVector<int, 8> NewMask(M.begin(), M.end()); - NormalizeMask(NewMask, NumElems); - if (isUNPCKLMask(NewMask, VT, HasInt256, true)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (isUNPCKHMask(NewMask, VT, HasInt256, true)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - } - - if (Commuted) { - // Commute is back and try unpck* again. - // FIXME: this seems wrong. - CommuteVectorShuffleMask(M, NumElems); - std::swap(V1, V2); - std::swap(V1IsSplat, V2IsSplat); - - if (isUNPCKLMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - - if (isUNPCKHMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - } - - // Normalize the node to match x86 shuffle ops if needed - if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true))) - return DAG.getCommutedVectorShuffle(*SVOp); - - // The checks below are all present in isShuffleMaskLegal, but they are - // inlined here right now to enable us to directly emit target specific - // nodes, and remove one by one until they don't return Op anymore. 
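CommuteVectorShuffleMask, used above whenever the operands are swapped, effectively moves every defined index across the NumElems boundary so that references to V1 become references to V2 and vice versa. A small sketch of that remapping:

#include <cstdio>
#include <vector>

// After swapping the shuffle operands the mask has to be remapped: indices
// below NumElems move up by NumElems, indices at or above move down, and
// undef entries stay undef.
static void commuteMask(std::vector<int> &M, unsigned NumElems) {
  for (int &Idx : M) {
    if (Idx < 0)
      continue;
    Idx = Idx < (int)NumElems ? Idx + (int)NumElems : Idx - (int)NumElems;
  }
}

int main() {
  std::vector<int> M = {0, 5, -1, 7};
  commuteMask(M, 4);
  for (int I : M)
    std::printf("%d ", I);                    // prints: 4 1 -1 3
  std::printf("\n");
}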
- - if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && - SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64 || VT == MVT::v2i64) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - } - - if (isPSHUFHWMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, - getShufflePSHUFHWImmediate(SVOp), - DAG); - - if (isPSHUFLWMask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, - getShufflePSHUFLWImmediate(SVOp), - DAG); - - unsigned MaskValue; - if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(), - &MaskValue)) - return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG); - - if (isSHUFPMask(M, VT)) - return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, - getShuffleSHUFImmediate(SVOp), DAG); - - if (isUNPCKL_v_undef_Mask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); - if (isUNPCKH_v_undef_Mask(M, VT, HasInt256)) - return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - - //===--------------------------------------------------------------------===// - // Generate target specific nodes for 128 or 256-bit shuffles only - // supported in the AVX instruction set. - // - - // Handle VMOVDDUPY permutations - if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256)) - return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); - - // Handle VPERMILPS/D* permutations - if (isVPERMILPMask(M, VT)) { - if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) - return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, - getShuffleSHUFImmediate(SVOp), DAG); - return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, - getShuffleSHUFImmediate(SVOp), DAG); - } - - unsigned Idx; - if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx)) - return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl), - Idx*(NumElems/2), DAG, dl); - - // Handle VPERM2F128/VPERM2I128 permutations - if (isVPERM2X128Mask(M, VT, HasFp256)) - return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, - V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - - if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT)) - return getINSERTPS(SVOp, dl, DAG); - - unsigned Imm8; - if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8)) - return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG); - - if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) || - VT.is512BitVector()) { - MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits()); - MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems); - SmallVector<SDValue, 16> permclMask; - for (unsigned i = 0; i != NumElems; ++i) { - permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT)); - } - - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask); - if (V2IsUndef) - // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 - return DAG.getNode(X86ISD::VPERMV, dl, VT, - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); - return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1, - DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2); - } - - //===--------------------------------------------------------------------===// - // Since no target specific shuffle was selected for this generic one, - // lower it into other known shuffles. FIXME: this isn't true yet, but - // this is the plan. - // - - // Handle v8i16 specifically since SSE can do byte extraction and insertion. 
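Helpers such as getShuffleSHUFImmediate and getShufflePSHUFLWImmediate above pack a 4-lane selection into the classic two-bits-per-lane immediate, treating undef as zero. A sketch of that encoding, with the lowest destination lane in the lowest bits:

#include <array>
#include <cstdio>

// Packs a 4-lane selection into the immediate used by pshufd/shufps/
// pshuflw/pshufhw, with undef lanes treated as 0.
static unsigned shuffleImm8(const std::array<int, 4> &M) {
  unsigned Imm = 0;
  for (int i = 3; i >= 0; --i) {
    Imm <<= 2;
    Imm |= (M[i] < 0 ? 0 : M[i]) & 3;
  }
  return Imm;
}

int main() {
  // <3,2,1,0> reverses the lanes: 0x1b, the classic "shufps $0x1b" immediate.
  std::printf("0x%02x\n", shuffleImm8({3, 2, 1, 0}));
}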
- if (VT == MVT::v8i16) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - } - - if (VT == MVT::v16i16 && Subtarget->hasInt256()) { - SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG); - if (NewOp.getNode()) - return NewOp; - } - - if (VT == MVT::v16i8) { - SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - } - - if (VT == MVT::v32i8) { - SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); - if (NewOp.getNode()) - return NewOp; - } - - // Handle all 128-bit wide vectors with 4 elements, and match them with - // several different shuffle types. - if (NumElems == 4 && VT.is128BitVector()) - return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); - - // Handle general 256-bit shuffles - if (VT.is256BitVector()) - return LowerVECTOR_SHUFFLE_256(SVOp, DAG); - - return SDValue(); -} - // This function assumes its argument is a BUILD_VECTOR of constants or // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is // true. @@ -12674,48 +10421,29 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, return true; } -/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend -/// instruction. -static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +/// \brief Try to lower a VSELECT instruction to a vector shuffle. +static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - MVT EltVT = VT.getVectorElementType(); - unsigned NumElems = VT.getVectorNumElements(); - - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return SDValue(); - - if (!Subtarget->hasSSE41() || EltVT == MVT::i8) - return SDValue(); - if (!Subtarget->hasInt256() && VT == MVT::v16i16) - return SDValue(); if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); + auto *CondBV = cast<BuildVectorSDNode>(Cond); - // Check the mask for BLEND and build the value. - unsigned MaskValue = 0; - if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) - return SDValue(); - - // Convert i32 vectors to floating point if it is not AVX2. - // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. - MVT BlendVT = VT; - if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { - BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); - LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS); - RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS); + // Only non-legal VSELECTs reach this lowering, convert those into generic + // shuffles and re-use the shuffle lowering path for blends. + SmallVector<int, 32> Mask; + for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { + SDValue CondElt = CondBV->getOperand(i); + Mask.push_back( + isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? 
Size : 0) : -1); } - - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS, - DAG.getConstant(MaskValue, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Ret); + return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -12726,28 +10454,40 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); - SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG); - if (BlendOp.getNode()) + // Try to lower this to a blend-style vector shuffle. This can handle all + // constant condition cases. + if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; - // Some types for vselect were previously set to Expand, not Legal or - // Custom. Return an empty SDValue so we fall-through to Expand, after - // the Custom lowering phase. - MVT VT = Op.getSimpleValueType(); - switch (VT.SimpleTy) { + // Variable blends are only legal from SSE4.1 onward. + if (!Subtarget->hasSSE41()) + return SDValue(); + + // Only some types will be legal on some subtargets. If we can emit a legal + // VSELECT-matching blend, return Op, and but if we need to expand, return + // a null value. + switch (Op.getSimpleValueType().SimpleTy) { default: - break; + // Most of the vector types have blends past SSE4.1. + return Op; + + case MVT::v32i8: + // The byte blends for AVX vectors were introduced only in AVX2. + if (Subtarget->hasAVX2()) + return Op; + + return SDValue(); + case MVT::v8i16: case MVT::v16i16: + // AVX-512 BWI and VLX features support VSELECT with i16 elements. if (Subtarget->hasBWI() && Subtarget->hasVLX()) - break; + return Op; + + // FIXME: We should custom lower this by fixing the condition and using i8 + // blends. return SDValue(); } - - // We couldn't create a "Blend with immediate" node. - // This node should still be legal, but we'll have to emit a blendv* - // instruction. 
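The replacement lowering above turns a constant VSELECT condition directly into a shuffle mask: a true lane selects element i of the LHS, a false lane selects element i of the RHS (written as i + Size), and a non-constant lane becomes undef. A standalone sketch, with std::optional standing in for "not a ConstantSDNode":

#include <cstdio>
#include <optional>
#include <vector>

// Derives the blend-style shuffle mask for a per-lane condition vector.
static std::vector<int>
vselectMask(const std::vector<std::optional<bool>> &Cond) {
  int Size = (int)Cond.size();
  std::vector<int> Mask;
  for (int i = 0; i != Size; ++i) {
    if (!Cond[i].has_value())
      Mask.push_back(-1);                     // non-constant lane -> undef
    else
      Mask.push_back(*Cond[i] ? i : i + Size); // true -> LHS, false -> RHS
  }
  return Mask;
}

int main() {
  for (int I : vselectMask({true, false, true, false}))
    std::printf("%d ", I);                    // prints: 0 5 2 7
  std::printf("\n");
}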
- return Op; } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { @@ -12823,6 +10563,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const MVT EltVT = Op.getSimpleValueType(); assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); + assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) && + "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512 @@ -12836,13 +10578,15 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); const TargetRegisterClass* rc = getRegClassFor(VecVT); + if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) + rc = getRegClassFor(MVT::v16i1); unsigned MaxSift = rc->getSize()*8 - 1; Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, - DAG.getConstant(MaxSift - IdxVal, MVT::i8)); + DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, - DAG.getConstant(MaxSift, MVT::i8)); + DAG.getConstant(MaxSift, dl, MVT::i8)); return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); } SDValue @@ -12869,10 +10613,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, getZeroVector(MaskVT, Subtarget, DAG, dl), - Idx, DAG.getConstant(0, getPointerTy())); + Idx, DAG.getConstant(0, dl, getPointerTy())); SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), - Perm, DAG.getConstant(0, getPointerTy())); + Perm, DAG.getConstant(0, dl, getPointerTy())); } return SDValue(); } @@ -12892,7 +10636,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // IdxVal -= NumElems/2; IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - DAG.getConstant(IdxVal, MVT::i32)); + DAG.getConstant(IdxVal, dl, MVT::i32)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); @@ -12934,7 +10678,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() == 64) { @@ -12953,7 +10697,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); } return SDValue(); @@ -12982,15 +10726,11 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + if (IdxVal) + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); if (Vec.getOpcode() == ISD::UNDEF) - return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, - DAG.getConstant(IdxVal, MVT::i8)); - const TargetRegisterClass* rc = getRegClassFor(VecVT); - unsigned MaxSift = rc->getSize()*8 - 1; - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, - DAG.getConstant(MaxSift, MVT::i8)); - EltInVec = 
DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec, - DAG.getConstant(MaxSift - IdxVal, MVT::i8)); + return EltInVec; return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } @@ -13014,17 +10754,31 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { - // Get the desired 128-bit vector half. + // With a 256-bit vector, we can insert into the zero element efficiently + // using a blend if we have AVX or AVX2 and the right data type. + if (VT.is256BitVector() && IdxVal == 0) { + // TODO: It is worthwhile to cast integer to floating point and back + // and incur a domain crossing penalty if that's what we'll end up + // doing anyway after extracting to a 128-bit vector. + if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || + (Subtarget->hasAVX2() && EltVT == MVT::i32)) { + SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + N2 = DAG.getIntPtrConstant(1, dl); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); + } + } + + // Get the desired 128-bit vector chunk. SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); - // Insert the element into the desired half. + // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, - DAG.getConstant(IdxIn128, MVT::i32)); + DAG.getConstant(IdxIn128, dl, MVT::i32)); - // Insert the changed part back to the 256-bit vector + // Insert the changed part back into the bigger vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); @@ -13044,22 +10798,35 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(IdxVal); + N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } if (EltVT == MVT::f32) { - // Bits [7:6] of the constant are the source select. This will always be - // zero here. The DAG Combiner may combine an extract_elt index into - // these - // bits. For example (insert (extract, 3), 2) could be matched by - // putting - // the '3' into bits [7:6] of X86ISD::INSERTPS. - // Bits [5:4] of the constant are the destination select. This is the - // value of the incoming immediate. - // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these bits. For example (insert (extract, 3), 2) could be matched by + // putting the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. 
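The comment above spells out the INSERTPS immediate: bits [7:6] select the source element, bits [5:4] the destination element, and bits [3:0] form the zero mask. A small helper assembling it, with an illustrative example value:

#include <cstdio>

// Assembles the INSERTPS control byte from its three fields.
static unsigned insertpsImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
  return ((SrcIdx & 3) << 6) | ((DstIdx & 3) << 4) | (ZMask & 0xf);
}

int main() {
  // Copy element 3 of the source into element 2 of the destination, zero
  // nothing: 11'10'0000 in binary, i.e. 0xe0.
  std::printf("0x%02x\n", insertpsImm(3, 2, 0));
}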
- N2 = DAG.getIntPtrConstant(IdxVal << 4); + + const Function *F = DAG.getMachineFunction().getFunction(); + bool MinSize = F->hasFnAttribute(Attribute::MinSize); + if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { + // If this is an insertion of 32-bits into the low 32-bits of + // a vector, we prefer to generate a blend with immediate rather + // than an insertps. Blends are simpler operations in hardware and so + // will always have equal or better performance than insertps. + // But if optimizing for size and there's a load folding opportunity, + // generate insertps because blendps does not have a 32-bit memory + // operand form. + N2 = DAG.getIntPtrConstant(1, dl); + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + } + N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); @@ -13080,7 +10847,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(IdxVal); + N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); } return SDValue(); @@ -13145,25 +10912,76 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (Subtarget->hasFp256()) { - SDLoc dl(Op.getNode()); - SDValue Vec = Op.getNode()->getOperand(0); - SDValue SubVec = Op.getNode()->getOperand(1); - SDValue Idx = Op.getNode()->getOperand(2); - - if ((Op.getNode()->getSimpleValueType(0).is256BitVector() || - Op.getNode()->getSimpleValueType(0).is512BitVector()) && - SubVec.getNode()->getSimpleValueType(0).is128BitVector() && - isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); + if (!Subtarget->hasAVX()) + return SDValue(); + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + + // Fold two 16-byte subvector loads into one 32-byte load: + // (insert_subvector (insert_subvector undef, (load addr), 0), + // (load addr + 16), Elts/2) + // --> load32 addr + if ((IdxVal == OpVT.getVectorNumElements() / 2) && + Vec.getOpcode() == ISD::INSERT_SUBVECTOR && + OpVT.is256BitVector() && SubVecVT.is128BitVector() && + !Subtarget->isUnalignedMem32Slow()) { + SDValue SubVec2 = Vec.getOperand(1); + if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { + if (Idx2->getZExtValue() == 0) { + SDValue Ops[] = { SubVec2, SubVec }; + SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false); + if (LD.getNode()) + return LD; + } } + } - if (Op.getNode()->getSimpleValueType(0).is512BitVector() && - SubVec.getNode()->getSimpleValueType(0).is256BitVector() && - isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && + SubVecVT.is128BitVector()) + 
return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); + + if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) + return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + + if (OpVT.getVectorElementType() == MVT::i1) { + if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal + return Op; + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(OpVT); + unsigned NumElems = OpVT.getVectorNumElements(); + SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); + + if (IdxVal == OpVT.getVectorNumElements() / 2) { + // Zero upper bits of the Vec + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + + SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, + SubVec, ZeroIdx); + Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); + } + if (IdxVal == 0) { + SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, + SubVec, ZeroIdx); + // Zero upper bits of the Vec2 + Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); + Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits); + // Zero lower bits of the Vec + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); } } return SDValue(); @@ -13356,7 +11174,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, - DAG.getConstant(Offset, getPointerTy())); + DAG.getConstant(Offset, dl, getPointerTy())); return Result; } @@ -13471,7 +11289,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, is64Bit ? 257 : 256)); SDValue ThreadPointer = - DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr), false, false, false, 0); unsigned char OperandFlags = 0; @@ -13523,7 +11341,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); - switch (model) { case TLSModel::GeneralDynamic: if (Subtarget->is64Bit()) @@ -13613,30 +11430,36 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { SDValue TlsArray = Subtarget->is64Bit() - ? DAG.getIntPtrConstant(0x58) + ? DAG.getIntPtrConstant(0x58, dl) : (Subtarget->isTargetWindowsGNU() - ? DAG.getIntPtrConstant(0x2C) + ? 
DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", getPointerTy())); SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, false, false, 0); - // Load the _tls_index variable - SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); - if (Subtarget->is64Bit()) - IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, - IDX, MachinePointerInfo(), MVT::i32, - false, false, false, 0); - else - IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), - false, false, false, 0); + SDValue res; + if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { + res = ThreadPointer; + } else { + // Load the _tls_index variable + SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); + if (Subtarget->is64Bit()) + IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX, + MachinePointerInfo(), MVT::i32, false, false, + false, 0); + else + IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), + false, false, false, 0); - SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), - getPointerTy()); - IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); + SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), dl, + getPointerTy()); + IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); + + res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); + } - SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), false, false, false, 0); @@ -13669,10 +11492,10 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, - DAG.getConstant(VTBits - 1, MVT::i8)); + DAG.getConstant(VTBits - 1, dl, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, MVT::i8)) - : DAG.getConstant(0, VT); + DAG.getConstant(VTBits - 1, dl, MVT::i8)) + : DAG.getConstant(0, dl, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { @@ -13687,12 +11510,12 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { // rely on the results of shld/shrd. Insert a test and select the appropriate // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, - DAG.getConstant(VTBits, MVT::i8)); + DAG.getConstant(VTBits, dl, MVT::i8)); SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, - AndNode, DAG.getConstant(0, MVT::i8)); + AndNode, DAG.getConstant(0, dl, MVT::i8)); SDValue Hi, Lo; - SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); + SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; @@ -13871,7 +11694,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); } // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. @@ -13879,7 +11702,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); // FP constant to bias correct the final result. 
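The LowerShiftParts hunk above splits a double-width shift into two register-width pieces, masks the amount with VTBits - 1, and then selects between the two candidate results when the amount reaches VTBits (the real lowering selects with CMOV nodes rather than a branch). A minimal standalone scalar sketch of that idea, with ad hoc names, for a 64-bit left shift built from 32-bit halves:

#include <cassert>
#include <cstdint>

// Left-shift a 64-bit value given as two 32-bit halves, the way the
// SHL_PARTS expansion does: mask the amount, form both candidate halves,
// then pick based on whether the amount crossed the register width.
static uint64_t shl_parts(uint32_t lo, uint32_t hi, unsigned amt) {
  unsigned safe = amt & 31;                       // SafeShAmt = amt & (VTBits - 1)
  uint32_t lo_shifted = lo << safe;
  uint32_t hi_shifted = safe == 0 ? hi            // "shld": hi shifted, filled from lo
                                  : (hi << safe) | (lo >> (32 - safe));
  uint32_t out_lo, out_hi;
  if (amt & 32) {                                 // amount >= 32: low half is zero,
    out_lo = 0;                                   // high half takes the shifted low part
    out_hi = lo_shifted;
  } else {
    out_lo = lo_shifted;
    out_hi = hi_shifted;
  }
  return ((uint64_t)out_hi << 32) | out_lo;
}

int main() {
  const uint64_t x = 0x0123456789ABCDEFULL;
  for (unsigned amt = 0; amt < 64; ++amt)
    assert(shl_parts((uint32_t)x, (uint32_t)(x >> 32), amt) == x << amt);
  return 0;
}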
- SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. @@ -13891,7 +11714,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); // Or the load with the bias. SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, @@ -13903,7 +11726,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); // Subtract the bias. SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); @@ -13913,7 +11736,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); if (DestVT.bitsGT(MVT::f64)) return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); @@ -13958,20 +11781,20 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // -- v >> 16 // Create the splat vector for 0x4b000000. - SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32); + SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32); SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, CstLow, CstLow, CstLow, CstLow}; SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, makeArrayRef(&CstLowArray[0], NumElts)); // Create the splat vector for 0x53000000. - SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32); + SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32); SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh, CstHigh}; SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, makeArrayRef(&CstHighArray[0], NumElts)); // Create the right shift. - SDValue CstShift = DAG.getConstant(16, MVT::i32); + SDValue CstShift = DAG.getConstant(16, DL, MVT::i32); SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, CstShift, CstShift, CstShift, CstShift}; SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, @@ -13988,7 +11811,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, - VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32)); + VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = @@ -13998,9 +11821,9 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // High will be bitcasted right away, so do not bother bitcasting back to // its original type. 
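LowerUINT_TO_FP_i32 above relies on 0x4330000000000000 being the bit pattern of 2^52: OR-ing a 32-bit value into the low mantissa bits produces exactly 2^52 + v, and subtracting the bias leaves v with no rounding. A standalone scalar model of that trick (not the DAG code itself):

#include <cassert>
#include <cstdint>
#include <cstring>

static double u32_to_double(uint32_t v) {
  uint64_t bits = 0x4330000000000000ULL | v;  // exponent of 2^52, low mantissa bits = v
  double d;
  std::memcpy(&d, &bits, sizeof(d));          // the "bitcast" load of the OR result
  return d - 4503599627370496.0;              // subtract the 2^52 bias
}

int main() {
  assert(u32_to_double(0) == 0.0);
  assert(u32_to_double(1) == 1.0);
  assert(u32_to_double(0xFFFFFFFFu) == 4294967295.0);
  return 0;
}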
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, - VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32)); + VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); } else { - SDValue CstMask = DAG.getConstant(0xffff, MVT::i32); + SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32); SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, CstMask, CstMask, CstMask); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; @@ -14013,7 +11836,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). SDValue CstFAdd = DAG.getConstantFP( - APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32); + APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32); SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd, CstFAdd}; SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, @@ -14048,6 +11871,11 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, case MVT::v4i32: case MVT::v8i32: return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); + case MVT::v16i8: + case MVT::v16i16: + if (Subtarget->hasAVX512()) + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); } llvm_unreachable(nullptr); } @@ -14078,13 +11906,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { - SDValue WordOff = DAG.getConstant(4, getPointerTy()); + SDValue WordOff = DAG.getConstant(4, dl, getPointerTy()); SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackSlot, WordOff); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo(), false, false, 0); - SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), + SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo(), false, false, 0); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); @@ -14116,8 +11944,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), MVT::i64), - Op.getOperand(0), DAG.getConstant(0, MVT::i64), - ISD::SETLT); + Op.getOperand(0), + DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( @@ -14125,8 +11953,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, getPointerTy()); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. - SDValue Zero = DAG.getIntPtrConstant(0); - SDValue Four = DAG.getIntPtrConstant(4); + SDValue Zero = DAG.getIntPtrConstant(0, dl); + SDValue Four = DAG.getIntPtrConstant(4, dl); SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); @@ -14138,7 +11966,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MVT::f32, false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. 
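The vXi32 path above (the 0x4b000000 and 0x53000000 constants plus the two blends) is easier to follow one lane at a time: the low and high 16-bit halves become the exact floats 2^23 + lo and 2^39 + hi * 2^16, and the single final add is the only place rounding happens. A hedged one-lane model (hex float literals need C++17 or a GNU-style compiler):

#include <cassert>
#include <cstdint>
#include <cstring>

static float u32_to_float_lane(uint32_t v) {
  uint32_t lo_bits = (v & 0xFFFFu) | 0x4B000000u;  // float bits of 2^23 + (v & 0xffff)
  uint32_t hi_bits = (v >> 16)     | 0x53000000u;  // float bits of 2^39 + (v >> 16) * 2^16
  float flo, fhi;
  std::memcpy(&flo, &lo_bits, sizeof(flo));
  std::memcpy(&fhi, &hi_bits, sizeof(fhi));
  // 0xD3000080 above is -(0x1.0p39f + 0x1.0p23f); adding it cancels both biases.
  return (fhi - (0x1.0p39f + 0x1.0p23f)) + flo;
}

int main() {
  uint32_t samples[] = {0u, 1u, 0xFFFFu, 0x10000u, 0x89ABCDEFu, 0xFFFFFFFFu};
  for (uint32_t v : samples)
    assert(u32_to_float_lane(v) == (float)v);
  return 0;
}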
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); - return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, + DAG.getIntPtrConstant(0, dl)); } std::pair<SDValue,SDValue> @@ -14241,6 +12070,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); + if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1) + return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); + // Optimize vectors in AVX mode: // // v8i16 -> v8i32 @@ -14278,34 +12110,29 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, } static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); unsigned int NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16) + if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) return DAG.getNode(X86ISD::VZEXT, DL, VT, In); - EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // Now we have only mask extension assert(InVT.getVectorElementType() == MVT::i1); - SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType()); - const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); - SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); - - SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld); + MVT ExtVT = NumElts == 8 ? 
MVT::v8i64 : MVT::v16i32; + SDValue One = + DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT); + SDValue Zero = + DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); + + SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); if (VT.is512BitVector()) - return Brcst; - return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst); + return V; + return DAG.getNode(X86ISD::VTRUNC, DL, VT, V); } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, @@ -14327,7 +12154,7 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, MVT SVT = In.getSimpleValueType(); if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) - return LowerZERO_EXTEND_AVX512(Op, DAG); + return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); @@ -14357,6 +12184,23 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); + // move vector to mask - truncate solution for SKX + if (VT.getVectorElementType() == MVT::i1) { + if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) + return Op; // legal, will go to VPMOVB2M, VPMOVW2M + if ((InVT.is256BitVector() || InVT.is128BitVector()) + && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI() && Subtarget->hasVLX()) + return Op; // legal, will go to VPMOVB2M, VPMOVW2M + if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) + return Op; // legal, will go to VPMOVD2M, VPMOVQ2M + if ((InVT.is256BitVector() || InVT.is128BitVector()) + && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI() && Subtarget->hasVLX()) + return Op; // legal, will go to VPMOVB2M, VPMOVQ2M + } if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { if (VT.getVectorElementType().getSizeInBits() >=8) return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); @@ -14370,14 +12214,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { InVT = ExtVT; } - SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); - const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); - SDValue CP = DAG.getConstantPool(C, getPointerTy()); - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); - SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld); + SDValue OneV = + DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT); SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); } @@ -14390,13 +12228,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, - DAG.getIntPtrConstant(2)); + DAG.getIntPtrConstant(2, DL)); OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi); static const int 
ShufMask[] = {0, 2, 4, 6}; @@ -14410,16 +12248,16 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue,32> pshufbMask; for (unsigned i = 0; i < 2; ++i) { - pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8)); for (unsigned j = 0; j < 8; ++j) - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); @@ -14429,15 +12267,15 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), &ShufMask[0]); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::BITCAST, DL, VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4, DL)); OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo); OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi); @@ -14476,7 +12314,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::BITCAST, DL, NVT, In), DAG.getUNDEF(NVT), &MaskVec[0]); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, @@ -14613,7 +12451,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { } // And if it is bigger, shrink it first. if (SrcVT.bitsGT(VT)) { - Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); + Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl)); SrcVT = VT; } @@ -14672,8 +12510,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, - DAG.getConstant(1, VT)); - return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); + DAG.getConstant(1, dl, VT)); + return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT)); } // Check whether an OR'd tree is PTEST-able. @@ -14791,11 +12629,11 @@ static bool hasNonFlagsUse(SDValue Op) { /// equivalent. 
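The pshufbMask built above keeps bytes {0,1,4,5,8,9,12,13} of each 16-byte lane and zeroes the rest, which is exactly an i32 to i16 truncation of that lane. A 128-bit analog with SSSE3 intrinsics (compile with something like -mssse3; the values are arbitrary):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  __m128i v = _mm_setr_epi32(0x11112222, 0x33334444, 0x55556666, 0x77778888);
  // Bytes 0,1 / 4,5 / 8,9 / 12,13 are the low 16 bits of each i32 element;
  // the 0x80 entries tell pshufb to write zero into the upper half.
  __m128i mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13,
                               (char)0x80, (char)0x80, (char)0x80, (char)0x80,
                               (char)0x80, (char)0x80, (char)0x80, (char)0x80);
  __m128i t = _mm_shuffle_epi8(v, mask);

  uint16_t out[8];
  _mm_storeu_si128((__m128i *)out, t);
  assert(out[0] == 0x2222 && out[1] == 0x4444 &&
         out[2] == 0x6666 && out[3] == 0x8888);
  return 0;
}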
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - if (Op.getValueType() == MVT::i1) - // KORTEST instruction should be selected - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, - DAG.getConstant(0, Op.getValueType())); - + if (Op.getValueType() == MVT::i1) { + SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, + DAG.getConstant(0, dl, MVT::i8)); + } // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; @@ -14817,9 +12655,8 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, case ISD::SUB: case ISD::MUL: case ISD::SHL: { - const BinaryWithFlagsSDNode *BinNode = - cast<BinaryWithFlagsSDNode>(Op.getNode()); - if (BinNode->hasNoSignedWrap()) + const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode()); + if (BinNode->Flags.hasNoSignedWrap()) break; } default: @@ -14838,7 +12675,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op, // DAG.getConstant(0, MVT::i1)); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, - DAG.getConstant(0, Op.getValueType())); + DAG.getConstant(0, dl, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; @@ -14926,7 +12763,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (!Mask.isSignedIntN(32)) // Avoid large immediates. break; SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), - DAG.getConstant(Mask, VT)); + DAG.getConstant(Mask, dl, VT)); DAG.ReplaceAllUsesWith(Op, New); Op = New; } @@ -15012,12 +12849,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (Opcode == 0) // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, - DAG.getConstant(0, Op.getValueType())); + DAG.getConstant(0, dl, Op.getValueType())); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SmallVector<SDValue, 4> Ops; - for (unsigned i = 0; i != NumOperands; ++i) - Ops.push_back(Op.getOperand(i)); + SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); @@ -15043,8 +12878,8 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && - !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize) && + !DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::MinSize) && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; @@ -15079,7 +12914,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, - DAG.getConstant(8, MVT::i8)); + DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -15144,6 +12979,16 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, return SDValue(); } +/// If we have at least two divisions that use the same divisor, convert to +/// multplication by a reciprocal. This may need to be adjusted for a given +/// CPU if a division's cost is not at least twice the cost of a multiplication. +/// This is because we still need one division to calculate the reciprocal and +/// then we need two multiplies by that reciprocal as replacements for the +/// original divisions. +bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { + return NumUsers > 1; +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); return C && C->isAllOnesValue(); @@ -15192,7 +13037,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, // Use BT if the immediate can't be encoded in a TEST instruction. if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { LHS = AndLHS; - RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); + RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType()); } } @@ -15214,7 +13059,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); X86::CondCode Cond = CC == ISD::SETEQ ? 
X86::COND_AE : X86::COND_B; return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(Cond, MVT::i8), BT); + DAG.getConstant(Cond, dl, MVT::i8), BT); } return SDValue(); @@ -15295,6 +13140,49 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } +static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + assert(Op0.getValueType().getVectorElementType() == MVT::i1 && + "Unexpected type for boolean compare operation"); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, + DAG.getConstant(-1, dl, VT)); + SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1, + DAG.getConstant(-1, dl, VT)); + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: + // (x != y) -> ~(x ^ y) + return DAG.getNode(ISD::XOR, dl, VT, + DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), + DAG.getConstant(-1, dl, VT)); + case ISD::SETEQ: + // (x == y) -> (x ^ y) + return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); + case ISD::SETUGT: + case ISD::SETGT: + // (x > y) -> (x & ~y) + return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1); + case ISD::SETULT: + case ISD::SETLT: + // (x < y) -> (~x & y) + return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1); + case ISD::SETULE: + case ISD::SETLE: + // (x <= y) -> (~x | y) + return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1); + case ISD::SETUGE: + case ISD::SETGE: + // (x >=y) -> (x | ~y) + return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1); + } +} + static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue Op0 = Op.getOperand(0); @@ -15332,7 +13220,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, return DAG.getNode(Opc, dl, VT, Op0, Op1); Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; return DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(SSECC, MVT::i8)); + DAG.getConstant(SSECC, dl, MVT::i8)); } /// \brief Try to turn a VSETULT into a VSETULE by modifying its second @@ -15359,7 +13247,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) if (Val == 0) return SDValue(); - ULTOp1.push_back(DAG.getConstant(Val - 1, EVT)); + ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); } return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); @@ -15399,22 +13287,25 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CC0, MVT::i8)); + DAG.getConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CC1, MVT::i8)); + DAG.getConstant(CC1, dl, MVT::i8)); return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } // Handle all other FP comparisons here. return DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(SSECC, MVT::i8)); + DAG.getConstant(SSECC, dl, MVT::i8)); } // Break 256-bit integer vector compare into smaller ones. 
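The combineRepeatedFPDivisors hook introduced above is only a profitability answer: it tells the generic combine that replacing two or more divisions by a shared divisor with one reciprocal and that many multiplies pays off on x86. The rewrite itself can change the last bit of each result, so it is generally applied only under relaxed FP options. A small illustration of the shape of the rewrite (exact here because the divisor is a power of two):

#include <cassert>

static void div_twice(double x, double y, double d, double &a, double &b) {
  a = x / d;                 // two divides
  b = y / d;
}

static void div_twice_recip(double x, double y, double d, double &a, double &b) {
  double r = 1.0 / d;        // one divide ...
  a = x * r;                 // ... and two cheaper multiplies
  b = y * r;
}

int main() {
  double a1, b1, a2, b2;
  div_twice(10.0, 42.0, 4.0, a1, b1);
  div_twice_recip(10.0, 42.0, 4.0, a2, b2);
  assert(a1 == a2 && b1 == b2);   // 1/4 is exact, so both forms agree here
  return 0;
}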
if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); - bool MaskResult = (VT.getVectorElementType() == MVT::i1); EVT OpVT = Op1.getValueType(); + if (OpVT.getVectorElementType() == MVT::i1) + return LowerBoolVSETCC_AVX512(Op, DAG); + + bool MaskResult = (VT.getVectorElementType() == MVT::i1); if (Subtarget->hasAVX512()) { if (Op1.getValueType().is512BitVector() || (Subtarget->hasBWI() && Subtarget->hasVLX()) || @@ -15524,10 +13415,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // compare is always unsigned. SDValue SB; if (FlipSigns) { - SB = DAG.getConstant(0x80000000U, MVT::v4i32); + SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32); } else { - SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); - SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); + SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); + SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Sign, Zero, Sign, Zero); } @@ -15582,7 +13473,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // bits of the inputs before performing those operations. if (FlipSigns) { EVT EltVT = VT.getVectorElementType(); - SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); + SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, + VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); } @@ -15650,7 +13542,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { CCode = X86::GetOppositeBranchCondition(CCode); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(CCode, MVT::i8), + DAG.getConstant(CCode, dl, MVT::i8), Op0.getOperand(1)); if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); @@ -15662,18 +13554,18 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); - return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC); + return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); } bool isFP = Op1.getSimpleValueType().isFloatingPoint(); - unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), EFLAGS); + DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS); if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); return SetCC; @@ -15724,9 +13616,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op1.getValueType(); SDValue CC; - // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops - // are available. Otherwise fp cmovs get lowered into a less efficient branch - // sequence later on. + // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops + // are available or VBLENDV if AVX is available. + // Otherwise FP cmovs get lowered into a less efficient branch sequence later. 
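The CMP/AND/ANDN/OR comment above describes a branchless scalar FP select: the compare materializes an all-ones or all-zeros mask and plain bitwise ops then pick one operand. A scalar bit-level model of that FAND/FANDN/FOR sequence, with std::memcpy standing in for the bitcasts:

#include <cassert>
#include <cstdint>
#include <cstring>

static float select_bits(bool cond, float a, float b) {
  uint32_t mask = cond ? 0xFFFFFFFFu : 0u;   // what CMPSS/FSETCC materializes
  uint32_t ua, ub;
  std::memcpy(&ua, &a, sizeof(ua));
  std::memcpy(&ub, &b, sizeof(ub));
  uint32_t ur = (mask & ua) | (~mask & ub);  // FAND / FANDN / FOR
  float r;
  std::memcpy(&r, &ur, sizeof(r));
  return r;
}

int main() {
  assert(select_bits(true, 1.5f, -2.25f) == 1.5f);
  assert(select_bits(false, 1.5f, -2.25f) == -2.25f);
  return 0;
}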
if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && @@ -15738,17 +13630,85 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SSECC != 8) { if (Subtarget->hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, - DAG.getConstant(SSECC, MVT::i8)); + DAG.getConstant(SSECC, DL, MVT::i8)); return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); } + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, - DAG.getConstant(SSECC, MVT::i8)); + DAG.getConstant(SSECC, DL, MVT::i8)); + + // If we have AVX, we can use a variable vector select (VBLENDV) instead + // of 3 logic instructions for size savings and potentially speed. + // Unfortunately, there is no scalar form of VBLENDV. + + // If either operand is a constant, don't try this. We can expect to + // optimize away at least one of the logic instructions later in that + // case, so that sequence would be faster than a variable blend. + + // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + + if (Subtarget->hasAVX() && + !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { + + // Convert to vectors, do a VSELECT, and convert back to scalar. + // All of the conversions should be optimized away. + + EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; + SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); + SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); + SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); + + EVT VCmpVT = VT == MVT::f32 ? 
MVT::v4i32 : MVT::v2i64; + VCmp = DAG.getNode(ISD::BITCAST, DL, VCmpVT, VCmp); + + SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + VSel, DAG.getIntPtrConstant(0, DL)); + } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } + if (VT.isVector() && VT.getScalarType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = ConvertI1VectorToInterger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInterger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getNode(ISD::BITCAST, DL, VT, newSelect); + SDValue ExtVec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); + } + } + + if (VT == MVT::v4i1 || VT == MVT::v2i1) { + SDValue zeroConst = DAG.getIntPtrConstant(0, DL); + Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); + Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1, + Cond, Op1, Op2); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); + } + if (Cond.getOpcode() == ISD::SETCC) { SDValue NewCond = LowerSETCC(Cond, DAG); if (NewCond.getNode()) @@ -15779,21 +13739,22 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, - DAG.getConstant(0, CmpOp0.getValueType()), + DAG.getConstant(0, DL, + CmpOp0.getValueType()), CmpOp0); SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, MVT::i8), + DAG.getConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); return Res; } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, - CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); + CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDValue Res = // Res = 0 or -1. 
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, MVT::i8), Cmp); + DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); if (isAllOnes(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); @@ -15865,7 +13826,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { else Cond = X86Op.getValue(1); - CC = DAG.getConstant(X86Cond, MVT::i8); + CC = DAG.getConstant(X86Cond, DL, MVT::i8); addTest = false; } @@ -15887,7 +13848,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - CC = DAG.getConstant(X86::COND_NE, MVT::i8); + CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } @@ -15902,7 +13863,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, MVT::i8), Cond); + DAG.getConstant(X86::COND_B, DL, MVT::i8), + Cond); if (isAllOnes(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; @@ -15931,7 +13893,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } -static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); @@ -15957,7 +13920,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget unsigned int NumElts = VT.getVectorNumElements(); - if (NumElts != 8 && NumElts != 16) + if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) return SDValue(); if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { @@ -15966,22 +13929,74 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget return DAG.getNode(X86ISD::VSEXT, dl, VT, In); } - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); + MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32; + SDValue NegOne = + DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, + ExtVT); + SDValue Zero = + DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); + + SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); + if (VT.is512BitVector()) + return V; + return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); +} + +static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue In = Op->getOperand(0); + MVT VT = Op->getSimpleValueType(0); + MVT InVT = In.getSimpleValueType(); + assert(VT.getSizeInBits() == InVT.getSizeInBits()); - MVT ExtVT = (NumElts == 8) ? 
MVT::v8i64 : MVT::v16i32; - Constant *C = ConstantInt::get(*DAG.getContext(), - APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits())); + MVT InSVT = InVT.getScalarType(); + assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits()); - SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); - SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld); - if (VT.is512BitVector()) - return Brcst; - return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst); + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + return SDValue(); + if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) + return SDValue(); + + SDLoc dl(Op); + + // SSE41 targets can use the pmovsx* instructions directly. + if (Subtarget->hasSSE41()) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. + SDValue Curr = In; + MVT CurrVT = InVT; + + // As SRAI is only available on i16/i32 types, we expand only up to i32 + // and handle i64 separately. + while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) { + Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); + MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); + CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); + Curr = DAG.getNode(ISD::BITCAST, dl, CurrVT, Curr); + } + + SDValue SignExt = Curr; + if (CurrVT != InVT) { + unsigned SignExtShift = + CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits(); + SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(SignExtShift, dl, MVT::i8)); + } + + if (CurrVT == VT) + return SignExt; + + if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { + SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(31, dl, MVT::i8)); + SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); + return DAG.getNode(ISD::BITCAST, dl, VT, Ext); + } + + return SDValue(); } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, @@ -16039,6 +14054,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. +// FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, @@ -16137,8 +14153,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, "Can only lower sext loads with a single scalar load!"); unsigned loadRegZize = RegSz; - if (Ext == ISD::SEXTLOAD && RegSz == 256) - loadRegZize /= 2; + if (Ext == ISD::SEXTLOAD && RegSz >= 256) + loadRegZize = 128; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. 
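The pre-SSE4.1 path in LowerSIGN_EXTEND_VECTOR_INREG above unpacks the source into the high half of each wider lane and then arithmetic-shifts right to replicate the sign bit (the SSE4.1 path is just pmovsx). The same sequence written with SSE2 intrinsics, sign-extending eight i8 lanes to i16:

#include <emmintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  __m128i v = _mm_setr_epi8(1, -2, 127, -128, 0, 5, -5, 100,
                            0, 0, 0, 0, 0, 0, 0, 0);
  // Interleave with zero so each source byte lands in the high byte of a
  // 16-bit lane ...
  __m128i widened = _mm_unpacklo_epi8(_mm_setzero_si128(), v);
  // ... then shift right arithmetically by 8 to smear the sign bit down.
  __m128i ext = _mm_srai_epi16(widened, 8);

  int16_t out[8];
  _mm_storeu_si128((__m128i *)out, ext);
  int8_t in[8] = {1, -2, 127, -128, 0, 5, -5, 100};
  for (int i = 0; i < 8; ++i)
    assert(out[i] == in[i]);
  return 0;
}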
@@ -16161,7 +14177,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, SmallVector<SDValue, 8> Chains; SDValue Ptr = Ld->getBasePtr(); SDValue Increment = - DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy()); + DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy()); SDValue Res = DAG.getUNDEF(LoadUnitVecVT); for (unsigned i = 0; i < NumLoads; ++i) { @@ -16177,7 +14193,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); else Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, - ScalarLoad, DAG.getIntPtrConstant(i)); + ScalarLoad, DAG.getIntPtrConstant(i, dl)); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } @@ -16217,7 +14233,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - MemVT.getVectorElementType().getSizeInBits(); Shuff = - DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT)); + DAG.getNode(ISD::SRA, dl, RegVT, Shuff, + DAG.getConstant(Amt, dl, RegVT)); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; @@ -16384,7 +14401,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { else Cond = X86Op.getValue(1); - CC = DAG.getConstant(X86Cond, MVT::i8); + CC = DAG.getConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; @@ -16415,7 +14432,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, MVT::i8); + CC = DAG.getConstant(CCode, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. 
// We need this because we need to reverse the successors in order @@ -16433,7 +14450,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, MVT::i8); + CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -16446,7 +14463,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, MVT::i8); + CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && @@ -16472,10 +14489,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getConstant(X86::COND_NE, MVT::i8); + CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); - CC = DAG.getConstant(X86::COND_P, MVT::i8); + CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -16502,10 +14519,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getConstant(X86::COND_NE, MVT::i8); + CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); - CC = DAG.getConstant(X86::COND_NP, MVT::i8); + CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8); Cond = Cmp; addTest = false; Dest = FalseBB; @@ -16533,7 +14550,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; - CC = DAG.getConstant(X86Cond, MVT::i8); + CC = DAG.getConstant(X86Cond, dl, MVT::i8); Cond = EmitTest(Cond, X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); @@ -16570,23 +14587,23 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. 
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true), + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), SDLoc(Node)); SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); - const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, VT)); + DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true), - DAG.getIntPtrConstant(0, true), SDValue(), + Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), SDLoc(Node)); SDValue Ops[2] = { Tmp1, Tmp2 }; @@ -16635,15 +14652,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Align) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, VT)); + DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } @@ -16678,22 +14694,22 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { // Store gp_offset SDValue Store = DAG.getStore(Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), - MVT::i32), + DL, MVT::i32), FIN, MachinePointerInfo(SV), false, false, 0); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), - FIN, DAG.getIntPtrConstant(4)); + FIN, DAG.getIntPtrConstant(4, DL)); Store = DAG.getStore(Op.getOperand(0), DL, - DAG.getConstant(FuncInfo->getVarArgsFPOffset(), + DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV, 4), false, false, 0); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), - FIN, DAG.getIntPtrConstant(4)); + FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), getPointerTy()); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, @@ -16703,7 +14719,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), - FIN, DAG.getIntPtrConstant(8)); + FIN, DAG.getIntPtrConstant(8, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy()); Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, @@ -16745,22 +14761,17 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. 
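The four stores in LowerVASTART above fill in the SysV x86-64 va_list record at offsets 0, 4, 8 and 16. Written out as a C++ struct, purely as an illustration of the layout the 64-bit lowering assumes, not an LLVM type:

#include <cstddef>
#include <cstdint>

struct X86_64VaList {            // field offsets match the four stores above
  uint32_t gp_offset;            // +0:  bytes of the GP register area already used
  uint32_t fp_offset;            // +4:  bytes of the FP register area already used
  void *overflow_arg_area;       // +8:  next stack (memory) argument
  void *reg_save_area;           // +16: block where the registers were spilled
};

static_assert(offsetof(X86_64VaList, fp_offset) == 4, "layout");
static_assert(offsetof(X86_64VaList, overflow_arg_area) == 8, "layout");
static_assert(offsetof(X86_64VaList, reg_save_area) == 16, "layout");

int main() { return 0; }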
- assert(!DAG.getTarget().Options.UseSoftFloat && - !(DAG.getMachineFunction() - .getFunction()->getAttributes() - .hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat)) && + assert(!Subtarget->useSoftFloat() && + !(DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain - SmallVector<SDValue, 11> InstOps; - InstOps.push_back(Chain); - InstOps.push_back(SrcPtr); - InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); - InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); - InstOps.push_back(DAG.getConstant(Align, MVT::i32)); + SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), + DAG.getConstant(ArgMode, dl, MVT::i8), + DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, @@ -16791,8 +14802,8 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, - DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, - false, + DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, + false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } @@ -16812,7 +14823,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, if (Opc == X86ISD::VSRAI) ShiftAmt = ElementType.getSizeInBits() - 1; else - return DAG.getConstant(0, VT); + return DAG.getConstant(0, dl, VT); } assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) @@ -16837,7 +14848,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, } ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType)); + Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: @@ -16849,7 +14860,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, } ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType)); + Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: @@ -16861,7 +14872,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, } ND = cast<ConstantSDNode>(CurrentOp); const APInt &C = ND->getAPIntValue(); - Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType)); + Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } break; } @@ -16869,7 +14880,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); } - return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8)); + return DAG.getNode(Opc, dl, VT, SrcOp, + DAG.getConstant(ShiftAmt, dl, MVT::i8)); } // getTargetVShiftNode - Handle vector element shifts where the shift amount @@ -16894,7 +14906,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, } const X86Subtarget &Subtarget = - DAG.getTarget().getSubtarget<X86Subtarget>(); + static_cast<const X86Subtarget &>(DAG.getSubtarget()); if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { // Let the shuffle legalizer expand this shift amount node. 
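getTargetVShiftByConstNode above folds out-of-range immediates instead of emitting them: a logical shift by at least the element width becomes zero, and an arithmetic shift is clamped to width - 1, which matches the saturating behaviour of the hardware vector shifts. A scalar model of that early-out (the signed case relies on the usual arithmetic right shift of negative values):

#include <cassert>
#include <cstdint>

static uint32_t vsrli32(uint32_t x, unsigned amt) {
  return amt >= 32 ? 0u : x >> amt;     // VSRLI: out of range gives zero
}

static int32_t vsrai32(int32_t x, unsigned amt) {
  return x >> (amt >= 32 ? 31 : amt);   // VSRAI: clamp to ElementBits - 1
}

int main() {
  assert(vsrli32(0xDEADBEEFu, 40) == 0);
  assert(vsrai32(-5, 40) == -1);        // only the sign remains
  assert(vsrai32(5, 40) == 0);
  return 0;
}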
@@ -16907,7 +14919,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, SmallVector<SDValue, 4> ShOps; ShOps.push_back(ShAmt); if (SVT == MVT::i32) { - ShOps.push_back(DAG.getConstant(0, SVT)); + ShOps.push_back(DAG.getConstant(0, dl, SVT)); ShOps.push_back(DAG.getUNDEF(SVT)); } ShOps.push_back(DAG.getUNDEF(SVT)); @@ -16948,7 +14960,7 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, // are extracted by EXTRACT_SUBVECTOR. SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); switch (Op.getOpcode()) { default: break; @@ -16987,54 +14999,6 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } -static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_fma_vfmadd_ps: - case Intrinsic::x86_fma_vfmadd_pd: - case Intrinsic::x86_fma_vfmadd_ps_256: - case Intrinsic::x86_fma_vfmadd_pd_256: - case Intrinsic::x86_fma_mask_vfmadd_ps_512: - case Intrinsic::x86_fma_mask_vfmadd_pd_512: - return X86ISD::FMADD; - case Intrinsic::x86_fma_vfmsub_ps: - case Intrinsic::x86_fma_vfmsub_pd: - case Intrinsic::x86_fma_vfmsub_ps_256: - case Intrinsic::x86_fma_vfmsub_pd_256: - case Intrinsic::x86_fma_mask_vfmsub_ps_512: - case Intrinsic::x86_fma_mask_vfmsub_pd_512: - return X86ISD::FMSUB; - case Intrinsic::x86_fma_vfnmadd_ps: - case Intrinsic::x86_fma_vfnmadd_pd: - case Intrinsic::x86_fma_vfnmadd_ps_256: - case Intrinsic::x86_fma_vfnmadd_pd_256: - case Intrinsic::x86_fma_mask_vfnmadd_ps_512: - case Intrinsic::x86_fma_mask_vfnmadd_pd_512: - return X86ISD::FNMADD; - case Intrinsic::x86_fma_vfnmsub_ps: - case Intrinsic::x86_fma_vfnmsub_pd: - case Intrinsic::x86_fma_vfnmsub_ps_256: - case Intrinsic::x86_fma_vfnmsub_pd_256: - case Intrinsic::x86_fma_mask_vfnmsub_ps_512: - case Intrinsic::x86_fma_mask_vfnmsub_pd_512: - return X86ISD::FNMSUB; - case Intrinsic::x86_fma_vfmaddsub_ps: - case Intrinsic::x86_fma_vfmaddsub_pd: - case Intrinsic::x86_fma_vfmaddsub_ps_256: - case Intrinsic::x86_fma_vfmaddsub_pd_256: - case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: - case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: - return X86ISD::FMADDSUB; - case Intrinsic::x86_fma_vfmsubadd_ps: - case Intrinsic::x86_fma_vfmsubadd_pd: - case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: - case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: - case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: - return X86ISD::FMSUBADD; - } -} - static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -17065,15 +15029,68 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - SDValue RoundingMode = Op.getOperand(5); + // There are 2 kinds of intrinsics in this group: + // (1) With supress-all-exceptions (sae) or rounding mode- 6 operands + // (2) With rounding mode and sae - 7 operands. + if (Op.getNumOperands() == 6) { + SDValue Sae = Op.getOperand(5); + unsigned Opc = IntrData->Opc1 ? 
IntrData->Opc1 : IntrData->Opc0; + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, + Sae), + Mask, Src0, Subtarget, DAG); + } + assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); + SDValue RoundingMode = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, - RoundingMode), + RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), - Op.getOperand(2)), - Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1,Src2), + Mask, PassThru, Subtarget, DAG); + } + case FMA_OP_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Src3, Rnd), + Mask, Src1, Subtarget, DAG); + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src1, Src2, Src3), + Mask, Src1, Subtarget, DAG); } case CMP_MASK: case CMP_MASK_CC: { @@ -17094,30 +15111,46 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask.getValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { - Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3)); + SDValue CC = Op.getOperand(3); + CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
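getVectorMaskingNode and the *_MASK intrinsic cases above all reduce to the same shape: a vector select between the raw result and the pass-through operand, steered by one mask bit per lane. A scalar model of that semantics, with ad hoc names and a plain loop instead of AVX-512 registers:

#include <cassert>
#include <cstdint>

static void masked_add(const float *a, const float *b, const float *passthru,
                       uint8_t mask, float *out, int lanes) {
  for (int i = 0; i < lanes; ++i)
    out[i] = ((mask >> i) & 1) ? a[i] + b[i]   // lane selected: the real result
                               : passthru[i];  // lane masked off: keep the source
}

int main() {
  float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40};
  float passthru[4] = {-1, -1, -1, -1}, out[4];
  masked_add(a, b, passthru, 0b0101, out, 4);
  assert(out[0] == 11 && out[1] == -1 && out[2] == 33 && out[3] == -1);
  return 0;
}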
+ if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC); + } else { assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), - Op.getOperand(2)); + Op.getOperand(2)); } SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, - DAG.getTargetConstant(0, MaskVT), + DAG.getTargetConstant(0, dl, + MaskVT), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CmpMask, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); + unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG); assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), Cond); + DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case VSHIFT: @@ -17144,7 +15177,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDLoc dl(Op); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, PassThru); @@ -17159,20 +15192,10 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDLoc dl(Op); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } - case FMA_OP_MASK: - { - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, - dl, Op.getValueType(), - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)), - Op.getOperand(4), Op.getOperand(1), - Subtarget, DAG); - } default: break; } @@ -17259,7 +15282,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue RHS = Op.getOperand(2); unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); - SDValue CC = DAG.getConstant(X86CC, MVT::i8); + SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -17268,7 +15291,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? 
X86::COND_E: X86::COND_B; SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2)); - SDValue CC = DAG.getConstant(X86CC, MVT::i8); + SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); @@ -17333,7 +15356,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), + DAG.getConstant(X86CC, dl, MVT::i8), SDValue(PCMP.getNode(), 1)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } @@ -17351,57 +15374,23 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return DAG.getNode(Opcode, dl, VTs, NewOps); } - case Intrinsic::x86_fma_mask_vfmadd_ps_512: - case Intrinsic::x86_fma_mask_vfmadd_pd_512: - case Intrinsic::x86_fma_mask_vfmsub_ps_512: - case Intrinsic::x86_fma_mask_vfmsub_pd_512: - case Intrinsic::x86_fma_mask_vfnmadd_ps_512: - case Intrinsic::x86_fma_mask_vfnmadd_pd_512: - case Intrinsic::x86_fma_mask_vfnmsub_ps_512: - case Intrinsic::x86_fma_mask_vfnmsub_pd_512: - case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: - case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: - case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: - case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: { - auto *SAE = cast<ConstantSDNode>(Op.getOperand(5)); - if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION) - return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), - dl, Op.getValueType(), - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)), - Op.getOperand(4), Op.getOperand(1), - Subtarget, DAG); - else - return SDValue(); - } + case Intrinsic::x86_seh_lsda: { + // Compute the symbol for the LSDA. We know it'll get emitted later. 
+ MachineFunction &MF = DAG.getMachineFunction(); + SDValue Op1 = Op.getOperand(1); + Op1->dump(); + auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); + MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + StringRef Name = LSDASym->getName(); + assert(Name.data()[Name.size()] == '\0' && "not null terminated"); - case Intrinsic::x86_fma_vfmadd_ps: - case Intrinsic::x86_fma_vfmadd_pd: - case Intrinsic::x86_fma_vfmsub_ps: - case Intrinsic::x86_fma_vfmsub_pd: - case Intrinsic::x86_fma_vfnmadd_ps: - case Intrinsic::x86_fma_vfnmadd_pd: - case Intrinsic::x86_fma_vfnmsub_ps: - case Intrinsic::x86_fma_vfnmsub_pd: - case Intrinsic::x86_fma_vfmaddsub_ps: - case Intrinsic::x86_fma_vfmaddsub_pd: - case Intrinsic::x86_fma_vfmsubadd_ps: - case Intrinsic::x86_fma_vfmsubadd_pd: - case Intrinsic::x86_fma_vfmadd_ps_256: - case Intrinsic::x86_fma_vfmadd_pd_256: - case Intrinsic::x86_fma_vfmsub_ps_256: - case Intrinsic::x86_fma_vfmsub_pd_256: - case Intrinsic::x86_fma_vfnmadd_ps_256: - case Intrinsic::x86_fma_vfnmadd_pd_256: - case Intrinsic::x86_fma_vfnmsub_ps_256: - case Intrinsic::x86_fma_vfnmsub_pd_256: - case Intrinsic::x86_fma_vfmaddsub_ps_256: - case Intrinsic::x86_fma_vfmaddsub_pd_256: - case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: - return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + // Generate a simple absolute symbol reference. This intrinsic is only + // supported on 32-bit Windows, which isn't PIC. + SDValue Result = + DAG.getTargetExternalSymbol(Name.data(), VT, X86II::MO_NOPREFIX); + return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); + } } } @@ -17412,17 +15401,17 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); assert(C && "Invalid scale type"); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); - SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); @@ -17438,15 +15427,15 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); assert(C && "Invalid scale type"); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); - SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) - MaskInReg = 
DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); @@ -17461,15 +15450,15 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); assert(C && "Invalid scale type"); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8); - SDValue Disp = DAG.getTargetConstant(0, MVT::i32); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) - MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT); + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask); //SDVTList VTs = DAG.getVTList(MVT::Other); @@ -17510,7 +15499,7 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL, // The EAX register is loaded with the low-order 32 bits. The EDX register // is loaded with the supported high-order bits of the counter. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, - DAG.getConstant(32, MVT::i8)); + DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; @@ -17564,7 +15553,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, // The EDX register is loaded with the high-order 32 bits of the MSR, and // the EAX register is loaded with the low-order 32 bits. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, - DAG.getConstant(32, MVT::i8)); + DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; @@ -17609,8 +15598,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. 
SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), - DAG.getConstant(1, Op->getValueType(1)), - DAG.getConstant(X86::COND_B, MVT::i32), + DAG.getConstant(1, dl, Op->getValueType(1)), + DAG.getConstant(X86::COND_B, dl, MVT::i32), SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, DAG.getVTList(Op->getValueType(1), MVT::Glue), @@ -17628,8 +15617,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, - Subtarget); + return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, + Chain, Subtarget); } case SCATTER: { //scatter(base, mask, index, v1, scale); @@ -17639,14 +15628,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, + Scale, Chain); } case PREFETCH: { SDValue Hint = Op.getOperand(6); - unsigned HintVal; - if (dyn_cast<ConstantSDNode> (Hint) == nullptr || - (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) - llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); + unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); + assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); @@ -17658,7 +15646,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector<SDValue, 2> Results; - getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); + getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, + Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. 
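The RDTSC/RDPMC lowerings above rebuild the 64-bit counter from EDX:EAX with a 32-bit shift and an OR. A minimal scalar sketch of the same recombination, assuming GCC/Clang inline-asm syntax on an x86 host (the helper name is made up):

    #include <cstdint>
    #include <cstdio>

    static uint64_t read_tsc() {
      uint32_t lo, hi;
      __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
      return (uint64_t(hi) << 32) | lo;  // SHL HI, 32 then OR with LO
    }

    int main() { std::printf("%llu\n", (unsigned long long)read_tsc()); }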
@@ -17672,7 +15661,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_NE, MVT::i8), + DAG.getConstant(X86::COND_NE, dl, MVT::i8), InTrans); SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), @@ -17684,14 +15673,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), - DAG.getConstant(-1, MVT::i8)); + DAG.getConstant(-1, dl, MVT::i8)); SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), Op.getOperand(4), GenCF.getValue(1)); SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), Op.getOperand(5), MachinePointerInfo(), false, false, 0); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_B, MVT::i8), + DAG.getConstant(X86::COND_B, dl, MVT::i8), Res.getValue(1)); Results.push_back(SetCC); Results.push_back(Store); @@ -17715,7 +15704,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Mask.getValueType().getSizeInBits()); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, DAG.getUNDEF(VT)); @@ -17739,15 +15728,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Mask.getValueType().getSizeInBits()); SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, 0); - SmallVector<SDValue, 2> Results; - Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, - PathThru)); - Results.push_back(Chain); + SDValue Results[] = { + DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru), + Chain}; return DAG.getMergeValues(Results, dl); } } @@ -17767,9 +15755,8 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); - SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), @@ -17783,16 +15770,33 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + EVT VT = 
Op.getValueType(); + MFI->setFrameAddressIsTaken(true); - EVT VT = Op.getValueType(); + if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { + // Depth > 0 makes no sense on targets which use Windows unwind codes. It + // is not possible to crawl up the stack without looking at the unwind codes + // simultaneously. + int FrameAddrIndex = FuncInfo->getFAIndex(); + if (!FrameAddrIndex) { + // Set up a frame object for the return address. + unsigned SlotSize = RegInfo->getSlotSize(); + FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( + SlotSize, /*Offset=*/0, /*IsImmutable=*/false); + FuncInfo->setFAIndex(FrameAddrIndex); + } + return DAG.getFrameIndex(FrameAddrIndex, VT); + } + + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); - unsigned FrameReg = RegInfo->getPtrSizedFrameRegister( - DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -17819,9 +15823,8 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); - return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { @@ -17831,8 +15834,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDLoc dl (Op); EVT PtrVT = getPointerTy(); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - DAG.getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -17841,7 +15843,8 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, - DAG.getIntPtrConstant(RegInfo->getSlotSize())); + DAG.getIntPtrConstant(RegInfo->getSlotSize(), + dl)); StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); @@ -17879,7 +15882,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -17896,12 +15899,12 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // Load the pointer to the nested function into R11. 
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; - OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(2, MVT::i64)); + DAG.getConstant(2, dl, MVT::i64)); OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), false, false, 2); @@ -17910,13 +15913,13 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // R10 is specified in X86CallingConv.td OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(10, MVT::i64)); - OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + DAG.getConstant(10, dl, MVT::i64)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 10), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(12, MVT::i64)); + DAG.getConstant(12, dl, MVT::i64)); OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), false, false, 2); @@ -17924,16 +15927,16 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(20, MVT::i64)); - OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + DAG.getConstant(20, dl, MVT::i64)); + OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 20), false, false, 0); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, - DAG.getConstant(22, MVT::i64)); - OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, - MachinePointerInfo(TrmpAddr, 22), + DAG.getConstant(22, dl, MVT::i64)); + OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), + Addr, MachinePointerInfo(TrmpAddr, 22), false, false, 0); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); @@ -17986,32 +15989,32 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDValue Addr, Disp; Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, - DAG.getConstant(10, MVT::i32)); + DAG.getConstant(10, dl, MVT::i32)); Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, - DAG.getConstant(MOV32ri|N86Reg, MVT::i8), + DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr), false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, - DAG.getConstant(1, MVT::i32)); + DAG.getConstant(1, dl, MVT::i32)); OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), false, false, 1); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
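The 64-bit INIT_TRAMPOLINE lowering above stores two movabs opcodes, their immediates, and an indirect jump at fixed offsets (0, 2, 10, 12, 20 and 22, matching the MachinePointerInfo(TrmpAddr, N) stores). A byte-level sketch of what ends up in the 23-byte buffer; the function name and buffer are hypothetical and a little-endian host is assumed:

    #include <cstdint>
    #include <cstring>

    static void buildTrampoline(uint8_t *Trmp, uint64_t FPtr, uint64_t Nest) {
      const uint8_t MovR11[] = {0x49, 0xBB};        // movabs r11, <imm64>
      const uint8_t MovR10[] = {0x49, 0xBA};        // movabs r10, <imm64>
      const uint8_t JmpR11[] = {0x41, 0xFF, 0xE3};  // jmp r11
      std::memcpy(Trmp + 0,  MovR11, 2);
      std::memcpy(Trmp + 2,  &FPtr, 8);   // pointer to the nested function
      std::memcpy(Trmp + 10, MovR10, 2);
      std::memcpy(Trmp + 12, &Nest, 8);   // static chain value (R10, per X86CallingConv.td)
      std::memcpy(Trmp + 20, JmpR11, 3);
    }

    int main() {
      uint8_t Buf[23];
      buildTrampoline(Buf, 0x1122334455667788ull, 0xAABBCCDDEEFF0011ull);
      return Buf[0] == 0x49 ? 0 : 1;
    }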
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, - DAG.getConstant(5, MVT::i32)); - OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, - MachinePointerInfo(TrmpAddr, 5), + DAG.getConstant(5, dl, MVT::i32)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), + Addr, MachinePointerInfo(TrmpAddr, 5), false, false, 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, - DAG.getConstant(6, MVT::i32)); + DAG.getConstant(6, dl, MVT::i32)); OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), false, false, 1); @@ -18042,8 +16045,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, */ MachineFunction &MF = DAG.getMachineFunction(); - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); @@ -18069,20 +16071,20 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, - CWD, DAG.getConstant(0x800, MVT::i16)), - DAG.getConstant(11, MVT::i8)); + CWD, DAG.getConstant(0x800, DL, MVT::i16)), + DAG.getConstant(11, DL, MVT::i8)); SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, - CWD, DAG.getConstant(0x400, MVT::i16)), - DAG.getConstant(9, MVT::i8)); + CWD, DAG.getConstant(0x400, DL, MVT::i16)), + DAG.getConstant(9, DL, MVT::i8)); SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, DAG.getNode(ISD::ADD, DL, MVT::i16, DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), - DAG.getConstant(1, MVT::i16)), - DAG.getConstant(3, MVT::i16)); + DAG.getConstant(1, DL, MVT::i16)), + DAG.getConstant(3, DL, MVT::i16)); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); @@ -18108,14 +16110,15 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { // If src is zero (i.e. bsr sets ZF), returns NumBits. SDValue Ops[] = { Op, - DAG.getConstant(NumBits+NumBits-1, OpVT), - DAG.getConstant(X86::COND_E, MVT::i8), + DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), + DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); // Finally xor with NumBits-1. - Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, + DAG.getConstant(NumBits - 1, dl, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); @@ -18140,7 +16143,8 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); // And xor with NumBits-1. - Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, + DAG.getConstant(NumBits - 1, dl, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); @@ -18160,8 +16164,8 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { // If src is zero (i.e. bsf sets ZF), returns NumBits. 
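The CTLZ lowering above turns the BSR result into a leading-zero count by XOR-ing with NumBits-1; since BSR's result lies in [0, NumBits-1], that XOR equals the subtraction NumBits-1 - BSR. A scalar check of the identity (a sketch; bsr32 is a made-up helper and __builtin_clz is GCC/Clang specific):

    #include <cassert>
    #include <cstdint>

    // What BSR returns for a non-zero input: the index of the highest set bit.
    static unsigned bsr32(uint32_t X) {
      unsigned I = 0;
      while (X >>= 1) ++I;
      return I;
    }

    int main() {
      const uint32_t Vals[] = {1u, 2u, 3u, 0x80u, 0x12345678u, 0xffffffffu};
      for (uint32_t X : Vals) {
        unsigned B = bsr32(X);
        assert((B ^ 31u) == 31u - B);                    // XOR acts as the subtraction
        assert((B ^ 31u) == unsigned(__builtin_clz(X))); // GCC/Clang builtin
      }
      return 0;
    }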
SDValue Ops[] = { Op, - DAG.getConstant(NumBits, VT), - DAG.getConstant(X86::COND_E, MVT::i8), + DAG.getConstant(NumBits, dl, VT), + DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); @@ -18197,6 +16201,9 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); @@ -18204,6 +16211,9 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); @@ -18215,6 +16225,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + if (VT == MVT::i1) + return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); + // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntArith(Op, DAG); @@ -18222,6 +16235,79 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); + // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector + // pairs, multiply and truncate. + if (VT == MVT::v16i8 || VT == MVT::v32i8) { + if (Subtarget->hasInt256()) { + if (VT == MVT::v32i8) { + MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2); + SDValue Lo = DAG.getIntPtrConstant(0, dl); + SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); + SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo); + SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo); + SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi); + SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo), + DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi)); + } + + MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + return DAG.getNode( + ISD::TRUNCATE, dl, VT, + DAG.getNode(ISD::MUL, dl, ExVT, + DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), + DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); + } + + assert(VT == MVT::v16i8 && + "Pre-AVX2 support only supports v16i8 multiplication"); + MVT ExVT = MVT::v8i16; + + // Extract the lo parts and sign extend to i16 + SDValue ALo, BLo; + if (Subtarget->hasSSE41()) { + ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); + BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); + } else { + const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, + -1, 4, -1, 5, -1, 6, -1, 7}; + ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + ALo = DAG.getNode(ISD::BITCAST, dl, ExVT, ALo); + BLo = DAG.getNode(ISD::BITCAST, dl, ExVT, BLo); + ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); + BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); + } + + // Extract the hi parts and sign extend to i16 + SDValue AHi, BHi; + if 
(Subtarget->hasSSE41()) { + const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1}; + AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); + BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); + } else { + const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, + -1, 12, -1, 13, -1, 14, -1, 15}; + AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + AHi = DAG.getNode(ISD::BITCAST, dl, ExVT, AHi); + BHi = DAG.getNode(ISD::BITCAST, dl, ExVT, BHi); + AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); + BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); + } + + // Multiply, mask the lower 8bits of the lo/hi results and pack + SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); + SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); + RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); + RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); + } + // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. if (VT == MVT::v4i32) { assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && @@ -18394,7 +16480,8 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, // unsigned multiply. if (IsSigned && !Subtarget->hasSSE41()) { SDValue ShAmt = - DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); + DAG.getConstant(31, dl, + DAG.getTargetLoweringInfo().getShiftAmountTy(VT)); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, @@ -18410,6 +16497,53 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Ops, dl); } +// Return true if the requred (according to Opcode) shift-imm form is natively +// supported by the Subtarget +static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + if (VT.getScalarSizeInBits() < 16) + return false; + + if (VT.is512BitVector() && + (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI())) + return true; + + bool LShift = VT.is128BitVector() || + (VT.is256BitVector() && Subtarget->hasInt256()); + + bool AShift = LShift && (Subtarget->hasVLX() || + (VT != MVT::v2i64 && VT != MVT::v4i64)); + return (Opcode == ISD::SRA) ? AShift : LShift; +} + +// The shift amount is a variable, but it is the same for all vector lanes. +// These instrcutions are defined together with shift-immediate. +static +bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); +} + +// Return true if the requred (according to Opcode) variable-shift form is +// natively supported by the Subtarget +static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + + if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16) + return false; + + // vXi16 supported only on AVX-512, BWI + if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI()) + return false; + + if (VT.is512BitVector() || Subtarget->hasVLX()) + return true; + + bool LShift = VT.is128BitVector() || VT.is256BitVector(); + bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; + return (Opcode == ISD::SRA) ? 
AShift : LShift; +} + static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -18417,97 +16551,44 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); + unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; + // Optimize shl/srl/sra with constant shift amount. if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { uint64_t ShiftAmt = ShiftConst->getZExtValue(); - if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || - (Subtarget->hasInt256() && - (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) || - (Subtarget->hasAVX512() && - (VT == MVT::v8i64 || VT == MVT::v16i32))) { - if (Op.getOpcode() == ISD::SHL) - return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, - DAG); - if (Op.getOpcode() == ISD::SRL) - return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, - DAG); - if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) - return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, - DAG); - } + if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); - if (VT == MVT::v16i8) { - if (Op.getOpcode() == ISD::SHL) { - // Make a large shift. - SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, - MVT::v8i16, R, ShiftAmt, - DAG); - SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); - // Zero out the rightmost bits. - SmallVector<SDValue, 16> V(16, - DAG.getConstant(uint8_t(-1U << ShiftAmt), - MVT::i8)); - return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); - } - if (Op.getOpcode() == ISD::SRL) { - // Make a large shift. - SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, - MVT::v8i16, R, ShiftAmt, - DAG); - SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); - // Zero out the leftmost bits. - SmallVector<SDValue, 16> V(16, - DAG.getConstant(uint8_t(-1U) >> ShiftAmt, - MVT::i8)); - return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); - } - if (Op.getOpcode() == ISD::SRA) { - if (ShiftAmt == 7) { - // R s>> 7 === R s< 0 - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); - } + if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { + unsigned NumElts = VT.getVectorNumElements(); + MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); - // R s>> a === ((R u>> a) ^ m) - m - SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); - Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); - Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); - return Res; - } - llvm_unreachable("Unknown shift opcode."); - } - - if (Subtarget->hasInt256() && VT == MVT::v32i8) { if (Op.getOpcode() == ISD::SHL) { + // Simple i8 add case + if (ShiftAmt == 1) + return DAG.getNode(ISD::ADD, dl, VT, R, R); + // Make a large shift. - SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, - MVT::v16i16, R, ShiftAmt, - DAG); + SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, + R, ShiftAmt, DAG); SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); // Zero out the rightmost bits. 
- SmallVector<SDValue, 32> V(32, - DAG.getConstant(uint8_t(-1U << ShiftAmt), - MVT::i8)); + SmallVector<SDValue, 32> V( + NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. - SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, - MVT::v16i16, R, ShiftAmt, - DAG); + SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, + R, ShiftAmt, DAG); SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); // Zero out the leftmost bits. - SmallVector<SDValue, 32> V(32, - DAG.getConstant(uint8_t(-1U) >> ShiftAmt, - MVT::i8)); + SmallVector<SDValue, 32> V( + NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); } @@ -18520,8 +16601,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // R s>> a === ((R u>> a) ^ m) - m SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, - MVT::i8)); + SmallVector<SDValue, 32> V(NumElts, + DAG.getConstant(128 >> ShiftAmt, dl, + MVT::i8)); SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); @@ -18563,19 +16645,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, if (ShAmt != ShiftAmt) return SDValue(); } - switch (Op.getOpcode()) { - default: - llvm_unreachable("Unknown shift opcode!"); - case ISD::SHL: - return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt, - DAG); - case ISD::SRL: - return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, - DAG); - case ISD::SRA: - return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt, - DAG); - } + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); } return SDValue(); @@ -18588,12 +16658,13 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) || - VT == MVT::v4i32 || VT == MVT::v8i16 || - (Subtarget->hasInt256() && - ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) || - VT == MVT::v8i32 || VT == MVT::v16i16)) || - (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) { + unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; + + unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA; + + if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { SDValue BaseShAmt; EVT EltVT = VT.getVectorElementType(); @@ -18626,7 +16697,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (!BaseShAmt) // Avoid introducing an extract element from a shuffle. 
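The byte-wide arithmetic-shift path above relies on the identity R s>> a == ((R u>> a) ^ m) - m with m = 0x80 >> a. A scalar sketch that verifies it over every i8 value (sra8 is a made-up name; the vector code applies the same XOR/SUB per lane):

    #include <cassert>
    #include <cstdint>

    // m is the sign bit after the logical shift; XOR-ing it in and subtracting
    // it back sign-extends the logically shifted value.
    static int8_t sra8(int8_t R, unsigned A) {
      uint8_t U = uint8_t(uint8_t(R) >> A); // R u>> a
      uint8_t M = uint8_t(0x80u >> A);
      return int8_t((U ^ M) - M);           // ((R u>> a) ^ m) - m
    }

    int main() {
      for (int R = -128; R <= 127; ++R)
        for (unsigned A = 0; A < 8; ++A)
          assert(sra8(int8_t(R), A) == int8_t(R >> A));
      return 0;
    }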
BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, - DAG.getIntPtrConstant(SplatIdx)); + DAG.getIntPtrConstant(SplatIdx, dl)); } } @@ -18637,54 +16708,12 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); - switch (Op.getOpcode()) { - default: - llvm_unreachable("Unknown shift opcode!"); - case ISD::SHL: - switch (VT.SimpleTy) { - default: return SDValue(); - case MVT::v2i64: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v4i64: - case MVT::v8i32: - case MVT::v16i16: - case MVT::v16i32: - case MVT::v8i64: - return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG); - } - case ISD::SRA: - switch (VT.SimpleTy) { - default: return SDValue(); - case MVT::v4i32: - case MVT::v8i16: - case MVT::v8i32: - case MVT::v16i16: - case MVT::v16i32: - case MVT::v8i64: - return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG); - } - case ISD::SRL: - switch (VT.SimpleTy) { - default: return SDValue(); - case MVT::v2i64: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v4i64: - case MVT::v8i32: - case MVT::v16i16: - case MVT::v16i32: - case MVT::v8i64: - return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG); - } - } + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); } } // Special case in 32-bit mode, where i64 is expanded into high and low parts. - if (!Subtarget->is64Bit() && - (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) || - (Subtarget->hasAVX512() && VT == MVT::v8i64)) && + if (!Subtarget->is64Bit() && VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); @@ -18698,18 +16727,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } - switch (Op.getOpcode()) { - default: - llvm_unreachable("Unknown shift opcode!"); - case ISD::SHL: - return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1)); - case ISD::SRL: - return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1)); - case ISD::SRA: - return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1)); - } + return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); } - return SDValue(); } @@ -18719,33 +16738,28 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - SDValue V; assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); - V = LowerScalarImmediateShift(Op, DAG, Subtarget); - if (V.getNode()) + if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) return V; - V = LowerScalarVariableShift(Op, DAG, Subtarget); - if (V.getNode()) + if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) return V; - if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64)) + if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) return Op; - // AVX2 has VPSLLV/VPSRAV/VPSRLV. 
- if (Subtarget->hasInt256()) { - if (Op.getOpcode() == ISD::SRL && - (VT == MVT::v2i64 || VT == MVT::v4i32 || - VT == MVT::v4i64 || VT == MVT::v8i32)) - return Op; - if (Op.getOpcode() == ISD::SHL && - (VT == MVT::v2i64 || VT == MVT::v4i32 || - VT == MVT::v4i64 || VT == MVT::v8i32)) - return Op; - if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32)) - return Op; + + // 2i64 vector logical shifts can efficiently avoid scalarization - do the + // shifts per-lane and then shuffle the partial results back together. + if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { + // Splat the shift amounts so the scalar shifts above will catch it. + SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); + SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); + SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); + return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } // If possible, lower this packed shift into a vector multiply instead of @@ -18775,7 +16789,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, Elts.push_back(DAG.getUNDEF(SVT)); continue; } - Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT)); + Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); } SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); return DAG.getNode(ISD::MUL, dl, VT, R, BV); @@ -18783,9 +16797,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // Lower SHL with variable shift amount. if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { - Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); - Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); + Op = DAG.getNode(ISD::ADD, dl, VT, Op, + DAG.getConstant(0x3f800000U, dl, VT)); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); @@ -18849,10 +16864,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // Replace this node with two shifts followed by a MOVSS/MOVSD. 
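The variable v4i32 SHL above builds 2^amt in each lane by shifting the shift amount into the float exponent field (adding 0x3f800000, the bit pattern of 1.0f), converting back to integer, and multiplying. A scalar sketch of the same trick, assuming shift amounts small enough that the product does not overflow (shlViaFloat is a made-up name):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint32_t shlViaFloat(uint32_t X, uint32_t K) {
      uint32_t Bits = (K << 23) + 0x3f800000u; // exponent becomes 127 + K, i.e. 2^K
      float P;
      std::memcpy(&P, &Bits, sizeof(P));
      return X * uint32_t(P);                  // x * 2^K == x << K
    }

    int main() {
      for (uint32_t K = 0; K < 24; ++K)
        assert(shlViaFloat(5, K) == (5u << K));
      return 0;
    }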
EVT CastVT = MVT::v4i32; SDValue Splat1 = - DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); + DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); SDValue Splat2 = - DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); + DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); if (TargetOpcode == X86ISD::MOVSD) CastVT = MVT::v2i64; @@ -18865,24 +16880,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { - assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); + // Turn 'a' into a mask suitable for VSELECT: a = a << 5; + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, dl, VT)); - // a = a << 5; - Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); - Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); - - // Turn 'a' into a mask suitable for VSELECT - SDValue VSelM = DAG.getConstant(0x80, VT); + SDValue VSelM = DAG.getConstant(0x80, dl, VT); SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); - SDValue CM1 = DAG.getConstant(0x0f, VT); - SDValue CM2 = DAG.getConstant(0x3f, VT); - - // r = VSELECT(r, psllw(r & (char16)15, 4), a); - SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); - M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG); - M = DAG.getNode(ISD::BITCAST, dl, VT, M); + // r = VSELECT(r, shl(r, 4), a); + SDValue M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(4, dl, VT)); R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); // a += a @@ -18890,10 +16896,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); - // r = VSELECT(r, psllw(r & (char16)63, 2), a); - M = DAG.getNode(ISD::AND, dl, VT, R, CM2); - M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG); - M = DAG.getNode(ISD::BITCAST, dl, VT, M); + // r = VSELECT(r, shl(r, 2), a); + M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(2, dl, VT)); R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); // a += a @@ -18911,14 +16915,32 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // the extra overheads to get from v16i8 to v8i32 make the existing SSE // solution better. if (Subtarget->hasInt256() && VT == MVT::v8i16) { - MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16; + MVT ExtVT = MVT::v8i32; unsigned ExtOpc = Op.getOpcode() == ISD::SRA ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - R = DAG.getNode(ExtOpc, dl, NewVT, R); - Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt); + R = DAG.getNode(ExtOpc, dl, ExtVT, R); + Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt)); - } + DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); + } + + if (Subtarget->hasInt256() && VT == MVT::v16i16) { + MVT ExtVT = MVT::v8i32; + SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); + SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R); + ALo = DAG.getNode(ISD::BITCAST, dl, ExtVT, ALo); + AHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, AHi); + RLo = DAG.getNode(ISD::BITCAST, dl, ExtVT, RLo); + RHi = DAG.getNode(ISD::BITCAST, dl, ExtVT, RHi); + SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); + SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); + Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); + Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); + } // Decompose 256-bit shifts into smaller 128-bit shifts. if (VT.is256BitVector()) { @@ -18934,12 +16956,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, SDValue Amt1, Amt2; if (Amt.getOpcode() == ISD::BUILD_VECTOR) { // Constant shift amount - SmallVector<SDValue, 4> Amt1Csts; - SmallVector<SDValue, 4> Amt2Csts; - for (unsigned i = 0; i != NumElems/2; ++i) - Amt1Csts.push_back(Amt->getOperand(i)); - for (unsigned i = NumElems/2; i != NumElems; ++i) - Amt2Csts.push_back(Amt->getOperand(i)); + SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems); + ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); + ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); @@ -19021,7 +17040,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(X86::COND_O, MVT::i32), + DAG.getConstant(X86::COND_O, DL, MVT::i32), SDValue(Sum.getNode(), 2)); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); @@ -19034,87 +17053,23 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), - DAG.getConstant(Cond, MVT::i32), + DAG.getConstant(Cond, DL, MVT::i32), SDValue(Sum.getNode(), 1)); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -// Sign extension of the low part of vector elements. This may be used either -// when sign extend instructions are not available or if the vector element -// sizes already match the sign-extended size. If the vector elements are in -// their pre-extended size and sign extend instructions are available, that will -// be handled by LowerSIGN_EXTEND. 
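The LowerSIGN_EXTEND_INREG code being removed here implemented in-register sign extension as a shift-left followed by an arithmetic shift-right by the same BitsDiff amount. The scalar equivalent of that shift pair, as a sketch (signExtendInReg is a made-up name):

    #include <cassert>
    #include <cstdint>

    // Sign-extend the low N bits of a 32-bit lane in place: shift the field up
    // to the top, then shift back arithmetically (VSHLI then VSRAI below).
    static int32_t signExtendInReg(int32_t X, unsigned N) {
      unsigned Diff = 32 - N; // BitsDiff in the removed code
      return int32_t(uint32_t(X) << Diff) >> Diff;
    }

    int main() {
      assert(signExtendInReg(0xFF, 8) == -1);
      assert(signExtendInReg(0x7F, 8) == 127);
      assert(signExtendInReg(0x8000, 16) == -32768);
      return 0;
    }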
-SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); - MVT VT = Op.getSimpleValueType(); - - if (!Subtarget->hasSSE2() || !VT.isVector()) - return SDValue(); - - unsigned BitsDiff = VT.getScalarType().getSizeInBits() - - ExtraVT.getScalarType().getSizeInBits(); - - switch (VT.SimpleTy) { - default: return SDValue(); - case MVT::v8i32: - case MVT::v16i16: - if (!Subtarget->hasFp256()) - return SDValue(); - if (!Subtarget->hasInt256()) { - // needs to be split - unsigned NumElems = VT.getVectorNumElements(); - - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); - - MVT EltVT = VT.getVectorElementType(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - EVT ExtraEltVT = ExtraVT.getVectorElementType(); - unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); - ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, - ExtraNumElems/2); - SDValue Extra = DAG.getValueType(ExtraVT); - - LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); - LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); - } - // fall through - case MVT::v4i32: - case MVT::v8i16: { - SDValue Op0 = Op.getOperand(0); - - // This is a sign extension of some low part of vector elements without - // changing the size of the vector elements themselves: - // Shift-Left + Shift-Right-Algebraic. - SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, - BitsDiff, DAG); - return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff, - DAG); - } - } -} - /// Returns true if the operand type is exactly twice the native width, and /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { - const X86Subtarget &Subtarget = - getTargetMachine().getSubtarget<X86Subtarget>(); unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b else if (OpWidth == 128) - return Subtarget.hasCmpxchg16b(); + return Subtarget->hasCmpxchg16b(); else return false; } @@ -19130,16 +17085,17 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { return needsCmpXchgNb(PTy->getElementType()); } -bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { - const X86Subtarget &Subtarget = - getTargetMachine().getSubtarget<X86Subtarget>(); - unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; +TargetLoweringBase::AtomicRMWExpansionKind +X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; const Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. - if (MemType->getPrimitiveSizeInBits() > NativeWidth) - return needsCmpXchgNb(MemType); + if (MemType->getPrimitiveSizeInBits() > NativeWidth) { + return needsCmpXchgNb(MemType) ? 
AtomicRMWExpansionKind::CmpXChg + : AtomicRMWExpansionKind::None; + } AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { @@ -19149,13 +17105,14 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. - return false; + return AtomicRMWExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. - return !AI->use_empty(); + return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg + : AtomicRMWExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: @@ -19163,7 +17120,7 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. - return true; + return AtomicRMWExpansionKind::CmpXChg; } } @@ -19176,9 +17133,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) { LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { - const X86Subtarget &Subtarget = - getTargetMachine().getSubtarget<X86Subtarget>(); - unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; const Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually @@ -19210,21 +17165,21 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // otherwise, we might be able to be more agressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. - if (SynchScope == SingleThread) { + if (SynchScope == SingleThread) // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; - } else if (hasMFENCE(Subtarget)) { - Function *MFence = llvm::Intrinsic::getDeclaration(M, - Intrinsic::x86_sse2_mfence); - Builder.CreateCall(MFence); - } else { + + if (!hasMFENCE(*Subtarget)) // FIXME: it might make sense to use a locked operation here but on a // different cache-line to prevent cache-line bouncing. In practice it // is probably a small win, and x86 processors without mfence are rare // enough that we do not bother. return nullptr; - } + + Function *MFence = + llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); + Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, @@ -19250,13 +17205,13 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); - SDValue Zero = DAG.getConstant(0, MVT::i32); + SDValue Zero = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { - DAG.getRegister(X86::ESP, MVT::i32), // Base - DAG.getTargetConstant(1, MVT::i8), // Scale - DAG.getRegister(0, MVT::i32), // Index - DAG.getTargetConstant(0, MVT::i32), // Disp - DAG.getRegister(0, MVT::i32), // Segment. 
+ DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, dl, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(0, dl, MVT::i32), // Disp + DAG.getRegister(0, MVT::i32), // Segment. Zero, Chain }; @@ -19289,7 +17244,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), - DAG.getTargetConstant(size, MVT::i8), + DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); @@ -19301,7 +17256,8 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, MVT::i32, cpOut.getValue(2)); SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), - DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + DAG.getConstant(X86::COND_E, DL, MVT::i8), + EFLAGS); DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); @@ -19330,18 +17286,16 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SmallVector<SDValue, 16> Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, - DAG.getIntPtrConstant(i))); + DAG.getIntPtrConstant(i, dl))); // Explicitly mark the extra elements as Undef. - SDValue Undef = DAG.getUNDEF(SVT); - for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i) - Elts.push_back(Undef); + Elts.append(NumElts, DAG.getUNDEF(SVT)); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, dl)); } assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && @@ -19396,12 +17350,15 @@ static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, bool NeedsBitcast = EltVT == MVT::i32; MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64; - SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT); - SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT); - SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT); + SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, + EltVT); + SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, + EltVT); + SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, + EltVT); // v = v - ((v >> 1) & 0x55555555...) - SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT)); + SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, dl, EltVT)); SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); if (NeedsBitcast) @@ -19420,7 +17377,7 @@ static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) 
SmallVector<SDValue, 8> Mask33(NumElts, Cst33); SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); - SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT)); + SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, dl, EltVT)); SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); @@ -19439,7 +17396,7 @@ static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); // v = (v + (v >> 4)) & 0x0F0F0F0F... - SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT)); + SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, dl, EltVT)); SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); @@ -19472,7 +17429,7 @@ static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, Add = And; SmallVector<SDValue, 8> Csts; for (unsigned i = 8; i <= Len/2; i *= 2) { - Csts.assign(NumElts, DAG.getConstant(i, EltVT)); + Csts.assign(NumElts, DAG.getConstant(i, dl, EltVT)); SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); @@ -19480,7 +17437,8 @@ static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, } // The result is on the least significant 6-bits on i32 and 7-bits on i64. - SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT); + SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), dl, + EltVT); SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F); SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); if (NeedsBitcast) { @@ -19499,7 +17457,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Node); EVT T = Node->getValueType(0); SDValue negOp = DAG.getNode(ISD::SUB, dl, T, - DAG.getConstant(0, T), Node->getOperand(2)); + DAG.getConstant(0, dl, T), Node->getOperand(2)); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), Node->getOperand(0), @@ -19605,19 +17563,110 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, // Returned in bits 0:31 and 32:64 xmm0. 
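The LowerCTPOP hunks above only thread the new SDLoc argument into DAG.getConstant; the algorithm itself is the usual parallel bit-count spelled out in the comments (0x55/0x33/0x0F masks, then a shift-and-add fold). As a reference, the same sequence for a single 32-bit element written as a plain scalar function (popcount32 is an illustrative helper, not code from this patch):

    #include <cstdint>

    // Illustrative scalar sketch only: the per-element sequence LowerCTPOP emits.
    uint32_t popcount32(uint32_t v) {
      v = v - ((v >> 1) & 0x55555555u);                 // 2-bit partial sums
      v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); // 4-bit partial sums
      v = (v + (v >> 4)) & 0x0F0F0F0Fu;                 // 8-bit partial sums
      v = v + (v >> 8);                                 // fold bytes toward
      v = v + (v >> 16);                                // the low byte
      return v & 0x3Fu;                                 // result fits in 6 bits
    }

For 64-bit elements the fold loop above continues through a 32-bit shift and the final mask is 0x7F, matching the Cst3F constant in the hunk.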
SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, - CallResult.first, DAG.getIntPtrConstant(0)); + CallResult.first, DAG.getIntPtrConstant(0, dl)); SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, - CallResult.first, DAG.getIntPtrConstant(1)); + CallResult.first, DAG.getIntPtrConstant(1, dl)); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX512() && + "MGATHER/MSCATTER are supported on AVX-512 arch only"); + + MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); + EVT VT = N->getValue().getValueType(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); + SDLoc dl(Op); + + // X86 scatter kills mask register, so its type should be added to + // the list of return values + if (N->getNumValues() == 1) { + SDValue Index = N->getIndex(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getValueType().is512BitVector()) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + + SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); + } + return Op; +} + +static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX512() && + "MGATHER/MSCATTER are supported on AVX-512 arch only"); + + MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); + EVT VT = Op.getValueType(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); + SDLoc dl(Op); + + SDValue Index = N->getIndex(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getValueType().is512BitVector()) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + } + return Op; +} + +SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Eventually, the lowering of these nodes should be informed by or + // deferred to the GC strategy for the function in which they appear. For + // now, however, they must be lowered to something. Since they are logically + // no-ops in the case of a null GC strategy (or a GC strategy which does not + // require special handling for these nodes), lower them as literal NOOPs for + // the time being. + SmallVector<SDValue, 2> Ops; + + Ops.push_back(Op.getOperand(0)); + if (Op->getGluedNode()) + Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); + + SDLoc OpDL(Op); + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); + + return NOOP; +} + +SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Eventually, the lowering of these nodes should be informed by or + // deferred to the GC strategy for the function in which they appear. For + // now, however, they must be lowered to something. 
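The new LowerMSCATTER/LowerMGATHER hooks above only massage the node so the AVX-512 patterns can match, chiefly by sign-extending a sub-512-bit index vector to v8i64 when VLX is unavailable; the memory semantics are unchanged. For reference, a scalar model of an eight-lane masked gather (maskedGather8 is a hypothetical helper, not part of the patch; masked scatter is the store-side dual):

    #include <array>
    #include <cstdint>

    // Illustrative scalar sketch only: each active lane loads Base[Index[i]];
    // inactive lanes keep the pass-through value, mirroring llvm.masked.gather.
    std::array<int32_t, 8> maskedGather8(const int32_t *Base,
                                         const std::array<int64_t, 8> &Index,
                                         const std::array<bool, 8> &Mask,
                                         const std::array<int32_t, 8> &PassThru) {
      std::array<int32_t, 8> Result{};
      for (int i = 0; i < 8; ++i)
        Result[i] = Mask[i] ? Base[Index[i]] : PassThru[i];
      return Result;
    }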
Since they are logically + // no-ops in the case of a null GC strategy (or a GC strategy which does not + // require special handling for these nodes), lower them as literal NOOPs for + // the time being. + SmallVector<SDValue, 2> Ops; + + Ops.push_back(Op.getOperand(0)); + if (Op->getGluedNode()) + Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); + + SDLoc OpDL(Op); + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); + + return NOOP; +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); - case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); @@ -19625,8 +17674,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); - case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); + case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); @@ -19647,6 +17696,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); @@ -19700,6 +17751,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); + case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); + case ISD::GC_TRANSITION_START: + return LowerGC_TRANSITION_START(Op, DAG); + case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); } } @@ -19747,6 +17803,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FP_TO_SINT: + // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert + // (FP_TO_SINT (load f16)) to FP_TO_INT*. 
+ if (N->getOperand(0).getValueType() == MVT::f16) + break; + // fallthrough case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; @@ -19775,7 +17836,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N->getOperand(0)); - SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, @@ -19792,6 +17853,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V); return; } + case ISD::FP_EXTEND: { + // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. + // No other ValueType for FP_EXTEND should reach this point. + assert(N->getValueType(0) == MVT::v2f32 && + "Do not know how to legalize this Node"); + return; + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); switch (IntNo) { @@ -19818,9 +17886,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), - DAG.getConstant(0, HalfT)); + DAG.getConstant(0, dl, HalfT)); cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), - DAG.getConstant(1, HalfT)); + DAG.getConstant(1, dl, HalfT)); cpInL = DAG.getCopyToReg(N->getOperand(0), dl, Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); @@ -19829,9 +17897,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), - DAG.getConstant(0, HalfT)); + DAG.getConstant(0, dl, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), - DAG.getConstant(1, HalfT)); + DAG.getConstant(1, dl, HalfT)); swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? 
X86::RBX : X86::EBX, swapInL, cpInH.getValue(1)); @@ -19858,7 +17926,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MVT::i32, cpOutH.getValue(2)); SDValue Success = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); @@ -19908,7 +17976,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVector<SDValue, 8> Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, - ToVecInt, DAG.getIntPtrConstant(i))); + ToVecInt, DAG.getIntPtrConstant(i, dl))); Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); } @@ -19916,8 +17984,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: return nullptr; + switch ((X86ISD::NodeType)Opcode) { + case X86ISD::FIRST_NUMBER: break; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; @@ -19944,9 +18012,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; case X86ISD::CMPMU: return "X86ISD::CMPMU"; + case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; + case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -19955,16 +18025,21 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; + case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; + case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; + case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; + case X86ISD::ADDUS: return "X86ISD::ADDUS"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; @@ -19975,7 +18050,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SMAX: return "X86ISD::SMAX"; case X86ISD::SMIN: return "X86ISD::SMIN"; case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; @@ -20057,8 +18134,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) 
const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; + case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; + case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; @@ -20071,6 +18149,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; + case X86ISD::MFENCE: return "X86ISD::MFENCE"; + case X86ISD::SFENCE: return "X86ISD::SFENCE"; + case X86ISD::LFENCE: return "X86ISD::LFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; case X86ISD::SAHF: return "X86ISD::SAHF"; @@ -20082,13 +18163,31 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; + case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; + case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; + case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; + case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; + case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; + case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; + case X86ISD::RNDSCALE: return "X86ISD::RNDSCALE"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; case X86ISD::SELECT: return "X86ISD::SELECT"; + case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; + case X86ISD::RCP28: return "X86ISD::RCP28"; + case X86ISD::EXP2: return "X86ISD::EXP2"; + case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; + case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::ADDS: return "X86ISD::ADDS"; + case X86ISD::SUBS: return "X86ISD::SUBS"; } + return nullptr; } // isLegalAddressingMode - Return true if the addressing mode represented @@ -20236,6 +18335,8 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } + bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) @@ -20272,85 +18373,24 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, if (!VT.isSimple()) return false; - MVT SVT = VT.getSimpleVT(); + // Not for i1 vectors + if (VT.getScalarType() == MVT::i1) + return false; // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) return false; - // This is an experimental legality test that is tailored to match the - // legality test of the experimental lowering more closely. They are gated - // separately to ease testing of performance differences. - if (ExperimentalVectorShuffleLegality) - // We only care that the types being shuffled are legal. 
The lowering can - // handle any possible shuffle mask that results. - return isTypeLegal(SVT); - - // If this is a single-input shuffle with no 128 bit lane crossings we can - // lower it into pshufb. - if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || - (SVT.is256BitVector() && Subtarget->hasInt256())) { - bool isLegal = true; - for (unsigned I = 0, E = M.size(); I != E; ++I) { - if (M[I] >= (int)SVT.getVectorNumElements() || - ShuffleCrosses128bitLane(SVT, I, M[I])) { - isLegal = false; - break; - } - } - if (isLegal) - return true; - } - - // FIXME: blends, shifts. - return (SVT.getVectorNumElements() == 2 || - ShuffleVectorSDNode::isSplatMask(&M[0], VT) || - isMOVLMask(M, SVT) || - isCommutedMOVLMask(M, SVT) || - isMOVHLPSMask(M, SVT) || - isSHUFPMask(M, SVT) || - isSHUFPMask(M, SVT, /* Commuted */ true) || - isPSHUFDMask(M, SVT) || - isPSHUFDMask(M, SVT, /* SecondOperand */ true) || - isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || - isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) || - isPALIGNRMask(M, SVT, Subtarget) || - isUNPCKLMask(M, SVT, Subtarget->hasInt256()) || - isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || - isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) || - (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT))); + // We only care that the types being shuffled are legal. The lowering can + // handle any possible shuffle mask that results. + return isTypeLegal(VT.getSimpleVT()); } bool X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const { - if (!VT.isSimple()) - return false; - - MVT SVT = VT.getSimpleVT(); - - // This is an experimental legality test that is tailored to match the - // legality test of the experimental lowering more closely. They are gated - // separately to ease testing of performance differences. - if (ExperimentalVectorShuffleLegality) - // The new vector shuffle lowering is very good at managing zero-inputs. - return isShuffleMaskLegal(Mask, VT); - - unsigned NumElts = SVT.getVectorNumElements(); - // FIXME: This collection of masks seems suspect. - if (NumElts == 2) - return true; - if (NumElts == 4 && SVT.is128BitVector()) { - return (isMOVLMask(Mask, SVT) || - isCommutedMOVLMask(Mask, SVT, true) || - isSHUFPMask(Mask, SVT) || - isSHUFPMask(Mask, SVT, /* Commuted */ true) || - isBlendMask(Mask, SVT, Subtarget->hasSSE41(), - Subtarget->hasInt256())); - } - return false; + // Just delegate to the generic legality, clear masks aren't special. + return isShuffleMaskLegal(Mask, VT); } //===----------------------------------------------------------------------===// @@ -20488,11 +18528,10 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, return BB; } -static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, - const TargetInstrInfo *TII, - const X86Subtarget* Subtarget) { +static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { DebugLoc dl = MI->getDebugLoc(); - + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX; @@ -20514,9 +18553,8 @@ static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, } MachineBasicBlock * -X86TargetLowering::EmitVAARG64WithCustomInserter( - MachineInstr *MI, - MachineBasicBlock *MBB) const { +X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. // Operands to this pseudo-instruction: @@ -20528,7 +18566,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( // 9 ) EFLAGS (implicit-def) assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); - assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); + static_assert(X86::AddrNumOperands == 5, + "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI->getOperand(0).getReg(); MachineOperand &Base = MI->getOperand(1); @@ -20546,7 +18585,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information - const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); @@ -20802,7 +18841,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. - const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); @@ -20885,7 +18924,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -20904,6 +18943,92 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); + + // We also lower double CMOVs: + // (CMOV (CMOV F, T, cc1), T, cc2) + // to two successives branches. For that, we look for another CMOV as the + // following instruction. + // + // Without this, we would add a PHI between the two jumps, which ends up + // creating a few copies all around. For instance, for + // + // (sitofp (zext (fcmp une))) + // + // we would generate: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // movaps %xmm0, %xmm1 + // jne .LBB5_2 + // xorps %xmm1, %xmm1 + // .LBB5_2: + // jp .LBB5_4 + // movaps %xmm1, %xmm0 + // .LBB5_4: + // retq + // + // because this custom-inserter would have generated: + // + // A + // | \ + // | B + // | / + // C + // | \ + // | D + // | / + // E + // + // A: X = ...; Y = ... + // B: empty + // C: Z = PHI [X, A], [Y, B] + // D: empty + // E: PHI [X, C], [Z, D] + // + // If we lower both CMOVs in a single step, we can instead generate: + // + // A + // | \ + // | C + // | /| + // |/ | + // | | + // | D + // | / + // E + // + // A: X = ...; Y = ... 
+ // D: empty + // E: PHI [X, A], [X, C], [Y, D] + // + // Which, in our sitofp/fcmp example, gives us something like: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // jne .LBB5_4 + // jp .LBB5_4 + // xorps %xmm0, %xmm0 + // .LBB5_4: + // retq + // + MachineInstr *NextCMOV = nullptr; + MachineBasicBlock::iterator NextMIIt = + std::next(MachineBasicBlock::iterator(MI)); + if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) + NextCMOV = &*NextMIIt; + + MachineBasicBlock *jcc1MBB = nullptr; + + // If we have a double CMOV, we lower it to two successive branches to + // the same block. EFLAGS is used by both, so mark it as live in the second. + if (NextCMOV) { + jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, jcc1MBB); + jcc1MBB->addLiveIn(X86::EFLAGS); + } + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); @@ -20911,10 +19036,11 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo *TRI = - BB->getParent()->getSubtarget().getRegisterInfo(); - if (!MI->killsRegister(X86::EFLAGS) && - !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + + MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI; + if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); } @@ -20925,7 +19051,19 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. - BB->addSuccessor(copy0MBB); + if (NextCMOV) { + // The fallthrough block may be jcc1MBB, if we have a double CMOV. + BB->addSuccessor(jcc1MBB); + + // In that case, jcc1MBB will itself fallthrough the copy0MBB, and + // jump to the sinkMBB. + jcc1MBB->addSuccessor(copy0MBB); + jcc1MBB->addSuccessor(sinkMBB); + } else { + BB->addSuccessor(copy0MBB); + } + + // The true block target of the first (or only) branch is always sinkMBB. BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. @@ -20933,6 +19071,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + if (NextCMOV) { + unsigned Opc2 = X86::GetCondBranchFromCond( + (X86::CondCode)NextCMOV->getOperand(3).getImm()); + BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); + } + // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB @@ -20941,10 +19085,22 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... 
- BuildMI(*sinkMBB, sinkMBB->begin(), DL, - TII->get(X86::PHI), MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + MachineInstrBuilder MIB = + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + // If we have a double CMOV, the second Jcc provides the same incoming + // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). + if (NextCMOV) { + MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); + // Copy the PHI result to the register defined by the second CMOV. + BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), + DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()); + NextCMOV->eraseFromParent(); + } MI->eraseFromParent(); // The pseudo instruction is gone now. return sinkMBB; @@ -20954,7 +19110,7 @@ MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); @@ -21027,10 +19183,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. - const uint32_t *RegMask = MF->getTarget() - .getSubtargetImpl() - ->getRegisterInfo() - ->getCallPreservedMask(CallingConv::C); + const uint32_t *RegMask = + Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); @@ -21087,7 +19241,6 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(!Subtarget->isTargetMachO()); @@ -21106,8 +19259,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); - const X86InstrInfo *TII = - static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo()); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); @@ -21116,10 +19268,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. 
- const uint32_t *RegMask = F->getTarget() - .getSubtargetImpl() - ->getRegisterInfo() - ->getCallPreservedMask(CallingConv::C); + const uint32_t *RegMask = + Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -21164,7 +19314,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -21271,8 +19421,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - MF->getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -21290,8 +19439,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { - const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>(); - const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + const bool Uses64BitFramePtr = + Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); X86FI->setRestoreBasePointer(MF); unsigned FramePtr = RegInfo->getFrameRegister(*MF); @@ -21314,7 +19463,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference @@ -21329,8 +19478,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( - MF->getSubtarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? 
X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -21449,7 +19597,7 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, default: llvm_unreachable("Unrecognized FMA variant."); } - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) .addOperand(MI->getOperand(0)) @@ -21472,6 +19620,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: + case X86::TAILJMPd64_REX: + case X86::TAILJMPr64_REX: + case X86::TAILJMPm64_REX: llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: @@ -21502,6 +19653,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); case X86::FP32_TO_INT16_IN_MEM: @@ -21514,7 +19669,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { MachineFunction *F = BB->getParent(); - const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" @@ -21598,7 +19753,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); + return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -21611,16 +19766,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); + return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo()); // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(), - Subtarget); + return EmitMonitor(MI, BB, Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); + return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -21948,9 +20102,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // Note that even with AVX we prefer the PSHUFD form of shuffle for integer // vectors because it can have a load folded into it that UNPCK cannot. This // doesn't preclude something switching to the shorter encoding post-RA. - if (FloatDomain) { - if (Mask.equals(0, 0) || Mask.equals(1, 1)) { - bool Lo = Mask.equals(0, 0); + // + // FIXME: Should teach these routines about AVX vector widths. + if (FloatDomain && VT.getSizeInBits() == 128) { + if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { + bool Lo = Mask.equals({0, 0}); unsigned Shuffle; MVT ShuffleVT; // Check if we have SSE3 which will let us use MOVDDUP. 
That instruction @@ -21979,8 +20135,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, return true; } if (Subtarget->hasSSE3() && - (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) { - bool Lo = Mask.equals(0, 0, 2, 2); + (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) { + bool Lo = Mask.equals({0, 0, 2, 2}); unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -21993,8 +20149,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, /*AddTo*/ true); return true; } - if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) { - bool Lo = Mask.equals(0, 0, 1, 1); + if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) { + bool Lo = Mask.equals({0, 0, 1, 1}); unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; MVT ShuffleVT = MVT::v4f32; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -22012,12 +20168,12 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK // variants as none of these have single-instruction variants that are // superior to the UNPCK formulation. - if (!FloatDomain && - (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || - Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || - Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || - Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, - 15))) { + if (!FloatDomain && VT.getSizeInBits() == 128 && + (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || + Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals( + {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) { bool Lo = Mask[0] == 0; unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; if (Depth == 1 && Root->getOpcode() == Shuffle) @@ -22053,9 +20209,9 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // in practice PSHUFB tends to be *very* fast so we're more aggressive. if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { SmallVector<SDValue, 16> PSHUFBMask; - assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); - int Ratio = 16 / Mask.size(); - for (unsigned i = 0; i < 16; ++i) { + int NumBytes = VT.getSizeInBits() / 8; + int Ratio = NumBytes / Mask.size(); + for (int i = 0; i < NumBytes; ++i) { if (Mask[i / Ratio] == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; @@ -22063,14 +20219,15 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, int M = Mask[i / Ratio] != SM_SentinelZero ? 
Ratio * Mask[i / Ratio] + i % Ratio : 255; - PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); + PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } - Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); + MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); + Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input); DCI.AddToWorklist(Op.getNode()); SDValue PSHUFBMaskOp = - DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask); + DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); - Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp); + Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp); DCI.AddToWorklist(Op.getNode()); DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), /*AddTo*/ true); @@ -22128,10 +20285,6 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return false; // Bail if we hit a non-vector. - // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit - // version should be added. - if (VT.getSizeInBits() != 128) - return false; assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); @@ -22234,12 +20387,26 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { + MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary); + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); (void)HaveMask; assert(HaveMask); + // If we have more than 128-bits, only the low 128-bits of shuffle mask + // matter. Check that the upper masks are repeats and remove them. + if (VT.getSizeInBits() > 128) { + int LaneElts = 128 / VT.getScalarSizeInBits(); +#ifndef NDEBUG + for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) + for (int j = 0; j < LaneElts; ++j) + assert(Mask[j] == Mask[i * LaneElts + j] - LaneElts && + "Mask doesn't repeat in high 128-bit lanes!"); +#endif + Mask.resize(LaneElts); + } + switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; @@ -22312,7 +20479,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. - if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) + if (V.getSimpleValueType().getScalarType() != MVT::i8 && + V.getSimpleValueType().getScalarType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. @@ -22357,7 +20525,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), - getV4X86ShuffleImm8ForMask(Mask, DAG)); + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Rebuild the chain around this new shuffle. while (!Chain.empty()) { @@ -22444,7 +20612,7 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), - getV4X86ShuffleImm8ForMask(Mask, DAG)); + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Check that the shuffles didn't cancel each other out. 
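The PSHUFB combine above drops the hard-coded 16-byte assumption: the control vector is now built for NumBytes = VT.getSizeInBits() / 8, with each element index of the incoming mask expanded into Ratio consecutive byte indices. A standalone sketch of that expansion (expandToByteMask and the -1/-2 sentinels are illustrative stand-ins for SM_SentinelUndef/SM_SentinelZero, not code from the patch):

    #include <vector>

    // Illustrative sketch only: turn an element-level shuffle mask into the
    // byte-level control vector a PSHUFB-style byte shuffle consumes.
    std::vector<int> expandToByteMask(const std::vector<int> &Mask,
                                      int VectorBytes) {
      const int Undef = -1, Zero = -2;                          // sentinel lanes
      int Ratio = VectorBytes / static_cast<int>(Mask.size());  // bytes per element
      std::vector<int> ByteMask;
      for (int i = 0; i < VectorBytes; ++i) {
        int M = Mask[i / Ratio];
        if (M == Undef)
          ByteMask.push_back(Undef);                 // don't care which byte
        else if (M == Zero)
          ByteMask.push_back(255);                   // bit 7 set zeroes the byte
        else
          ByteMask.push_back(Ratio * M + i % Ratio); // byte i%Ratio of element M
      }
      return ByteMask;
    }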
If not, we need to // combine to the new one. @@ -22486,8 +20654,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: - assert(VT == MVT::v8i16); - (void)VT; + assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!"); if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. @@ -22495,17 +20662,18 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. - if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) { + if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; - V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); + MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + V = DAG.getNode(ISD::BITCAST, DL, DVT, V); DCI.AddToWorklist(V.getNode()); - V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, - getV4X86ShuffleImm8ForMask(DMask, DAG)); + V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, + getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); DCI.AddToWorklist(V.getNode()); - return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); + return DAG.getNode(ISD::BITCAST, DL, VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. @@ -22533,18 +20701,14 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; - const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3}; - const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7}; - if (std::equal(std::begin(MappedMask), std::end(MappedMask), - std::begin(UnpackLoMask)) || - std::equal(std::begin(MappedMask), std::end(MappedMask), - std::begin(UnpackHiMask))) { + if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || + makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. - V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0)); + V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, - DL, MVT::v8i16, V, V); + DL, VT, V, V); } } } @@ -22602,9 +20766,9 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { // We're looking for blends between FADD and FSUB nodes. We insist on these // nodes being lined up in a specific expected pattern. - if (!(isShuffleEquivalent(Mask, 0, 3) || - isShuffleEquivalent(Mask, 0, 5, 2, 7) || - isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15))) + if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || + isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || + isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) return SDValue(); // Only specific types are legal at this point, assert so we notice if and @@ -22692,10 +20856,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, } } - // Only handle 128 wide vector from here on. 
- if (!VT.is128BitVector()) - return SDValue(); - // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. @@ -22729,15 +20889,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformTruncateCombine - Converts truncate operation to -/// a sequence of vector shuffle operations. -/// It is possible when we truncate 256-bit vector to 128-bit vector -static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { - return SDValue(); -} - /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target /// specific shuffle of a load can be folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but @@ -22760,7 +20911,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); - if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + if (!BCVT.isVector() || + BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); } @@ -22788,7 +20940,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, : InVec.getOperand(1); // If inputs to shuffle are the same for both ops, then allow 2 uses - unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + unsigned AllowedUses = InVec.getNumOperands() > 1 && + InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. @@ -22833,6 +20986,25 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are +/// special and don't usually play with other vector types, it's better to +/// handle them early to be sure we emit efficient code by avoiding +/// store-load conversions. +static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) { + if (N->getValueType(0) != MVT::x86mmx || + N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR || + N->getOperand(0)->getValueType(0) != MVT::v2i32) + return SDValue(); + + SDValue V = N->getOperand(0); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1)); + if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32) + return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)), + N->getValueType(0), V.getOperand(0)); + + return SDValue(); +} + /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts /// into a somewhat faster sequence. For i686, the best sequence is apparently @@ -22845,16 +21017,43 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return NewOp; SDValue InputVector = N->getOperand(0); + SDLoc dl(InputVector); + // Detect mmx to i32 conversion through a v2i32 elt extract. + if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && + N->getValueType(0) == MVT::i32 && + InputVector.getValueType() == MVT::v2i32) { + + // The bitcast source is a direct mmx result. 
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0); + if (MMXSrc.getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), + InputVector.getNode()->getOperand(0)); + + // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). + SDValue MMXSrcOp = MMXSrc.getOperand(0); + if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && + MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() && + MMXSrcOp.getOpcode() == ISD::BITCAST && + MMXSrcOp.getValueType() == MVT::v1i64 && + MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), + MMXSrcOp.getOperand(0)); + } - // Detect whether we are trying to convert from mmx to i32 and the bitcast - // from mmx to v2i32 has a single usage. - if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && - InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && - InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) - return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), - N->getValueType(0), - InputVector.getNode()->getOperand(0)); + EVT VT = N->getValueType(0); + if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) && + InputVector.getOpcode() == ISD::BITCAST && + dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) { + uint64_t ExtractedElt = + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + uint64_t InputValue = + cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); + uint64_t Res = (InputValue >> ExtractedElt) & 1; + return DAG.getConstant(Res, dl, MVT::i1); + } // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. if (InputVector.getValueType() != MVT::v4i32) @@ -22900,17 +21099,16 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // otherwise bounce the vector off the cache. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vals[4]; - SDLoc dl(InputVector); if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, - DAG.getConstant(0, VecIdxTy)); + DAG.getConstant(0, dl, VecIdxTy)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, - DAG.getConstant(1, VecIdxTy)); + DAG.getConstant(1, dl, VecIdxTy)); - SDValue ShAmt = DAG.getConstant(32, + SDValue ShAmt = DAG.getConstant(32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, @@ -22930,7 +21128,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, // Replace each use (extract) with a load of the appropriate element. for (unsigned i = 0; i < 4; ++i) { uint64_t Offset = EltSize * i; - SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); + SDValue OffsetVal = DAG.getConstant(Offset, dl, TLI.getPointerTy()); SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), StackPtr, OffsetVal); @@ -23013,16 +21211,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, default: break; case ISD::SETULT: case ISD::SETULE: - Opc = hasUnsigned ? X86ISD::UMIN : 0; break; + Opc = hasUnsigned ? X86ISD::UMIN : 0u; break; case ISD::SETUGT: case ISD::SETUGE: - Opc = hasUnsigned ? 
X86ISD::UMAX : 0; break; + Opc = hasUnsigned ? X86ISD::UMAX : 0u; break; case ISD::SETLT: case ISD::SETLE: - Opc = hasSigned ? X86ISD::SMIN : 0; break; + Opc = hasSigned ? X86ISD::SMIN : 0u; break; case ISD::SETGT: case ISD::SETGE: - Opc = hasSigned ? X86ISD::SMAX : 0; break; + Opc = hasSigned ? X86ISD::SMAX : 0u; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && @@ -23031,16 +21229,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, default: break; case ISD::SETULT: case ISD::SETULE: - Opc = hasUnsigned ? X86ISD::UMAX : 0; break; + Opc = hasUnsigned ? X86ISD::UMAX : 0u; break; case ISD::SETUGT: case ISD::SETUGE: - Opc = hasUnsigned ? X86ISD::UMIN : 0; break; + Opc = hasUnsigned ? X86ISD::UMIN : 0u; break; case ISD::SETLT: case ISD::SETLE: - Opc = hasSigned ? X86ISD::SMAX : 0; break; + Opc = hasSigned ? X86ISD::SMAX : 0u; break; case ISD::SETGT: case ISD::SETGE: - Opc = hasSigned ? X86ISD::SMIN : 0; break; + Opc = hasSigned ? X86ISD::SMIN : 0u; break; } } @@ -23291,21 +21489,21 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TrueC->getAPIntValue().isPowerOf2()) { if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, Cond.getValueType())); + DAG.getConstant(1, DL, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, - DAG.getConstant(ShAmt, MVT::i8)); + DAG.getConstant(ShAmt, DL, MVT::i8)); } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, Cond.getValueType())); + DAG.getConstant(1, DL, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, @@ -23340,7 +21538,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, Cond.getValueType())); + DAG.getConstant(1, DL, Cond.getValueType())); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), @@ -23348,7 +21546,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, - DAG.getConstant(Diff, Cond.getValueType())); + DAG.getConstant(Diff, DL, + Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) @@ -23436,7 +21635,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, (-OpRHSConst->getAPIntValue() - 1)) return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, - DAG.getConstant(-OpRHSConst->getAPIntValue(), VT)); + DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT)); // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. @@ -23450,7 +21649,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // don't rely on particular values of undef lanes. 
return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, - DAG.getConstant(OpRHSConst->getAPIntValue(), VT)); + DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT)); } } } @@ -23528,21 +21727,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // If we know that this node is legal then we know that it is going to be - // matched by one of the SSE/AVX BLEND instructions. These instructions only - // depend on the highest bit in each word. Try to use SimplifyDemandedBits - // to simplify previous instructions. + // We should generate an X86ISD::BLENDI from a vselect if its argument + // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of + // constants. This specific pattern gets generated when we split a + // selector for a 512 bit vector in a machine without AVX512 (but with + // 256-bit vectors), during legalization: + // + // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) + // + // Iff we find this pattern and the build_vectors are built from + // constants, we translate the vselect into a shuffle_vector that we + // know will be matched by LowerVECTOR_SHUFFLEtoBlend. + if ((N->getOpcode() == ISD::VSELECT || + N->getOpcode() == X86ISD::SHRUNKBLEND) && + !DCI.isBeforeLegalize() && !VT.is512BitVector()) { + SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + + // If this is a *dynamic* select (non-constant condition) and we can match + // this node with one of the variable blend instructions, restructure the + // condition so that the blends can use the high bit of each element and use + // SimplifyDemandedBits to simplify the condition operand. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && - // We explicitly check against v8i16 and v16i16 because, although - // they're marked as Custom, they might only be legal when Cond is a - // build_vector of constants. This will be taken care in a later - // condition. - (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && - VT != MVT::v8i16) && - // Don't optimize vector of constants. Those are handled by - // the generic code and all the bits must be properly set for - // the generic optimizer. !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); @@ -23550,6 +21759,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (BitWidth == 1) return SDValue(); + // We can only handle the cases where VSELECT is directly legal on the + // subtarget. We custom lower VSELECT nodes with constant conditions and + // this makes it hard to see whether a dynamic VSELECT will correctly + // lower, so we both check the operation's status and explicitly handle the + // cases where a *dynamic* blend will fail even though a constant-condition + // blend could be custom lowered. + // FIXME: We should find a better way to handle this class of problems. + // Potentially, we should combine constant-condition vselect nodes + // pre-legalization into shuffles and not mark as many types as custom + // lowered. + if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + // FIXME: We don't support i16-element blends currently. We could and + // should support them by making *all* the bits in the condition be set + // rather than just the high bit and using an i8-element blend. 
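The dynamic-blend handling above only asks SimplifyDemandedBits for the high bit of each condition element because the variable blend instructions (BLENDVPS/BLENDVPD/PBLENDVB) read nothing but the sign bit of the mask. A scalar model of one byte lane, illustrative only:

#include <cassert>
#include <cstdint>

// PBLENDVB-style lane select: the byte from the second source is taken when
// the mask byte's sign bit is set; all lower mask bits are ignored.
static uint8_t blendByte(uint8_t Mask, uint8_t A, uint8_t B) {
  return (Mask & 0x80) ? B : A;
}

int main() {
  assert(blendByte(0x80, 1, 2) == 2);
  assert(blendByte(0xFF, 1, 2) == 2); // low bits do not matter
  assert(blendByte(0x7F, 1, 2) == 1); // sign bit clear selects the first source
  return 0;
}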
+ if (VT.getScalarType() == MVT::i16) + return SDValue(); + // Dynamic blending was only available from SSE4.1 onward. + if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41()) + return SDValue(); + // Byte blends are only available in AVX2 + if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 && + !Subtarget->hasAVX2()) + return SDValue(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -23598,25 +21832,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // We should generate an X86ISD::BLENDI from a vselect if its argument - // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of - // constants. This specific pattern gets generated when we split a - // selector for a 512 bit vector in a machine without AVX512 (but with - // 256-bit vectors), during legalization: - // - // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) - // - // Iff we find this pattern and the build_vectors are built from - // constants, we translate the vselect into a shuffle_vector that we - // know will be matched by LowerVECTOR_SHUFFLEtoBlend. - if ((N->getOpcode() == ISD::VSELECT || - N->getOpcode() == X86ISD::SHRUNKBLEND) && - !DCI.isBeforeLegalize()) { - SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); - if (Shuffle.getNode()) - return Shuffle; - } - return SDValue(); } @@ -23752,6 +21967,49 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { return SDValue(); } +/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. +/// Match: +/// (X86or (X86setcc) (X86setcc)) +/// (X86cmp (and (X86setcc) (X86setcc)), 0) +static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, + X86::CondCode &CC1, SDValue &Flags, + bool &isAnd) { + if (Cond->getOpcode() == X86ISD::CMP) { + ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); + if (!CondOp1C || !CondOp1C->isNullValue()) + return false; + + Cond = Cond->getOperand(0); + } + + isAnd = false; + + SDValue SetCC0, SetCC1; + switch (Cond->getOpcode()) { + default: return false; + case ISD::AND: + case X86ISD::AND: + isAnd = true; + // fallthru + case ISD::OR: + case X86ISD::OR: + SetCC0 = Cond->getOperand(0); + SetCC1 = Cond->getOperand(1); + break; + }; + + // Make sure we have SETCC nodes, using the same flags value. + if (SetCC0.getOpcode() != X86ISD::SETCC || + SetCC1.getOpcode() != X86ISD::SETCC || + SetCC0->getOperand(1) != SetCC1->getOperand(1)) + return false; + + CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); + CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); + Flags = SetCC0->getOperand(1); + return true; +} + /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -23785,7 +22043,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // Extra check as FCMOV only supports a subset of X86 cond. (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { SDValue Ops[] = { FalseOp, TrueOp, - DAG.getConstant(CC, MVT::i8), Flags }; + DAG.getConstant(CC, DL, MVT::i8), Flags }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); } @@ -23807,14 +22065,14 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // shift amount. 
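checkBoolTestAndOrSetCCCombine, added a little further down, looks for two X86ISD::SETCC nodes that read the same EFLAGS value and are combined with AND/OR. A hypothetical C++ source pattern that typically yields that shape (a single compare of a and b feeding two condition codes); whether this exact DAG appears depends on the optimizer and target:

// Hypothetical example, for illustration only.
int selectLessOrEqual(int a, int b, int x, int y) {
  // One "cmp a, b" can feed both setl (a < b) and sete (a == b); the OR of
  // those two i1 values driving the select is what the combine recognizes.
  return (a < b || a == b) ? x : y;
}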
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(CC, MVT::i8), Cond); + DAG.getConstant(CC, DL, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, - DAG.getConstant(ShAmt, MVT::i8)); + DAG.getConstant(ShAmt, DL, MVT::i8)); if (N->getNumValues() == 2) // Dead flag value? return DCI.CombineTo(N, Cond, SDValue()); return Cond; @@ -23824,7 +22082,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(CC, MVT::i8), Cond); + DAG.getConstant(CC, DL, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, @@ -23862,14 +22120,14 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, - DAG.getConstant(CC, MVT::i8), Cond); + DAG.getConstant(CC, DL, MVT::i8), Cond); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, - DAG.getConstant(Diff, Cond.getValueType())); + DAG.getConstant(Diff, DL, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) @@ -23915,12 +22173,50 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), - DAG.getConstant(CC, MVT::i8), Cond }; + DAG.getConstant(CC, DL, MVT::i8), Cond }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); } } } + // Fold and/or of setcc's to double CMOV: + // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) + // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) + // + // This combine lets us generate: + // cmovcc1 (jcc1 if we don't have CMOV) + // cmovcc2 (same) + // instead of: + // setcc1 + // setcc2 + // and/or + // cmovne (jne if we don't have CMOV) + // When we can't use the CMOV instruction, it might increase branch + // mispredicts. + // When we can use CMOV, or when there is no mispredict, this improves + // throughput and reduces register pressure. 
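The double-CMOV fold described in the comment above is only valid because of the following select identities, which can be checked exhaustively over the four boolean combinations:

#include <cassert>

// cmov(F, T, cc1 | cc2) == cmov(cmov(F, T, cc1), T, cc2)
// cmov(F, T, cc1 & cc2) == cmov(cmov(T, F, !cc1), F, !cc2)
static int cmov(int F, int T, bool CC) { return CC ? T : F; }

int main() {
  for (int C1 = 0; C1 < 2; ++C1)
    for (int C2 = 0; C2 < 2; ++C2) {
      assert(cmov(10, 20, C1 || C2) == cmov(cmov(10, 20, C1), 20, C2));
      assert(cmov(10, 20, C1 && C2) == cmov(cmov(20, 10, !C1), 10, !C2));
    }
  return 0;
}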
+ // + if (CC == X86::COND_NE) { + SDValue Flags; + X86::CondCode CC0, CC1; + bool isAndSetCC; + if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { + if (isAndSetCC) { + std::swap(FalseOp, TrueOp); + CC0 = X86::GetOppositeBranchCondition(CC0); + CC1 = X86::GetOppositeBranchCondition(CC1); + } + + SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), + Flags}; + SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); + SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; + SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); + return CMOV; + } + } + return SDValue(); } @@ -23931,24 +22227,16 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, default: return SDValue(); // SSE/AVX/AVX2 blend intrinsics. case Intrinsic::x86_avx2_pblendvb: - case Intrinsic::x86_avx2_pblendw: - case Intrinsic::x86_avx2_pblendd_128: - case Intrinsic::x86_avx2_pblendd_256: // Don't try to simplify this intrinsic if we don't have AVX2. if (!Subtarget->hasAVX2()) return SDValue(); // FALL-THROUGH - case Intrinsic::x86_avx_blend_pd_256: - case Intrinsic::x86_avx_blend_ps_256: case Intrinsic::x86_avx_blendv_pd_256: case Intrinsic::x86_avx_blendv_ps_256: // Don't try to simplify this intrinsic if we don't have AVX. if (!Subtarget->hasAVX()) return SDValue(); // FALL-THROUGH - case Intrinsic::x86_sse41_pblendw: - case Intrinsic::x86_sse41_blendpd: - case Intrinsic::x86_sse41_blendps: case Intrinsic::x86_sse41_blendvps: case Intrinsic::x86_sse41_blendvpd: case Intrinsic::x86_sse41_pblendvb: { @@ -24020,8 +22308,9 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, // Replace this packed shift intrinsic with a target independent // shift dag node. - SDValue Splat = DAG.getConstant(C, VT); - return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat); + SDLoc DL(N); + SDValue Splat = DAG.getConstant(C, DL, VT); + return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat); } } } @@ -24035,7 +22324,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = N->getValueType(0); - if (VT != MVT::i64) + if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); @@ -24071,17 +22360,17 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, SDValue NewMul; if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); + DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), - DAG.getConstant(MulAmt1, VT)); + DAG.getConstant(MulAmt1, DL, VT)); if (isPowerOf2_64(MulAmt2)) NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, - DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); + DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, - DAG.getConstant(MulAmt2, VT)); + DAG.getConstant(MulAmt2, DL, VT)); // Do not add new nodes to DAG combiner worklist. 
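PerformMulCombine, continued in this hunk, splits a multiply-by-immediate into two cheaper pieces: a power-of-two factor becomes a shift (ISD::SHL) and a factor of 3, 5 or 9 becomes an LEA-style X86ISD::MUL_IMM. For example x * 40 = (x * 5) << 3; a quick standalone check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t X = 0; X < 1000; ++X)
    assert(X * 40 == (X * 5) << 3); // 40 = 5 * 8, i.e. MUL_IMM 5 then SHL 3
  return 0;
}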
DCI.CombineTo(N, NewMul, false); @@ -24108,9 +22397,11 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); APInt ShAmt = N1C->getAPIntValue(); Mask = Mask.shl(ShAmt); - if (Mask != 0) - return DAG.getNode(ISD::AND, SDLoc(N), VT, - N00, DAG.getConstant(Mask, VT)); + if (Mask != 0) { + SDLoc DL(N); + return DAG.getNode(ISD::AND, DL, VT, + N00, DAG.getConstant(Mask, DL, VT)); + } } } @@ -24240,7 +22531,8 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; if (Subtarget->hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, - CMP01, DAG.getConstant(x86cc, MVT::i8)); + CMP01, + DAG.getConstant(x86cc, DL, MVT::i8)); if (N->getValueType(0) != MVT::i1) return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), FSetCC); @@ -24248,7 +22540,8 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, - DAG.getConstant(x86cc, MVT::i8)); + DAG.getConstant(x86cc, DL, + MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; @@ -24264,14 +22557,16 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Vector64); OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, - Vector32, DAG.getIntPtrConstant(0)); + Vector32, DAG.getIntPtrConstant(0, DL)); IntVT = MVT::i32; } - SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF); + SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, + OnesOrZeroesF); SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, - DAG.getConstant(1, IntVT)); - SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); + DAG.getConstant(1, DL, IntVT)); + SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, + ANDed); return OneBitOfTruth; } } @@ -24383,7 +22678,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, APInt Mask = APInt::getAllOnesValue(InBits); Mask = Mask.zext(VT.getScalarType().getSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, - Op, DAG.getConstant(Mask, VT)); + Op, DAG.getConstant(Mask, DL, VT)); } case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, @@ -24393,24 +22688,116 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, } } +static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // A vector zext_in_reg may be represented as a shuffle, + // feeding into a bitcast (this represents anyext) feeding into + // an and with a mask. + // We'd like to try to combine that into a shuffle with zero + // plus a bitcast, removing the and. + if (N0.getOpcode() != ISD::BITCAST || + N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + // The other side of the AND should be a splat of 2^C, where C + // is the number of bits in the source type. 
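PerformSHLCombine, touched above, folds the shift amount into the AND mask. That is only sound because the AND's other operand is known (in guarding context not shown in this hunk) to be 0 or all-ones, e.g. an X86ISD::SETCC_CARRY; under that assumption ((X & M) << C) equals (X & (M << C)). A standalone check of that restricted identity:

#include <cassert>
#include <cstdint>

int main() {
  // For X that is either 0 or all-ones, shifting the mask is enough.
  for (uint32_t X : {0u, 0xFFFFFFFFu})
    for (uint32_t M : {0x1u, 0x00FF00FFu, 0x80000000u})
      for (unsigned C = 0; C < 8; ++C)
        assert(((X & M) << C) == (X & (M << C)));
  return 0;
}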
+ if (N1.getOpcode() == ISD::BITCAST) + N1 = N1.getOperand(0); + if (N1.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1); + + ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0)); + EVT SrcType = Shuffle->getValueType(0); + + // We expect a single-source shuffle + if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF) + return SDValue(); + + unsigned SrcSize = SrcType.getScalarSizeInBits(); + + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!Vector->isConstantSplat(SplatValue, SplatUndef, + SplatBitSize, HasAnyUndefs)) + return SDValue(); + + unsigned ResSize = N1.getValueType().getScalarSizeInBits(); + // Make sure the splat matches the mask we expect + if (SplatBitSize > ResSize || + (SplatValue + 1).exactLogBase2() != (int)SrcSize) + return SDValue(); + + // Make sure the input and output size make sense + if (SrcSize >= ResSize || ResSize % SrcSize) + return SDValue(); + + // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...> + // The number of u's between each two values depends on the ratio between + // the source and dest type. + unsigned ZextRatio = ResSize / SrcSize; + bool IsZext = true; + for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) { + if (i % ZextRatio) { + if (Shuffle->getMaskElt(i) > 0) { + // Expected undef + IsZext = false; + break; + } + } else { + if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) { + // Expected element number + IsZext = false; + break; + } + } + } + + if (!IsZext) + return SDValue(); + + // Ok, perform the transformation - replace the shuffle with + // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero + // (instead of undef) where the k elements come from the zero vector. + SmallVector<int, 8> Mask; + unsigned NumElems = SrcType.getVectorNumElements(); + for (unsigned i = 0; i < NumElems; ++i) + if (i % ZextRatio) + Mask.push_back(NumElems); + else + Mask.push_back(i / ZextRatio); + + SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, + Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask); + return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); - if (R.getNode()) + if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget)) + return Zext; + + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + // Create BEXTR instructions // BEXTR is ((X >> imm) & (2**size-1)) if (VT == MVT::i32 || VT == MVT::i64) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - // Check for BEXTR. 
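The BEXTR pattern noted above is ((X >> imm) & (2**size - 1)), and the combine packs the shift amount and field width into the control immediate as Shift | (MaskSize << 8). A small software model of that encoding:

#include <cassert>
#include <cstdint>

// Software BEXTR: low control byte = start bit, next byte = field length.
static uint64_t bextr(uint64_t X, uint32_t Control) {
  uint32_t Start = Control & 0xFF;
  uint32_t Len = (Control >> 8) & 0xFF;
  if (Len >= 64)
    return X >> Start;
  return (X >> Start) & ((1ULL << Len) - 1);
}

int main() {
  uint64_t X = 0x123456789ABCDEF0ULL;
  // Extract 8 bits starting at bit 12: equivalent to (X >> 12) & 0xFF.
  assert(bextr(X, 12 | (8u << 8)) == ((X >> 12) & 0xFF));
  return 0;
}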
if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { @@ -24420,10 +22807,11 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, uint64_t Mask = MaskNode->getZExtValue(); uint64_t Shift = ShiftNode->getZExtValue(); if (isMask_64(Mask)) { - uint64_t MaskSize = CountPopulation_64(Mask); + uint64_t MaskSize = countPopulation(Mask); if (Shift + MaskSize <= VT.getSizeInBits()) return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), - DAG.getConstant(Shift | (MaskSize << 8), VT)); + DAG.getConstant(Shift | (MaskSize << 8), DL, + VT)); } } } // BEXTR @@ -24438,10 +22826,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (VT != MVT::v2i64 && VT != MVT::v4i64) return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - // Check LHS for vnot if (N0.getOpcode() == ISD::XOR && //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) @@ -24553,8 +22937,8 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + bool OptForSize = + MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent @@ -24642,10 +23026,10 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { // Generate SUB & CMOV. SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), - DAG.getConstant(0, VT), N0.getOperand(0)); + DAG.getConstant(0, DL, VT), N0.getOperand(0)); SDValue Ops[] = { N0.getOperand(0), Neg, - DAG.getConstant(X86::COND_GE, MVT::i8), + DAG.getConstant(X86::COND_GE, DL, MVT::i8), SDValue(Neg.getNode(), 1) }; return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); } @@ -24690,7 +23074,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); + SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy()); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); @@ -24725,7 +23109,6 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mld->getValueType(0); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); EVT LdVT = Mld->getMemoryVT(); SDLoc dl(Mld); @@ -24753,7 +23136,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. 
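performIntegerAbsCombine, visible in this hunk, matches the branchless absolute-value idiom built from a sign mask, an add and an xor, and re-emits it as NEG (X86ISD::SUB of 0 and x) plus a CMOV on COND_GE. A standalone check that the two forms agree (INT_MIN excluded, where the idiom itself overflows):

#include <cassert>
#include <cstdint>

static int32_t absXorForm(int32_t X) {
  int32_t S = X < 0 ? -1 : 0; // models (sra X, 31)
  return (X + S) ^ S;
}

static int32_t absCmovForm(int32_t X) {
  int32_t Neg = 0 - X;     // SUB producing flags
  return X >= 0 ? X : Neg; // CMOV with COND_GE
}

int main() {
  for (int32_t X : {0, 1, -1, 42, -42, 2147483647})
    assert(absXorForm(X) == absCmovForm(X));
  return 0;
}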
- assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal"); + assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) + && "WideVecVT should be legal"); WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); } @@ -24769,7 +23153,7 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) ShuffleVec[i] = NumElems*SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, - DAG.getConstant(0, WideVecVT), + DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); } else { @@ -24781,14 +23165,14 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, unsigned NumConcat = WidenNumElts / MaskNumElts; SmallVector<SDValue, 16> Ops(NumConcat); - SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); Ops[0] = Mask; for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = ZeroVal; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - + SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), Mld->getBasePtr(), NewMask, WideSrc0, Mld->getMemoryVT(), Mld->getMemOperand(), @@ -24805,7 +23189,6 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); @@ -24819,7 +23202,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, "Unexpected size for truncating masked store"); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. - assert (((NumElems * FromSz) % ToSz) == 0 && + assert (((NumElems * FromSz) % ToSz) == 0 && "Unexpected ratio for truncating masked store"); unsigned SizeRatio = FromSz / ToSz; @@ -24837,7 +23220,8 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. 
- assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal"); + assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) + && "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), @@ -24853,7 +23237,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) ShuffleVec[i] = NumElems*SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, - DAG.getConstant(0, WideVecVT), + DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); } else { @@ -24865,7 +23249,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned NumConcat = WidenNumElts / MaskNumElts; SmallVector<SDValue, 16> Ops(NumConcat); - SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); Ops[0] = Mask; for (unsigned i = 1; i != NumConcat; ++i) Ops[i] = ZeroVal; @@ -24899,7 +23283,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); - SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); + SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy()); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); @@ -24972,7 +23356,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; - SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl, TLI.getPointerTy()); SDValue Ptr = St->getBasePtr(); @@ -24980,7 +23364,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, - DAG.getIntPtrConstant(i)); + DAG.getIntPtrConstant(i, dl)); SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); @@ -25001,10 +23385,9 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); - bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps - && Subtarget->hasSSE2(); + bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); + bool F64IsLegal = + !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa<LoadSDNode>(St->getValue()) && @@ -25065,7 +23448,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Otherwise, lower to two pairs of 32-bit loads / stores. 
SDValue LoAddr = Ld->getBasePtr(); SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, - DAG.getConstant(4, MVT::i32)); + DAG.getConstant(4, LdDL, MVT::i32)); SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), @@ -25086,7 +23469,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, LoAddr = St->getBasePtr(); HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, - DAG.getConstant(4, MVT::i32)); + DAG.getConstant(4, StDL, MVT::i32)); SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(), @@ -25099,6 +23482,27 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, MinAlign(St->getAlignment(), 4)); return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } + + // This is similar to the above case, but here we handle a scalar 64-bit + // integer store that is extracted from a vector on a 32-bit target. + // If we have SSE2, then we can treat it like a floating-point double + // to get past legalization. The execution dependencies fixup pass will + // choose the optimal machine instruction for the store if this really is + // an integer or v2f32 rather than an f64. + if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() && + St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue OldExtract = St->getOperand(1); + SDValue ExtOp0 = OldExtract.getOperand(0); + unsigned VecSize = ExtOp0.getValueSizeInBits(); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); + SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtOp0); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + BitCast, OldExtract.getOperand(1)); + return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + } + return SDValue(); } @@ -25197,7 +23601,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - CommuteVectorShuffleMask(RMask, NumElts); + ShuffleVectorSDNode::commuteMask(RMask); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask @@ -25261,11 +23665,13 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); + // F[X]OR(0.0, x) -> x - // F[X]OR(x, 0.0) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); + + // F[X]OR(x, 0.0) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); @@ -25296,26 +23702,30 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { /// Do target-specific dag combines on X86ISD::FAND nodes. 
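The new store block added above handles a scalar i64 extracted from a vector on a 32-bit target by bitcasting the source vector to f64 lanes and storing the extracted double; that is legal because a bitcast leaves the stored bit pattern untouched. A quick standalone check using memcpy as the bitcast:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t I = 0x0123456789ABCDEFULL;
  double D;
  std::memcpy(&D, &I, sizeof(D)); // bitcast i64 -> f64

  uint8_t ViaInt[8], ViaFP[8];
  std::memcpy(ViaInt, &I, 8); // integer store
  std::memcpy(ViaFP, &D, 8);  // FP store of the bitcast value

  assert(std::memcmp(ViaInt, ViaFP, 8) == 0);
  return 0;
}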
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { // FAND(0.0, x) -> 0.0 - // FAND(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); + + // FAND(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); + return SDValue(); } /// Do target-specific dag combines on X86ISD::FANDN nodes static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { - // FANDN(x, 0.0) -> 0.0 // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); + + // FANDN(x, 0.0) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(1); + return SDValue(); } @@ -25391,23 +23801,76 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT InVT = N0->getValueType(0); + EVT InSVT = InVT.getScalarType(); + SDLoc DL(N); // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) // This exposes the sext to the sdivrem lowering, so that it directly extends // from AH (which we otherwise need to do contortions to access). if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && - N0.getValueType() == MVT::i8 && VT == MVT::i32) { - SDLoc dl(N); + InVT == MVT::i8 && VT == MVT::i32) { SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, N0.getOperand(0), N0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); return R.getValue(1); } - if (!DCI.isBeforeLegalizeOps()) + if (!DCI.isBeforeLegalizeOps()) { + if (N0.getValueType() == MVT::i1) { + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue AllOnes = + DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); + return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); + } return SDValue(); + } + + if (VT.isVector()) { + auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) { + EVT InVT = N->getValueType(0); + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), + 128 / InVT.getScalarSizeInBits()); + SmallVector<SDValue, 8> Opnds(128 / InVT.getSizeInBits(), + DAG.getUNDEF(InVT)); + Opnds[0] = N; + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + }; + + // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG + // which ensures lowering to X86ISD::VSEXT (pmovsx*). + if (VT.getSizeInBits() == 128 && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + SDValue ExOp = ExtendToVec128(DL, N0); + return DAG.getSignExtendVectorInReg(ExOp, DL, VT); + } + + // On pre-AVX2 targets, split into 128-bit nodes of + // ISD::SIGN_EXTEND_VECTOR_INREG. 
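PerformSExtCombine in this hunk now rewrites a post-legalization sign extension of an i1 as select(c, all-ones, 0); the two forms agree because sign-extending a set bit yields -1:

#include <cassert>
#include <cstdint>

// sext(i1 c) and select(c, -1, 0) produce the same value.
static int32_t sextI1(bool C) { return C ? -1 : 0; }

int main() {
  assert(sextI1(true) == -1);
  assert(static_cast<uint32_t>(sextI1(true)) == 0xFFFFFFFFu);
  assert(sextI1(false) == 0);
  return 0;
}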
+ if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned NumVecs = VT.getSizeInBits() / 128; + unsigned NumSubElts = 128 / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + + SmallVector<SDValue, 8> Opnds; + for (unsigned i = 0, Offset = 0; i != NumVecs; + ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); + SrcVec = ExtendToVec128(DL, SrcVec); + SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); + } + } if (!Subtarget->hasFp256()) return SDValue(); @@ -25483,7 +23946,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), - DAG.getConstant(1, VT)); + DAG.getConstant(1, dl, VT)); } } @@ -25495,7 +23958,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), - DAG.getConstant(1, VT)); + DAG.getConstant(1, dl, VT)); } } if (VT.is256BitVector()) { @@ -25534,18 +23997,18 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS, + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, LHS.getOperand(1)); - return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, - DAG.getConstant(0, addV.getValueType()), CC); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS, + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, RHS.getOperand(1)); - return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV, - DAG.getConstant(0, addV.getValueType()), CC); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); } if (VT.getScalarType() == MVT::i1 && @@ -25569,12 +24032,12 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); if (CC == ISD::SETGT) - return DAG.getConstant(0, VT); + return DAG.getConstant(0, DL, VT); if (CC == ISD::SETLE) - return DAG.getConstant(1, VT); + return DAG.getConstant(1, DL, VT); if (CC == ISD::SETEQ || CC == ISD::SETGE) return DAG.getNOT(DL, LHS.getOperand(0), VT); - + assert((CC == ISD::SETNE || CC == ISD::SETLT) && "Unexpected condition code!"); return LHS.getOperand(0); @@ -25584,6 +24047,24 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, + SelectionDAG &DAG) { + SDLoc dl(Load); + MVT VT = Load->getSimpleValueType(0); + MVT 
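The SETCC combine in this hunk folds a negated operand into an add for EQ/NE comparisons, using the fact that (0 - a) == b exactly when (a + b) == 0 under wrapping arithmetic. A standalone check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A : {0u, 1u, 5u, 0x80000000u, 0xFFFFFFFFu})
    for (uint32_t B : {0u, 1u, 5u, 0xFFFFFFFBu}) {
      bool SubForm = (0u - A) == B; // setcc((sub 0, a), b)
      bool AddForm = (A + B) == 0u; // setcc((add a, b), 0)
      assert(SubForm == AddForm);
    }
  return 0;
}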
EVT = VT.getVectorElementType(); + SDValue Addr = Load->getOperand(1); + SDValue NewAddr = DAG.getNode( + ISD::ADD, dl, Addr.getSimpleValueType(), Addr, + DAG.getConstant(Index * EVT.getStoreSize(), dl, + Addr.getSimpleValueType())); + + SDValue NewLoad = + DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Load->getMemOperand(), 0, EVT.getStoreSize())); + return NewLoad; +} + static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc dl(N); @@ -25595,20 +24076,47 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, if (MayFoldLoad(Ld)) { // Extract the countS bits from the immediate so we can get the proper // address when narrowing the vector load to a specific element. - // When the second source op is a memory address, interps doesn't use + // When the second source op is a memory address, insertps doesn't use // countS and just gets an f32 from that address. unsigned DestIndex = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6; + Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG); - } else - return SDValue(); - // Create this as a scalar to vector to match the instruction pattern. - SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); - // countS bits are ignored when loading from memory on insertps, which - // means we don't need to explicitly set them to 0. - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), - LoadScalarToVector, N->getOperand(2)); + // Create this as a scalar to vector to match the instruction pattern. + SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); + // countS bits are ignored when loading from memory on insertps, which + // means we don't need to explicitly set them to 0. + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), + LoadScalarToVector, N->getOperand(2)); + } + return SDValue(); +} + +static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector + // operands and changing the mask to 1. This saves us a bunch of + // pattern-matching possibilities related to scalar math ops in SSE/AVX. + // x86InstrInfo knows how to commute this back after instruction selection + // if it would help register allocation. + + // TODO: If optimizing for size or a processor that doesn't suffer from + // partial register update stalls, this should be transformed into a MOVSD + // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. + + if (VT == MVT::v2f64) + if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { + SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); + } + + return SDValue(); } // Helper function of PerformSETCCCombine. 
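PerformBLENDICombine canonicalizes a v2f64 blend whose immediate is 2 by swapping the sources and using immediate 1, which later allows commuting back to a MOVSD-friendly form. A scalar model of BLENDPD showing the two encodings produce the same vector:

#include <array>
#include <cassert>

// BLENDPD model: mask bit i selects lane i from the second source.
static std::array<double, 2> blendpd(std::array<double, 2> A,
                                     std::array<double, 2> B, unsigned Mask) {
  return {(Mask & 1) ? B[0] : A[0], (Mask & 2) ? B[1] : A[1]};
}

int main() {
  std::array<double, 2> V0{1.0, 2.0}, V1{3.0, 4.0};
  assert(blendpd(V0, V1, 2) == blendpd(V1, V0, 1)); // swapped sources, mask 2 -> 1
  return 0;
}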
It is to materialize "setb reg" @@ -25619,12 +24127,14 @@ static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG, if (VT == MVT::i8) return DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), - DAG.getConstant(1, VT)); + DAG.getConstant(X86::COND_B, DL, MVT::i8), + EFLAGS), + DAG.getConstant(1, DL, VT)); assert (VT == MVT::i1 && "Unexpected type for SECCC node"); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS)); + DAG.getConstant(X86::COND_B, DL, MVT::i8), + EFLAGS)); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT @@ -25663,7 +24173,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, Flags = checkBoolTestSetCCCombine(EFLAGS, CC); if (Flags.getNode()) { - SDValue Cond = DAG.getConstant(CC, MVT::i8); + SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); } @@ -25685,7 +24195,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, Flags = checkBoolTestSetCCCombine(EFLAGS, CC); if (Flags.getNode()) { - SDValue Cond = DAG.getConstant(CC, MVT::i8); + SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, Flags); } @@ -25740,7 +24250,7 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, } static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, - const X86TargetLowering *XTLI) { + const X86Subtarget *Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); @@ -25764,12 +24274,16 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, if (Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); EVT VT = Ld->getValueType(0); + + // This transformation is not supported if the result type is f16 + if (N->getValueType(0) == MVT::f16) + return SDValue(); + if (!Ld->isVolatile() && !N->getValueType(0).isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && - !XTLI->getSubtarget()->is64Bit() && - VT == MVT::i64) { - SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), - Ld->getChain(), Op0, DAG); + !Subtarget->is64Bit() && VT == MVT::i64) { + SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( + SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } @@ -25790,12 +24304,13 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); - SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); + SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B,MVT::i8), + DAG.getConstant(X86::COND_B, DL, + MVT::i8), N->getOperand(2)), - DAG.getConstant(1, VT)); + DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } @@ -25830,16 +24345,17 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { SDValue CmpOp0 = Cmp.getOperand(0); SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, - DAG.getConstant(1, CmpOp0.getValueType())); + DAG.getConstant(1, DL, 
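MaterializeSETB, shown above, builds "setb" from X86ISD::SETCC_CARRY, which behaves like an sbb of a register with itself (0 or -1 depending on the carry flag), followed by an AND with 1. A scalar model:

#include <cassert>
#include <cstdint>

static uint32_t setbViaSbb(bool Carry) {
  uint32_t Sbb = Carry ? 0xFFFFFFFFu : 0u; // sbb r, r materializes the carry
  return Sbb & 1u;                         // AND 1 recovers the 0/1 result
}

int main() {
  assert(setbViaSbb(true) == 1u);
  assert(setbViaSbb(false) == 0u);
  return 0;
}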
CmpOp0.getValueType())); SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); if (CC == X86::COND_NE) return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, DL, OtherVal.getValueType(), OtherVal, - DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); + DAG.getConstant(-1ULL, DL, OtherVal.getValueType()), + NewCmp); return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, DL, OtherVal.getValueType(), OtherVal, - DAG.getConstant(0, OtherVal.getValueType()), NewCmp); + DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); } /// PerformADDCombine - Do target-specific dag combines on integer adds. @@ -25875,9 +24391,9 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), - DAG.getConstant(~XorC, VT)); + DAG.getConstant(~XorC, SDLoc(Op1), VT)); return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, - DAG.getConstant(C->getAPIntValue()+1, VT)); + DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT)); } } @@ -25947,7 +24463,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), OrigVT.getVectorNumElements() / Ratio); OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0, DL)); } Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV); return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); @@ -25968,6 +24484,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT: case X86ISD::SHRUNKBLEND: return PerformSELECTCombine(N, DAG, DCI, Subtarget); + case ISD::BITCAST: return PerformBITCASTCombine(N, DAG); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); @@ -25983,7 +24500,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); - case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); + case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -25999,7 +24516,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); - case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); @@ -26022,9 +24538,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); - case X86ISD::INSERTPS: - return PerformINSERTPSCombine(N, DAG, Subtarget); - case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget); + case X86ISD::INSERTPS: { + if (getTargetMachine().getOptLevel() > CodeGenOpt::None) + return PerformINSERTPSCombine(N, DAG, Subtarget); + 
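PerformSubCombine, visible in this hunk, turns sub C, (xor X, M) into add (xor X, ~M), C + 1, which is correct under wrapping arithmetic because negation is bitwise-not plus one. Checked over a few values:

#include <cassert>
#include <cstdint>

int main() {
  // C - (X ^ M) == (X ^ ~M) + (C + 1)   (mod 2^32)
  for (uint32_t X : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
    for (uint32_t M : {0x80000000u, 0xFFFF0000u})
      for (uint32_t C : {0u, 7u, 0xFFFFFFFFu})
        assert(C - (X ^ M) == (X ^ ~M) + (C + 1));
  return 0;
}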
break; + } + case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG); } return SDValue(); @@ -26131,27 +24650,23 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support //===----------------------------------------------------------------------===// -namespace { - // Helper to match a string separated by whitespace. - bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { - s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. +// Helper to match a string separated by whitespace. +static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { + S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. - for (unsigned i = 0, e = args.size(); i != e; ++i) { - StringRef piece(*args[i]); - if (!s.startswith(piece)) // Check if the piece matches. - return false; - - s = s.substr(piece.size()); - StringRef::size_type pos = s.find_first_not_of(" \t"); - if (pos == 0) // We matched a prefix. - return false; + for (StringRef Piece : Pieces) { + if (!S.startswith(Piece)) // Check if the piece matches. + return false; - s = s.substr(pos); - } + S = S.substr(Piece.size()); + StringRef::size_type Pos = S.find_first_not_of(" \t"); + if (Pos == 0) // We matched a prefix. + return false; - return s.empty(); + S = S.substr(Pos); } - const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; + + return S.empty(); } static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { @@ -26191,12 +24706,12 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 - if (matchAsm(AsmPieces[0], "bswap", "$0") || - matchAsm(AsmPieces[0], "bswapl", "$0") || - matchAsm(AsmPieces[0], "bswapq", "$0") || - matchAsm(AsmPieces[0], "bswap", "${0:q}") || - matchAsm(AsmPieces[0], "bswapl", "${0:q}") || - matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { + if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || + matchAsm(AsmPieces[0], {"bswapl", "$0"}) || + matchAsm(AsmPieces[0], {"bswapq", "$0"}) || + matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. 
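The rewritten matchAsm helper above consumes whitespace-separated pieces in order and rejects a piece that only matches a prefix of a longer token. A plain std::string sketch of the same logic (the real code operates on StringRef, which tolerates npos substrings):

#include <cassert>
#include <string>
#include <vector>

static bool matchAsmSketch(std::string S,
                           const std::vector<std::string> &Pieces) {
  std::string::size_type Start = S.find_first_not_of(" \t");
  S = (Start == std::string::npos) ? std::string() : S.substr(Start);

  for (const std::string &Piece : Pieces) {
    if (S.compare(0, Piece.size(), Piece) != 0)
      return false;                  // piece does not match here
    S = S.substr(Piece.size());
    std::string::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0)
      return false;                  // we only matched a prefix of a token
    S = (Pos == std::string::npos) ? std::string() : S.substr(Pos);
  }
  return S.empty();
}

int main() {
  assert(matchAsmSketch("bswap $0", {"bswap", "$0"}));
  assert(!matchAsmSketch("bswapl $0", {"bswap", "$0"})); // prefix-only match
  return 0;
}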
return IntrinsicLowering::LowerToByteSwap(CI); @@ -26205,8 +24720,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && - (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || - matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { + (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || + matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -26218,9 +24733,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && - matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && - matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && - matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { + matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && + matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && + matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -26235,9 +24750,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 - if (matchAsm(AsmPieces[0], "bswap", "%eax") && - matchAsm(AsmPieces[1], "bswap", "%edx") && - matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) + if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && + matchAsm(AsmPieces[1], {"bswap", "%edx"}) && + matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) return IntrinsicLowering::LowerToByteSwap(CI); } } @@ -26373,7 +24888,7 @@ TargetLowering::ConstraintWeight break; case 'G': case 'C': - if (dyn_cast<ConstantFP>(CallOperandVal)) { + if (isa<ConstantFP>(CallOperandVal)) { weight = CW_Constant; } break; @@ -26428,7 +24943,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'I': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 31) { - Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -26436,7 +24952,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'J': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 63) { - Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -26444,7 +24961,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'K': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (isInt<8>(C->getSExtValue())) { - Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -26453,7 +24971,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { - Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType()); + 
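The three-instruction sequence matched above (rorw $8, rorl $16, rorw $8) is exactly a 32-bit byte swap, which is why it can be replaced with llvm.bswap.i32. A standalone check of that equivalence:

#include <cassert>
#include <cstdint>

static uint32_t rorw8(uint32_t X) { // rotate the low 16 bits right by 8
  uint16_t Lo = uint16_t(X);
  Lo = uint16_t((Lo >> 8) | (Lo << 8));
  return (X & 0xFFFF0000u) | Lo;
}

static uint32_t rorl16(uint32_t X) { return (X >> 16) | (X << 16); }

static uint32_t bswap32(uint32_t X) {
  return (X >> 24) | ((X >> 8) & 0xFF00u) | ((X << 8) & 0xFF0000u) | (X << 24);
}

int main() {
  for (uint32_t X : {0x01020304u, 0xDEADBEEFu, 0u, 0xFFFFFFFFu})
    assert(rorw8(rorl16(rorw8(X))) == bswap32(X));
  return 0;
}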
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -26461,7 +24980,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'M': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 3) { - Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -26469,7 +24989,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'N': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 255) { - Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -26477,7 +24998,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, case 'O': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 127) { - Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -26488,7 +25010,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. - Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); + Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain @@ -26501,7 +25023,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { - Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); break; } } @@ -26513,7 +25036,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { // Widen to 64 bits here to get it sign extended. - Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); + Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); break; } @@ -26571,8 +25094,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -std::pair<unsigned, const TargetRegisterClass*> -X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, +std::pair<unsigned, const TargetRegisterClass *> +X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. @@ -26678,7 +25202,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair<unsigned, const TargetRegisterClass*> Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? 
if (!Res.second) { diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index ba077a8..b589ca4 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -26,41 +26,41 @@ namespace llvm { namespace X86ISD { // X86 Specific DAG Nodes - enum NodeType { + enum NodeType : unsigned { // Start the numbering where the builtin ops leave off. FIRST_NUMBER = ISD::BUILTIN_OP_END, - /// BSF - Bit scan forward. - /// BSR - Bit scan reverse. + /// Bit scan forward. BSF, + /// Bit scan reverse. BSR, - /// SHLD, SHRD - Double shift instructions. These correspond to + /// Double shift instructions. These correspond to /// X86::SHLDxx and X86::SHRDxx instructions. SHLD, SHRD, - /// FAND - Bitwise logical AND of floating point values. This corresponds + /// Bitwise logical AND of floating point values. This corresponds /// to X86::ANDPS or X86::ANDPD. FAND, - /// FOR - Bitwise logical OR of floating point values. This corresponds + /// Bitwise logical OR of floating point values. This corresponds /// to X86::ORPS or X86::ORPD. FOR, - /// FXOR - Bitwise logical XOR of floating point values. This corresponds + /// Bitwise logical XOR of floating point values. This corresponds /// to X86::XORPS or X86::XORPD. FXOR, - /// FANDN - Bitwise logical ANDNOT of floating point values. This + /// Bitwise logical ANDNOT of floating point values. This /// corresponds to X86::ANDNPS or X86::ANDNPD. FANDN, - /// FSRL - Bitwise logical right shift of floating point values. These + /// Bitwise logical right shift of floating point values. This /// corresponds to X86::PSRLDQ. FSRL, - /// CALL - These operations represent an abstract X86 call + /// These operations represent an abstract X86 call /// instruction, which includes a bunch of information. In particular the /// operands of these node are: /// @@ -79,8 +79,7 @@ namespace llvm { /// CALL, - /// RDTSC_DAG - This operation implements the lowering for - /// readcyclecounter + /// This operation implements the lowering for readcyclecounter RDTSC_DAG, /// X86 Read Time-Stamp Counter and Processor ID. @@ -131,178 +130,194 @@ namespace llvm { /// 1 is the number of bytes of stack to pop. RET_FLAG, - /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx. + /// Repeat fill, corresponds to X86::REP_STOSx. REP_STOS, - /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx. + /// Repeat move, corresponds to X86::REP_MOVSx. REP_MOVS, - /// GlobalBaseReg - On Darwin, this node represents the result of the popl + /// On Darwin, this node represents the result of the popl /// at function entry, used for PIC code. GlobalBaseReg, - /// Wrapper - A wrapper node for TargetConstantPool, + /// A wrapper node for TargetConstantPool, /// TargetExternalSymbol, and TargetGlobalAddress. Wrapper, - /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP + /// Special wrapper used under X86-64 PIC mode for RIP /// relative displacements. WrapperRIP, - /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector + /// Copies a 64-bit value from the low word of an XMM vector /// to an MMX vector. If you think this is too close to the previous /// mnemonic, so do I; blame Intel. MOVDQ2Q, - /// MMX_MOVD2W - Copies a 32-bit value from the low word of a MMX + /// Copies a 32-bit value from the low word of a MMX /// vector to a GPR. 
MMX_MOVD2W, - /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to + /// Copies a GPR into the low 32-bit word of a MMX vector + /// and zero out the high word. + MMX_MOVW2D, + + /// Extract an 8-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRB. PEXTRB, - /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to + /// Extract a 16-bit value from a vector and zero extend it to /// i32, corresponds to X86::PEXTRW. PEXTRW, - /// INSERTPS - Insert any element of a 4 x float vector into any element + /// Insert any element of a 4 x float vector into any element /// of a destination 4 x floatvector. INSERTPS, - /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector, + /// Insert the lower 8-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRB. PINSRB, - /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector, + /// Insert the lower 16-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRW. PINSRW, MMX_PINSRW, - /// PSHUFB - Shuffle 16 8-bit values within a vector. + /// Shuffle 16 8-bit values within a vector. PSHUFB, - /// ANDNP - Bitwise Logical AND NOT of Packed FP values. + /// Bitwise Logical AND NOT of Packed FP values. ANDNP, - /// PSIGN - Copy integer sign. + /// Copy integer sign. PSIGN, - /// BLENDI - Blend where the selector is an immediate. + /// Blend where the selector is an immediate. BLENDI, - /// SHRUNKBLEND - Blend where the condition has been shrunk. + /// Blend where the condition has been shrunk. /// This is used to emphasize that the condition mask is /// no more valid for generic VSELECT optimizations. SHRUNKBLEND, - /// ADDSUB - Combined add and sub on an FP vector. + /// Combined add and sub on an FP vector. ADDSUB, - - // SUBUS - Integer sub with unsigned saturation. + // FP vector ops with rounding mode. + FADD_RND, + FSUB_RND, + FMUL_RND, + FDIV_RND, + FMAX_RND, + FMIN_RND, + + // Integer add/sub with unsigned saturation. + ADDUS, SUBUS, + // Integer add/sub with signed saturation. + ADDS, + SUBS, - /// HADD - Integer horizontal add. + /// Integer horizontal add. HADD, - /// HSUB - Integer horizontal sub. + /// Integer horizontal sub. HSUB, - /// FHADD - Floating point horizontal add. + /// Floating point horizontal add. FHADD, - /// FHSUB - Floating point horizontal sub. + /// Floating point horizontal sub. FHSUB, - /// UMAX, UMIN - Unsigned integer max and min. + /// Unsigned integer max and min. UMAX, UMIN, - /// SMAX, SMIN - Signed integer max and min. + /// Signed integer max and min. SMAX, SMIN, - /// FMAX, FMIN - Floating point max and min. - /// + /// Floating point max and min. FMAX, FMIN, - /// FMAXC, FMINC - Commutative FMIN and FMAX. + /// Commutative FMIN and FMAX. FMAXC, FMINC, - /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal - /// approximation. Note that these typically require refinement + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement /// in order to obtain suitable precision. FRSQRT, FRCP, - // TLSADDR - Thread Local Storage. + // Thread Local Storage. TLSADDR, - // TLSBASEADDR - Thread Local Storage. A call to get the start address + // Thread Local Storage. A call to get the start address // of the TLS block for the current module. TLSBASEADDR, - // TLSCALL - Thread Local Storage. When calling to an OS provided + // Thread Local Storage. When calling to an OS provided // thunk at the address from an earlier relocation. 
TLSCALL, - // EH_RETURN - Exception Handling helpers. + // Exception Handling helpers. EH_RETURN, - // EH_SJLJ_SETJMP - SjLj exception handling setjmp. + // SjLj exception handling setjmp. EH_SJLJ_SETJMP, - // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. + // SjLj exception handling longjmp. EH_SJLJ_LONGJMP, - /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for + /// Tail call return. See X86TargetLowering::LowerCall for /// the list of operands. TC_RETURN, - // VZEXT_MOVL - Vector move to low scalar and zero higher vector elements. + // Vector move to low scalar and zero higher vector elements. VZEXT_MOVL, - // VZEXT - Vector integer zero-extend. + // Vector integer zero-extend. VZEXT, - // VSEXT - Vector integer signed-extend. + // Vector integer signed-extend. VSEXT, - // VTRUNC - Vector integer truncate. + // Vector integer truncate. VTRUNC, - // VTRUNC - Vector integer truncate with mask. + // Vector integer truncate with mask. VTRUNCM, - // VFPEXT - Vector FP extend. + // Vector FP extend. VFPEXT, - // VFPROUND - Vector FP round. + // Vector FP round. VFPROUND, - // VSHL, VSRL - 128-bit vector logical left / right shift + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, - // VSHL, VSRL, VSRA - Vector shift elements + // Vector shift elements VSHL, VSRL, VSRA, - // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate + // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, - // CMPP - Vector packed double/float comparison. + // Vector packed double/float comparison. CMPP, - // PCMP* - Vector integer comparisons. + // Vector integer comparisons. PCMPEQ, PCMPGT, - // PCMP*M - Vector integer comparisons, the result is in a mask vector. + // Vector integer comparisons, the result is in a mask vector. PCMPEQM, PCMPGTM, - /// CMPM, CMPMU - Vector comparison generating mask bits for fp and + /// Vector comparison generating mask bits for fp and /// integer signed and unsigned data types. CMPM, CMPMU, + // Vector comparison with rounding mode for FP values + CMPM_RND, - // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. + // Arithmetic operations with FLAGS results. ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, - BEXTR, // BEXTR - Bit field extract + BEXTR, // Bit field extract UMUL, // LOW, HI, FLAGS = umul LHS, RHS @@ -313,16 +328,16 @@ namespace llvm { UDIVREM8_ZEXT_HREG, SDIVREM8_SEXT_HREG, - // MUL_IMM - X86 specific multiply by immediate. + // X86-specific multiply by immediate. MUL_IMM, - // PTEST - Vector bitwise comparisons. + // Vector bitwise comparisons. PTEST, - // TESTP - Vector packed fp sign bitwise comparisons. + // Vector packed fp sign bitwise comparisons. TESTP, - // TESTM, TESTNM - Vector "test" in AVX-512, the result is in a mask vector. + // Vector "test" in AVX-512, the result is in a mask vector. 
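// These X86ISD opcodes are produced during custom lowering and DAG combines by
// handing them to SelectionDAG::getNode in place of a generic ISD opcode. A
// minimal sketch, assuming the usual lowering context where Op is the node
// being lowered:
//
//   SDLoc DL(Op);
//   SDValue Res = DAG.getNode(X86ISD::FAND, DL, Op.getValueType(),
//                             Op.getOperand(0), Op.getOperand(1));
//
// The new *_RND variants additionally carry an i32 operand holding the
// rounding-mode / SAE control (see the (i32 FROUND_NO_EXC) operand threaded
// through the VCMP patterns later in this patch).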
TESTM, TESTNM, @@ -359,9 +374,10 @@ namespace llvm { VPERMIV3, VPERMI, VPERM2X128, + // Broadcast scalar to vector VBROADCAST, - // masked broadcast - VBROADCASTM, + // Broadcast subvector to vector + SUBV_BROADCAST, // Insert/Extract vector element VINSERT, VEXTRACT, @@ -378,6 +394,14 @@ namespace llvm { FNMSUB, FMADDSUB, FMSUBADD, + // FMA with rounding mode + FMADD_RND, + FNMADD_RND, + FMSUB_RND, + FNMSUB_RND, + FMADDSUB_RND, + FMSUBADD_RND, + RNDSCALE, // Compress and expand COMPRESS, @@ -547,9 +571,11 @@ namespace llvm { // X86 Implementation of the TargetLowering interface class X86TargetLowering final : public TargetLowering { public: - explicit X86TargetLowering(const X86TargetMachine &TM); + explicit X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI); unsigned getJumpTableEncoding() const override; + bool useSoftFloat() const override; MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; } @@ -679,13 +705,27 @@ namespace llvm { std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; + unsigned getInlineAsmMemConstraint( + const std::string &ConstraintCode) const override { + if (ConstraintCode == "i") + return InlineAsm::Constraint_i; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; + else if (ConstraintCode == "v") + return InlineAsm::Constraint_v; + else if (ConstraintCode == "X") + return InlineAsm::Constraint_X; + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + /// Given a physical register constraint /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. On /// error, this returns a register number of 0. - std::pair<unsigned, const TargetRegisterClass*> - getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const override; + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, + MVT VT) const override; /// Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. @@ -732,6 +772,10 @@ namespace llvm { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + /// Return true if folding a vector load into ExtVal (a sign, zero, or any + /// extend node) is profitable. + bool isVectorLoadExtDesirable(SDValue) const override; + /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this /// method returns true, otherwise fmuladd is expanded to fmul + fadd. @@ -775,10 +819,6 @@ namespace llvm { bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override; - const X86Subtarget* getSubtarget() const { - return Subtarget; - } - /// Return true if the specified scalar FP type is computed in an SSE /// register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { @@ -827,16 +867,14 @@ namespace llvm { bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; - /// \brief Reset the operation actions based on target options. - void resetOperationActions() override; - bool useLoadStackGuardNode() const override; /// \brief Customize the preferred legalization strategy for certain types. 
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; protected: - std::pair<const TargetRegisterClass*, uint8_t> - findRepresentativeClass(MVT VT) const override; + std::pair<const TargetRegisterClass *, uint8_t> + findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const override; private: /// Keep a pointer to the X86Subtarget around so that we can @@ -844,10 +882,6 @@ namespace llvm { const X86Subtarget *Subtarget; const DataLayout *TD; - /// Used to store the TargetOptions so that we don't waste time resetting - /// the operation actions unless we have to. - TargetOptions TO; - /// Select between SSE or x87 floating point ops. /// When SSE is available, use it for f32 operations. /// When SSE2 is available, use it for f64 operations. @@ -947,8 +981,9 @@ namespace llvm { SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFormalArguments(SDValue Chain, @@ -981,7 +1016,8 @@ namespace llvm { bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + TargetLoweringBase::AtomicRMWExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; @@ -1055,6 +1091,9 @@ namespace llvm { /// Use rcp* to speed up fdiv calculations. SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const override; + + /// Reassociate floating point divisions into multiply by reciprocal. + bool combineRepeatedFPDivisors(unsigned NumUsers) const override; }; namespace X86 { diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index c4ec3df..9d11d3c 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1,3 +1,18 @@ +//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 AVX512 instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + // Group template arguments that can be derived from the vector type (EltNum x // EltVT). These are things like the register class for the writemask, etc. // The idea is to pass one of these as the template argument rather than the @@ -59,17 +74,16 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, !if (!eq (Size, 128), "v2i64", !if (!eq (Size, 256), "v4i64", VTName)), VTName)); - PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); - // Load patterns used for memory operands. 
We only have this defined in - // case of i64 element types for sub-512 integer vectors. For now, keep - // MemOpFrag undefined in these cases. - PatFrag MemOpFrag = - !if (!eq (NumElts#EltTypeName, "1f32"), !cast<PatFrag>("memopfsf32"), - !if (!eq (NumElts#EltTypeName, "1f64"), !cast<PatFrag>("memopfsf64"), - !if (!eq (TypeVariantName, "f"), !cast<PatFrag>("memop" # VTName), - !if (!eq (EltTypeName, "i64"), !cast<PatFrag>("memop" # VTName), - !if (!eq (VTName, "v16i32"), !cast<PatFrag>("memop" # VTName), ?))))); + PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), + !if (!eq (EltSize, 64), "v8i64", "v16i32"), + VTName))), VTName)); + + PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); // The corresponding float type, e.g. v16f32 for v16i32 // Note: For EltSize < 32, FloatVT is illegal and TableGen @@ -96,10 +110,15 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, !if (!eq (EltTypeName, "f64"), SSEPackedDouble, SSEPackedInt)); + RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); + // A vector type of the same width with element type i32. This is used to // create the canonical constant zero node ImmAllZerosV. ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); + + string ZSuffix = !if (!eq (Size, 128), "Z128", + !if (!eq (Size, 256), "Z256", "Z")); } def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; @@ -161,21 +180,20 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, list<dag> Pattern, list<dag> MaskingPattern, list<dag> ZeroMaskingPattern, - string Round = "", string MaskingConstraint = "", InstrItinClass itin = NoItinerary, bit IsCommutable = 0> { let isCommutable = IsCommutable in def NAME: AVX512<O, F, Outs, Ins, - OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# - "$dst "#Round#", "#IntelSrcAsm#"}", + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst , "#IntelSrcAsm#"}", Pattern, itin>; // Prefer over VMOV*rrk Pat<> let AddedComplexity = 20 in def NAME#k: AVX512<O, F, Outs, MaskingIns, - OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}"#Round#"|"# - "$dst {${mask}}"#Round#", "#IntelSrcAsm#"}", + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", MaskingPattern, itin>, EVEX_K { // In case of the 3src subclass this is overridden with a let. 
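// Note: each use of AVX512_maskable_custom expands to three instruction
// records: the unmasked form, a merge-masking form tagged EVEX_K whose asm
// string appends "{${mask}}" to the destination (roughly, a packed FP add
// would print as "vaddps %zmm2, %zmm1, %zmm0 {%k1}"), and a zero-masking form
// tagged EVEX_KZ that appends "{${mask}} {z}" ("vaddps %zmm2, %zmm1, %zmm0
// {%k1} {z}"). The "$src0 = $dst" string passed as MaskingConstraint ties the
// pass-through operand to the destination, so the merge-masking variant leaves
// unselected elements unchanged.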
@@ -183,8 +201,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, } let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, - OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}"#Round#"|"# - "$dst {${mask}} {z}"#Round#", "#IntelSrcAsm#"}", + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, "#IntelSrcAsm#"}", ZeroMaskingPattern, itin>, EVEX_KZ; @@ -198,7 +216,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, - SDNode Select = vselect, string Round = "", + SDNode Select = vselect, string MaskingConstraint = "", InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : @@ -208,7 +226,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, [(set _.RC:$dst, MaskingRHS)], [(set _.RC:$dst, (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], - Round, MaskingConstraint, NoItinerary, IsCommutable>; + MaskingConstraint, NoItinerary, IsCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the @@ -216,7 +234,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, string Round = "", + dag RHS, InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : AVX512_maskable_common<O, F, _, Outs, Ins, @@ -224,14 +242,14 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect, - Round, "$src0 = $dst", itin, IsCommutable>; + "$src0 = $dst", itin, IsCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the scalar instruction. 
multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, string Round = "", + dag RHS, InstrItinClass itin = NoItinerary, bit IsCommutable = 0> : AVX512_maskable_common<O, F, _, Outs, Ins, @@ -239,7 +257,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select, - Round, "$src0 = $dst", itin, IsCommutable>; + "$src0 = $dst", itin, IsCommutable>; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -265,9 +283,65 @@ multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, AVX512_maskable_custom<O, F, Outs, Ins, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), !con((ins _.KRCWM:$mask), Ins), - OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "", + OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "$src0 = $dst">; + +// Instruction with mask that puts result in mask register, +// like "compare" and "vptest" +multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + string Round = "", + InstrItinClass itin = NoItinerary> { + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# + "$dst "#Round#", "#IntelSrcAsm#"}", + Pattern, itin>; + + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#Round#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#Round#"}", + MaskingPattern, itin>, EVEX_K; +} + +multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + string Round = "", + InstrItinClass itin = NoItinerary> : + AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.KRC:$dst, RHS)], + [(set _.KRC:$dst, MaskingRHS)], + Round, NoItinerary>; + +multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary> : + AVX512_maskable_common_cmp<O, F, _, Outs, Ins, + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (and _.KRCWM:$mask, RHS), + Round, itin>; + +multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm> : + AVX512_maskable_custom_cmp<O, F, Outs, + Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [],[],"", NoItinerary>; + // Bitcasts between 512-bit vector types. 
Return the original type since // no instruction is needed for the conversion let Predicates = [HasAVX512] in { @@ -394,7 +468,7 @@ multiclass vinsert_for_size_no_alt<int Opcode, SDNodeXForm INSERT_get_vinsert_imm> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, From.RC:$src2, i8imm:$src3), + (ins VR512:$src1, From.RC:$src2, u8imm:$src3), "vinsert" # From.EltTypeName # "x" # From.NumElts # "\t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}", @@ -405,7 +479,7 @@ multiclass vinsert_for_size_no_alt<int Opcode, let mayLoad = 1 in def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, From.MemOp:$src2, i8imm:$src3), + (ins VR512:$src1, From.MemOp:$src2, u8imm:$src3), "vinsert" # From.EltTypeName # "x" # From.NumElts # "\t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}", @@ -467,12 +541,12 @@ defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; // vinsertps - insert f32 to XMM def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2, i8imm:$src3), + (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, f32mem:$src2, i8imm:$src3), + (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), @@ -489,7 +563,7 @@ multiclass vextract_for_size<int Opcode, SDNodeXForm EXTRACT_get_vextract_imm> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), - (ins VR512:$src1, i8imm:$idx), + (ins VR512:$src1, u8imm:$idx), "vextract" # To.EltTypeName # "x4", "$idx, $src1", "$src1, $idx", [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1), @@ -497,7 +571,7 @@ multiclass vextract_for_size<int Opcode, AVX512AIi8Base, EVEX, EVEX_V512; let mayStore = 1 in def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), - (ins To.MemOp:$dst, VR512:$src1, i8imm:$src2), + (ins To.MemOp:$dst, VR512:$src1, u8imm:$src2), "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|" "$dst, $src1, $src2}", []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>; @@ -596,13 +670,13 @@ def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), - (ins VR128X:$src1, i32i8imm:$src2), + (ins VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, EVEX; def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), - (ins f32mem:$dst, VR128X:$src1, i32i8imm:$src2), + (ins f32mem:$dst, VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; @@ -735,12 +809,8 @@ def : Pat <(v8i64 (X86vzext VK8WM:$mask)), def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), (VPBROADCASTDrZr GR32:$src)>; -def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))), - (VPBROADCASTDrZrkz VK16WM:$mask, GR32:$src)>; def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), (VPBROADCASTQrZr GR64:$src)>; -def : 
Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))), - (VPBROADCASTQrZrkz VK8WM:$mask, GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))), (VPBROADCASTDrZr GR32:$src)>; @@ -762,24 +832,33 @@ multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; - def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, + def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, + VR128X:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} |${dst} {${mask}}, $src}"), + []>, EVEX, EVEX_K; + def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, VR128X:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>, - EVEX, EVEX_KZ; + []>, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; - def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, + def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, + x86memop:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}}|${dst} {${mask}} , $src}"), + []>, EVEX, EVEX_K; + def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, - (ld_frag addr:$src))))]>, EVEX, EVEX_KZ; + [(set DstRC:$dst, (OpVT (vselect KRC:$mask, + (X86VBroadcast (ld_frag addr:$src)), + (OpVT (bitconvert (v16i32 immAllZerosV))))))]>, EVEX, EVEX_KZ; } } @@ -790,28 +869,71 @@ defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, - RegisterClass KRC> { +multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src), + def rm : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX; - def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, - x86memop:$src), + [(set _Dst.RC:$dst, + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))))]>, EVEX; + def rmk : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, + _Src.MemOp:$src), !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + []>, EVEX, EVEX_K; + def rmkz : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, + _Src.MemOp:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; } } -defm VBROADCASTI32X4 : avx512_int_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", - i128mem, loadv2i64, VK16WM>, +defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + v16i32_info, v4i32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT4>; -defm VBROADCASTI64X4 : avx512_int_subvec_broadcast_rm<0x5b, 
"vbroadcasti64x4", - i256mem, loadv4i64, VK16WM>, VEX_W, +defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + v16f32_info, v4f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", + v8i64_info, v4i64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; +defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", + v8f64_info, v4f64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; +let Predicates = [HasVLX] in { +defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + v8i32x_info, v4i32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + v8f32x_info, v4f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; +} +let Predicates = [HasVLX, HasDQI] in { +defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", + v4i64x_info, v2i64x_info>, VEX_W, + EVEX_V256, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", + v4f64x_info, v2f64x_info>, VEX_W, + EVEX_V256, EVEX_CD8<64, CD8VT2>; +} +let Predicates = [HasDQI] in { +defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", + v8i64_info, v2i64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8", + v16i32_info, v8i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; +defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", + v8f64_info, v2f64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", + v16f32_info, v8f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; +} + def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), (VPBROADCASTDZrr VR128X:$src)>; def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), @@ -819,13 +941,23 @@ def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>; + def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; +def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; +def : Pat<(v16i32 (X86VBroadcast (v8i32 VR256X:$src))), + (VPBROADCASTDZrr (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm))>; + def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; +def : Pat<(v8i64 (X86VBroadcast (v4i64 VR256X:$src))), + (VPBROADCASTQZrr (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm))>; def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), (VBROADCASTSSZr VR128X:$src)>; @@ -840,12 +972,6 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; -let Predicates = [HasAVX512] in { -def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), - (EXTRACT_SUBREG - (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), - addr:$src)), sub_ymm)>; -} 
//===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- @@ -882,18 +1008,18 @@ multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, i8imm:$src2), + (ins _.RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, EVEX; def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.MemOp:$src1, i8imm:$src2), + (ins _.MemOp:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, - (_.VT (OpNode (_.MemOpFrag addr:$src1), + (_.VT (OpNode (_.LdFrag addr:$src1), (i8 imm:$src2))))]>, EVEX, EVEX_CD8<_.EltSize, CD8VF>; } @@ -917,7 +1043,7 @@ multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT (Ctrl.MemOpFrag addr:$src2)))))]>, + (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>, EVEX_4V; } } @@ -957,15 +1083,15 @@ multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, EVEX_4V; } -defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem, +defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, loadv16i32, i512mem, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, +defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, loadv8i64, i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let ExeDomain = SSEPackedSingle in -defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem, +defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, loadv16f32, f512mem, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, +defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, loadv8f64, f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; // -- VPERM2I - 3 source operands form -- @@ -1040,16 +1166,16 @@ let Constraints = "$src1 = $dst" in { EVEX_4V, EVEX_KZ; } } -defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, +defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, loadv16i32, i512mem, X86VPermiv3, v16i32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, +defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, loadv8i64, i512mem, X86VPermiv3, v8i64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, +defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, loadv16f32, i512mem, X86VPermiv3, v16f32, VK16WM>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, +defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, loadv8f64, i512mem, X86VPermiv3, v8f64, VK8WM>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -1069,16 +1195,16 @@ multiclass avx512_perm_table_3src<bits<8> opc, string Suffix, RegisterClass RC, (MaskVT (COPY_TO_REGCLASS MRC:$mask, KRC)), VR512:$idx, VR512:$src2)>; } -defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, memopv16i32, i512mem, +defm VPERMT2D : avx512_perm_table_3src<0x7E, "d", VR512, loadv16i32, i512mem, X86VPermv3, v16i32, VK16WM, v16i1, GR16>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : 
avx512_perm_table_3src<0x7E, "q", VR512, memopv8i64, i512mem, +defm VPERMT2Q : avx512_perm_table_3src<0x7E, "q", VR512, loadv8i64, i512mem, X86VPermv3, v8i64, VK8WM, v8i1, GR8>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, memopv16f32, i512mem, +defm VPERMT2PS : avx512_perm_table_3src<0x7F, "ps", VR512, loadv16f32, i512mem, X86VPermv3, v16f32, VK16WM, v16i1, GR16>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem, +defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, loadv8f64, i512mem, X86VPermv3, v8f64, VK8WM, v8i1, GR8>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -1198,35 +1324,40 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), // avx512_cmp_scalar - AVX512 CMPSS and CMPSD multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - Operand CC, SDNode OpNode, ValueType VT, - PatFrag ld_frag, string asm, string asm_alt> { + SDNode OpNode, ValueType VT, + PatFrag ld_frag, string Suffix> { def rr : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + (outs VK1:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], IIC_SSE_ALU_F32S_RR>, EVEX_4V; def rm : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VK1:$dst, (OpNode (VT RC:$src1), (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; + (outs VK1:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), + !strconcat("vcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; + let mayLoad = 1 in def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), + !strconcat("vcmp", Suffix, + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; } } let Predicates = [HasAVX512] in { -defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, AVXCC, X86cmpms, f32, loadf32, - "vcmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vcmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - XS; -defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, AVXCC, X86cmpms, f64, loadf64, - "vcmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - "vcmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, - XD, VEX_W; +defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, X86cmpms, f32, loadf32, "ss">, + XS; +defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, X86cmpms, f64, loadf64, "sd">, + XD, VEX_W; } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -1361,7 +1492,7 @@ def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> { def rri : AVX512AIi8<opc, MRMSrcReg, - (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc), !strconcat("vpcmp${cc}", 
Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), @@ -1369,7 +1500,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, IIC_SSE_ALU_F32P_RR>, EVEX_4V; let mayLoad = 1 in def rmi : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), @@ -1378,7 +1509,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, IIC_SSE_ALU_F32P_RM>, EVEX_4V; def rrik : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, - AVXCC:$cc), + AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), @@ -1389,7 +1520,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, let mayLoad = 1 in def rmik : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, - AVXCC:$cc), + AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), @@ -1402,25 +1533,27 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8<opc, MRMSrcReg, - (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, i8imm:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", "$dst, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in def rmi_alt : AVX512AIi8<opc, MRMSrcMem, - (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i8imm:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", "$dst, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; def rrik_alt : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, - i8imm:$cc), + u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in def rmik_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, - i8imm:$cc), + u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2, $cc}"), @@ -1431,10 +1564,9 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, X86VectorVTInfo _> : avx512_icmp_cc<opc, Suffix, OpNode, _> { - let mayLoad = 1 in { def rmib : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, - AVXCC:$cc), + AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, "}"), @@ -1444,7 +1576,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; def rmibk : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, - _.ScalarMemOp:$src2, AVXCC:$cc), + _.ScalarMemOp:$src2, AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", 
_.BroadcastStr, "}"), @@ -1453,20 +1585,19 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, (X86VBroadcast (_.ScalarLdFrag addr:$src2)), imm:$cc)))], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; - } // Accept explicit immediate argument form instead of comparison code. - let isAsmParserOnly = 1, hasSideEffects = 0 in { + let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in { def rmib_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, - i8imm:$cc), + u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, - _.ScalarMemOp:$src2, i8imm:$cc), + _.ScalarMemOp:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), @@ -1519,46 +1650,97 @@ defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; -// avx512_cmp_packed - compare packed instructions -multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, - X86MemOperand x86memop, ValueType vt, - string suffix, Domain d> { - def rri : AVX512PIi8<0xC2, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (X86cmpm (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>; - def rrib: AVX512PIi8<0xC2, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", suffix, - "\t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), - [], d>, EVEX_B; - def rmi : AVX512PIi8<0xC2, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [(set KRC:$dst, - (X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>; +multiclass avx512_vcmp_common<X86VectorVTInfo _> { + defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>; + + let mayLoad = 1 in { + defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (X86cmpm (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)>; + + defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (X86cmpm (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + imm:$cc)>,EVEX_B; + } // Accept explicit immediate argument form instead of comparison code. 
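// The primary records above encode the comparison predicate in the mnemonic
// via the AVXCC/AVX512ICC operand (e.g. "vcmpltps %zmm1, %zmm0, %k1"), while
// the *_alt records that follow are assembler-only (isAsmParserOnly = 1) and
// exist so the explicit-immediate spelling is also accepted
// (e.g. "vcmpps $1, %zmm1, %zmm0, %k1"); both assemble to the same 0xC2
// encoding with imm8 = 1 for "lt".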
let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rri_alt : AVX512PIi8<0xC2, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - !strconcat("vcmp", suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; - def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - !strconcat("vcmp", suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">; + + let mayLoad = 1 in { + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">; + + defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B; + } + } +} + +multiclass avx512_vcmp_sae<X86VectorVTInfo _> { + // comparison code form (VCMP[EQ/LT/LE/...] + defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2,{sae}", + (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_B; + + let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $cc">, EVEX_B; + } +} + +multiclass avx512_vcmp<AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcmp_common<_.info512>, + avx512_vcmp_sae<_.info512>, EVEX_V512; + + } + let Predicates = [HasAVX512,HasVLX] in { + defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128; + defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256; } } -defm VCMPPSZ : avx512_cmp_packed<VK16, VR512, f512mem, v16f32, - "ps", SSEPackedSingle>, PS, EVEX_4V, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VCMPPDZ : avx512_cmp_packed<VK8, VR512, f512mem, v8f64, - "pd", SSEPackedDouble>, PD, EVEX_4V, VEX_W, EVEX_V512, - EVEX_CD8<64, CD8VF>; +defm VCMPPD : avx512_vcmp<avx512vl_f64_info>, + AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VCMPPS : avx512_vcmp<avx512vl_f32_info>, + AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), (COPY_TO_REGCLASS (VCMPPSZrri @@ -1576,30 +1758,7 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; -def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), i32immZExt5:$cc, (i16 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VCMPPSZrrib VR512:$src1, VR512:$src2, - (I8Imm imm:$cc)), GR16)>; - -def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), i32immZExt5:$cc, (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VCMPPDZrrib VR512:$src1, VR512:$src2, - (I8Imm imm:$cc)), GR8)>; - -def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), i32immZExt5:$cc, (i16 -1), - FROUND_CURRENT)), - (COPY_TO_REGCLASS (VCMPPSZrri VR512:$src1, VR512:$src2, - (I8Imm imm:$cc)), GR16)>; - 
-def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), i32immZExt5:$cc, (i8 -1), - FROUND_CURRENT)), - (COPY_TO_REGCLASS (VCMPPDZrri VR512:$src1, VR512:$src2, - (I8Imm imm:$cc)), GR8)>; - +//----------------------------------------------------------------- // Mask register copy, including // - copy between mask registers // - load/store mask registers @@ -1607,17 +1766,18 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), // multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, string OpcodeStr, RegisterClass KRC, - ValueType vvt, ValueType ivt, X86MemOperand x86memop> { + ValueType vvt, X86MemOperand x86memop> { let hasSideEffects = 0 in { def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; let mayLoad = 1 in def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>; + [(set KRC:$dst, (vvt (load addr:$src)))]>; let mayStore = 1 in def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store KRC:$src, addr:$dst)]>; } } @@ -1633,27 +1793,25 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, } let Predicates = [HasDQI] in - defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8, - i8mem>, + defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>, avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>, VEX, PD; let Predicates = [HasAVX512] in - defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16, - i16mem>, + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, VEX, PS; let Predicates = [HasBWI] in { - defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32, - i32mem>, VEX, PD, VEX_W; + defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>, + VEX, PD, VEX_W; defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, VEX, XD; } let Predicates = [HasBWI] in { - defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64, - i64mem>, VEX, PS, VEX_W; + defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>, + VEX, PS, VEX_W; defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, VEX, XD, VEX_W; } @@ -1684,24 +1842,36 @@ let Predicates = [HasBWI] in { let Predicates = [HasDQI] in { def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), (KMOVBmk addr:$dst, VK8:$src)>; + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (KMOVBkm addr:$src)>; } -let Predicates = [HasAVX512] in { - def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), - (KMOVWmk addr:$dst, VK16:$src)>; +let Predicates = [HasAVX512, NoDQI] in { def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; - def : Pat<(i1 (load addr:$src)), - (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>; def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>; } +let Predicates = [HasAVX512] in { + def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), + (KMOVWmk addr:$dst, VK16:$src)>; + def : Pat<(i1 (load addr:$src)), + (COPY_TO_REGCLASS (AND16ri (i16 (SUBREG_TO_REG (i32 0), + (MOV8rm 
addr:$src), sub_8bit)), + (i16 1)), VK1)>; + def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), + (KMOVWkm addr:$src)>; +} let Predicates = [HasBWI] in { def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst), (KMOVDmk addr:$dst, VK32:$src)>; + def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))), + (KMOVDkm addr:$src)>; } let Predicates = [HasBWI] in { def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst), (KMOVQmk addr:$dst, VK64:$src)>; + def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))), + (KMOVQkm addr:$src)>; } let Predicates = [HasAVX512] in { @@ -1723,6 +1893,8 @@ let Predicates = [HasAVX512] in { def : Pat<(i32 (zext VK1:$src)), (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; + def : Pat<(i32 (anyext VK1:$src)), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; def : Pat<(i8 (zext VK1:$src)), (EXTRACT_SUBREG (AND32ri (KMOVWrk @@ -1748,17 +1920,18 @@ let Predicates = [HasBWI] in { // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. -let Predicates = [HasAVX512] in { +let Predicates = [HasAVX512, NoDQI] in { // GR from/to 8-bit mask without native support def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), (COPY_TO_REGCLASS - (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), - VK8)>; + (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>; def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), sub_8bit)>; +} +let Predicates = [HasAVX512] in { def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>; def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), @@ -1815,21 +1988,24 @@ let Predicates = [HasBWI] in def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>; // KNL does not support KMOVB, 8-bit mask is promoted to 16-bit -let Predicates = [HasAVX512] in { +let Predicates = [HasAVX512, NoDQI] in { def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; - def : Pat<(not VK8:$src), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; } +def : Pat<(xor VK4:$src1, (v4i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src1, VK16)), VK4)>; +def : Pat<(xor VK2:$src1, (v2i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src1, VK16)), VK2)>; // Mask binary operation // - KAND, KANDN, KOR, KXNOR, KXOR multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, RegisterClass KRC, SDPatternOperator OpNode, - Predicate prd> { - let Predicates = [prd] in + Predicate prd, bit IsCommutable> { + let Predicates = [prd], isCommutable = IsCommutable in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -1837,40 +2013,25 @@ multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, } multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode> { + SDPatternOperator OpNode, bit IsCommutable> { defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, - HasDQI>, VEX_4V, VEX_L, PD; + HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - HasAVX512>, VEX_4V, VEX_L, PS; + HasAVX512, IsCommutable>, VEX_4V, VEX_L, PS; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, - HasBWI>, VEX_4V, VEX_L, VEX_W, PD; + HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, - HasBWI>, VEX_4V, 
VEX_L, VEX_W, PS; + HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; } def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; -let isCommutable = 1 in { - defm KAND : avx512_mask_binop_all<0x41, "kand", and>; - defm KOR : avx512_mask_binop_all<0x45, "kor", or>; - defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor>; - defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor>; -} -let isCommutable = 0 in - defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn>; - -def : Pat<(xor VK1:$src1, VK1:$src2), - (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; - -def : Pat<(or VK1:$src1, VK1:$src2), - (COPY_TO_REGCLASS (KORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; - -def : Pat<(and VK1:$src1, VK1:$src2), - (COPY_TO_REGCLASS (KANDWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; +defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>; +defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; +defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; +defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; +defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; multiclass avx512_mask_binop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -1887,13 +2048,28 @@ defm : avx512_mask_binop_int<"kor", "KOR">; defm : avx512_mask_binop_int<"kxnor", "KXNOR">; defm : avx512_mask_binop_int<"kxor", "KXOR">; -// With AVX-512, 8-bit mask is promoted to 16-bit mask. multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> { - let Predicates = [HasAVX512] in - def : Pat<(OpNode VK8:$src1, VK8:$src2), - (COPY_TO_REGCLASS - (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), - (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + // With AVX512F, 8-bit mask is promoted to 16-bit mask, + // for the DQI set, this type is legal and KxxxB instruction is used + let Predicates = [NoDQI] in + def : Pat<(OpNode VK8:$src1, VK8:$src2), + (COPY_TO_REGCLASS + (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + + // All types smaller than 8 bits require conversion anyway + def : Pat<(OpNode VK1:$src1, VK1:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + def : Pat<(OpNode VK2:$src1, VK2:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>; + def : Pat<(OpNode VK4:$src1, VK4:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>; } defm : avx512_binop_pat<and, KANDWrr>; @@ -1902,6 +2078,32 @@ defm : avx512_binop_pat<or, KORWrr>; defm : avx512_binop_pat<xnor, KXNORWrr>; defm : avx512_binop_pat<xor, KXORWrr>; +def : Pat<(xor (xor VK16:$src1, VK16:$src2), (v16i1 immAllOnesV)), + (KXNORWrr VK16:$src1, VK16:$src2)>; +def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), + (KXNORBrr VK8:$src1, VK8:$src2)>; +def : Pat<(xor (xor VK32:$src1, VK32:$src2), (v32i1 immAllOnesV)), + (KXNORDrr VK32:$src1, VK32:$src2)>; +def : Pat<(xor (xor VK64:$src1, VK64:$src2), (v64i1 immAllOnesV)), + (KXNORQrr VK64:$src1, VK64:$src2)>; + +let Predicates = [NoDQI] in +def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + +def : Pat<(xor 
(xor VK4:$src1, VK4:$src2), (v4i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>; + +def : Pat<(xor (xor VK2:$src1, VK2:$src2), (v2i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>; + +def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + // Mask unpacking multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, RegisterClass KRC> { @@ -1944,19 +2146,24 @@ multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, VEX, PS; + let Predicates = [HasDQI] in + defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>, + VEX, PD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>, + VEX, PD, VEX_W; + } } defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; -def : Pat<(X86cmp VK1:$src1, (i1 0)), - (KORTESTWrr (COPY_TO_REGCLASS VK1:$src1, VK16), - (COPY_TO_REGCLASS VK1:$src1, VK16))>; - // Mask shift multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode> { let Predicates = [HasAVX512] in - def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm), + def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm), !strconcat(OpcodeStr, "\t{$imm, $src, $dst|$dst, $src, $imm}"), [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; @@ -1965,7 +2172,17 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, SDNode OpNode> { defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, TAPD, VEX_W; + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>, + VEX, TAPD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>, + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, + VEX, TAPD; + } } defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; @@ -1982,6 +2199,8 @@ multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { multiclass avx512_mask_setop_w<PatFrag Val> { defm B : avx512_mask_setop<VK8, v8i1, Val>; defm W : avx512_mask_setop<VK16, v16i1, Val>; + defm D : avx512_mask_setop<VK32, v32i1, Val>; + defm Q : avx512_mask_setop<VK64, v64i1, Val>; } defm KSET0 : avx512_mask_setop_w<immAllZerosV>; @@ -1991,9 +2210,11 @@ defm KSET1 : avx512_mask_setop_w<immAllOnesV>; let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; + def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; + def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; - def : Pat<(i1 1), (COPY_TO_REGCLASS (KSET1W), VK1)>; - def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSET1W), VK1)>; + def : Pat<(i1 1), 
(COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; + def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; } def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))), (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>; @@ -2004,11 +2225,19 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; + +def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), + (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; + let Predicates = [HasVLX] in { def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), @@ -2016,181 +2245,201 @@ let Predicates = [HasVLX] in { } def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), - (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; + (v8i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), - (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; + (v8i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + +def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + //===----------------------------------------------------------------------===// // AVX-512 - Aligned and unaligned load and store // -multiclass avx512_load<bits<8> opc, string OpcodeStr, PatFrag ld_frag, - RegisterClass KRC, RegisterClass RC, - ValueType vt, ValueType zvt, X86MemOperand memop, - Domain d, bit IsReMaterializable = 1> { -let hasSideEffects = 0 in { - def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + +multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag ld_frag, PatFrag mload, + bit IsReMaterializable = 1> { + let hasSideEffects = 0 in { + def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], - d>, EVEX; - def rrkz : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), + _.ExeDomain>, EVEX; + def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", - "${dst} {${mask}} {z}, $src}"), [], d>, EVEX, EVEX_KZ; - } + "${dst} {${mask}} {z}, $src}"), [], _.ExeDomain>, + EVEX, EVEX_KZ; + let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, SchedRW = [WriteLoad] in - def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins memop:$src), + def rm : 
AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (vt (bitconvert (ld_frag addr:$src))))], - d>, EVEX; - - let AddedComplexity = 20 in { - let Constraints = "$src0 = $dst", hasSideEffects = 0 in { - let hasSideEffects = 0 in - def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, RC:$src1), - !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", - "${dst} {${mask}}, $src1}"), - [(set RC:$dst, (vt (vselect KRC:$mask, - (vt RC:$src1), - (vt RC:$src0))))], - d>, EVEX, EVEX_K; + [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))], + _.ExeDomain>, EVEX; + + let Constraints = "$src0 = $dst" in { + def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT _.RC:$src1), + (_.VT _.RC:$src0))))], _.ExeDomain>, + EVEX, EVEX_K; let mayLoad = 1, SchedRW = [WriteLoad] in - def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, memop:$src1), + def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1), !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", "${dst} {${mask}}, $src1}"), - [(set RC:$dst, (vt - (vselect KRC:$mask, - (vt (bitconvert (ld_frag addr:$src1))), - (vt RC:$src0))))], - d>, EVEX, EVEX_K; + [(set _.RC:$dst, (_.VT + (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src1))), + (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K; } let mayLoad = 1, SchedRW = [WriteLoad] in - def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, memop:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", - "${dst} {${mask}} {z}, $src}"), - [(set RC:$dst, (vt - (vselect KRC:$mask, - (vt (bitconvert (ld_frag addr:$src))), - (vt (bitconvert (zvt immAllZerosV))))))], - d>, EVEX, EVEX_KZ; + def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# + "${dst} {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))], + _.ExeDomain>, EVEX, EVEX_KZ; } + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))), + (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0, + _.KRCWM:$mask, addr:$ptr)>; } -multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, string ld_pat, - string elty, string elsz, string vsz512, - string vsz256, string vsz128, Domain d, - Predicate prd, bit IsReMaterializable = 1> { +multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { let Predicates = [prd] in - defm Z : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##"v"##vsz512##elty##elsz), - !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, - !cast<ValueType>("v"##vsz512##elty##elsz), v16i32, - !cast<X86MemOperand>(elty##"512mem"), d, - IsReMaterializable>, EVEX_V512; + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag, + masked_load_aligned512, IsReMaterializable>, EVEX_V512; let Predicates = [prd, HasVLX] in 
{ - defm Z256 : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), - "v"##vsz256##elty##elsz, "v4i64")), - !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, - !cast<ValueType>("v"##vsz256##elty##elsz), v8i32, - !cast<X86MemOperand>(elty##"256mem"), d, - IsReMaterializable>, EVEX_V256; - - defm Z128 : avx512_load<opc, OpcodeStr, - !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), - "v"##vsz128##elty##elsz, "v2i64")), - !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, - !cast<ValueType>("v"##vsz128##elty##elsz), v4i32, - !cast<X86MemOperand>(elty##"128mem"), d, - IsReMaterializable>, EVEX_V128; + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag, + masked_load_aligned256, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag, + masked_load_aligned128, IsReMaterializable>, EVEX_V128; } } +multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V512; -multiclass avx512_store<bits<8> opc, string OpcodeStr, PatFrag st_frag, - ValueType OpVT, RegisterClass KRC, RegisterClass RC, - X86MemOperand memop, Domain d> { - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], d>, - EVEX; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V128; + } +} + +multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag st_frag, PatFrag mstore> { + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { + def rr_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # "\t{$src, $dst|$dst, $src}", [], + _.ExeDomain>, EVEX; let Constraints = "$src1 = $dst" in - def rrk_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, - EVEX, EVEX_K; - def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [], d>, EVEX, EVEX_KZ; + def rrk_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.KRCWM:$mask, _.RC:$src2), + OpcodeStr # + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}", + [], _.ExeDomain>, EVEX, EVEX_K; + def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # + "\t{$src, ${dst} {${mask}} {z}|" # + "${dst} {${mask}} {z}, $src}", + [], _.ExeDomain>, EVEX, EVEX_KZ; } let mayStore = 1 in { - def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src), + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(st_frag (OpVT RC:$src), addr:$dst)], d>, EVEX; + [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX; def mrk : AVX512PI<opc, MRMDestMem, (outs), - (ins memop:$dst, KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - [], d>, EVEX, EVEX_K; 
+ (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K; } + + def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)), + (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr, + _.KRCWM:$mask, _.RC:$src)>; } -multiclass avx512_store_vl<bits<8> opc, string OpcodeStr, string st_pat, - string st_suff_512, string st_suff_256, - string st_suff_128, string elty, string elsz, - string vsz512, string vsz256, string vsz128, - Domain d, Predicate prd> { +multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_512), - !cast<ValueType>("v"##vsz512##elty##elsz), - !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, - !cast<X86MemOperand>(elty##"512mem"), d>, EVEX_V512; + defm Z : avx512_store<opc, OpcodeStr, _.info512, store, + masked_store_unaligned>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_256), - !cast<ValueType>("v"##vsz256##elty##elsz), - !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, - !cast<X86MemOperand>(elty##"256mem"), d>, EVEX_V256; - - defm Z128 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_128), - !cast<ValueType>("v"##vsz128##elty##elsz), - !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, - !cast<X86MemOperand>(elty##"128mem"), d>, EVEX_V128; + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store, + masked_store_unaligned>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store, + masked_store_unaligned>, EVEX_V128; } } -defm VMOVAPS : avx512_load_vl<0x28, "vmovaps", "alignedload", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, - avx512_store_vl<0x29, "vmovaps", "alignedstore", - "512", "256", "", "f", "32", "16", "8", "4", - SSEPackedSingle, HasAVX512>, - PS, EVEX_CD8<32, CD8VF>; +multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512, + masked_store_aligned512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256, + masked_store_aligned256>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore, + masked_store_aligned128>, EVEX_V128; + } +} -defm VMOVAPD : avx512_load_vl<0x28, "vmovapd", "alignedload", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512>, - avx512_store_vl<0x29, "vmovapd", "alignedstore", - "512", "256", "", "f", "64", "8", "4", "2", - SSEPackedDouble, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMOVUPS : avx512_load_vl<0x10, "vmovups", "load", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, - avx512_store_vl<0x11, "vmovups", "store", "", "", "", "f", "32", - "16", "8", "4", SSEPackedSingle, HasAVX512>, +defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + +defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512>, + avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>, PS, EVEX_CD8<32, CD8VF>; 
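// Editorial sketch, not part of the patch above: a simplified model of the
// refactoring it performs, where !cast<...>(string concatenation) lookups are
// replaced by one multiclass parameterized by a VTInfo-style record bundle.
// Every identifier below (SketchVTInfo, SketchInst, sketch_load, SVMOVAPS,
// SVMOVAPD) is invented for illustration; the real code uses X86VectorVTInfo
// and AVX512VLVectorVTInfo.
class SketchVTInfo<string suffix, int eltsize> {
  // Stands in for X86VectorVTInfo fields such as Suffix, EltSize, RC, MemOp.
  string Suffix = suffix;
  int EltSize = eltsize;
}
def sketch_f32_info : SketchVTInfo<"ps", 32>;
def sketch_f64_info : SketchVTInfo<"pd", 64>;

class SketchInst<string asm, int eltsize> {
  string AsmString = asm;
  int EltSize = eltsize;
}

// One multiclass body serves every element type; the two defms expand to
// records named SVMOVAPSrm and SVMOVAPDrm carrying the matching element size.
multiclass sketch_load<string mnemonic, SketchVTInfo info> {
  def rm : SketchInst<mnemonic # info.Suffix # "\t$dst, $src", info.EltSize>;
}
defm SVMOVAPS : sketch_load<"vmova", sketch_f32_info>;
defm SVMOVAPD : sketch_load<"vmova", sketch_f64_info>;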
-defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", "load", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512, 0>, - avx512_store_vl<0x11, "vmovupd", "store", "", "", "", "f", "64", - "8", "4", "2", SSEPackedDouble, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), @@ -2200,6 +2449,22 @@ def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; +def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), + (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), + (VMOVAPDZrm addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), + (VMOVAPSZrm addr:$ptr)>; + def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), GR16:$mask), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), @@ -2209,6 +2474,16 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; +def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src), + GR16:$mask), + (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), + GR8:$mask), + (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + +let Predicates = [HasAVX512, NoVLX] in { def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), @@ -2218,73 +2493,36 @@ def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), - (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; - -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), - (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), - (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, - (bc_v16f32 (v16i32 immAllZerosV)))), - (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), - (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, - (bc_v8f64 (v16i32 immAllZerosV)))), - (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), - (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; - def: Pat<(v8f32 (masked_load addr:$ptr, 
VK8WM:$mask, (v8f32 VR256:$src0))), (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; +} + +defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, PD, EVEX_CD8<32, CD8VF>; -defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", - "16", "8", "4", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", - "512", "256", "", "i", "32", "16", "8", "4", - SSEPackedInt, HasAVX512>, - PD, EVEX_CD8<32, CD8VF>; - -defm VMOVDQA64 : avx512_load_vl<0x6F, "vmovdqa64", "alignedload", "i", "64", - "8", "4", "2", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqa64", "alignedstore", - "512", "256", "", "i", "64", "8", "4", "2", - SSEPackedInt, HasAVX512>, - PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", "load", "i", "8", - "64", "32", "16", SSEPackedInt, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu8", "store", "", "", "", - "i", "8", "64", "32", "16", SSEPackedInt, +defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI>, XD, EVEX_CD8<8, CD8VF>; -defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", "load", "i", "16", - "32", "16", "8", SSEPackedInt, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu16", "store", "", "", "", - "i", "16", "32", "16", "8", SSEPackedInt, +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; -defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", "load", "i", "32", - "16", "8", "4", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqu32", "store", "", "", "", - "i", "32", "16", "8", "4", SSEPackedInt, +defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, XS, EVEX_CD8<32, CD8VF>; -defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", "load", "i", "64", - "8", "4", "2", SSEPackedInt, HasAVX512>, - avx512_store_vl<0x7F, "vmovdqu64", "store", "", "", "", - "i", "64", "8", "4", "2", SSEPackedInt, +defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, @@ -2322,37 +2560,8 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), - (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), - (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), - (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, - (bc_v8i64 (v16i32 immAllZerosV)))), - (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)), - 
(VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; - -def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), - (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; - -def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), - (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; - -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), - (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; - -// SKX replacement -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; - -// KNL replacement +// NoVLX patterns +let Predicates = [HasAVX512, NoVLX] in { def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), @@ -2361,7 +2570,7 @@ def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - +} // Move Int Doubleword to Packed Double Int // @@ -2816,7 +3025,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), - "", itins.rr, IsCommutable>, + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V; let mayLoad = 1 in @@ -2825,7 +3034,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), - "", itins.rm>, + itins.rm>, AVX512BIBase, EVEX_4V; } @@ -2841,7 +3050,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), - "", itins.rm>, + itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B; } @@ -2934,60 +3143,36 @@ multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, itins, HasBWI, IsCommutable>; } -multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT, - ValueType SrcVT, RegisterClass KRC, RegisterClass RC, - PatFrag memop_frag, X86MemOperand x86memop, - PatFrag scalar_mfrag, X86MemOperand x86scalar_mop, - string BrdcstStr, OpndItins itins, bit IsCommutable = 0> { - let isCommutable = IsCommutable in - { - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX_4V; - def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], itins.rr>, EVEX_4V, EVEX_K; - def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}} {z}" , - "|$dst {${mask}} {z}, $src1, $src2}"), - [], itins.rr>, EVEX_4V, EVEX_KZ; - } +multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst, bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2))), + itins.rr, IsCommutable>, + AVX512BIBase, EVEX_4V; let mayLoad = 1 in { - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - 
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX_4V; - def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], itins.rm>, EVEX_4V, EVEX_K; - def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), - [], itins.rm>, EVEX_4V, EVEX_KZ; - def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, - ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"), - [], itins.rm>, EVEX_4V, EVEX_B; - def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", - BrdcstStr, "}"), - [], itins.rm>, EVEX_4V, EVEX_B, EVEX_K; - def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}", - BrdcstStr, "}"), - [], itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ; + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), + (bitconvert (_Src.LdFrag addr:$src2)))), + itins.rm>, + AVX512BIBase, EVEX_4V; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), + OpcodeStr, + "${src2}"##_Dst.BroadcastStr##", $src1", + "$src1, ${src2}"##_Dst.BroadcastStr, + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Dst.VT (X86VBroadcast + (_Dst.ScalarLdFrag addr:$src2)))))), + itins.rm>, + AVX512BIBase, EVEX_4V, EVEX_B; } } @@ -2995,6 +3180,14 @@ defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, SSE_INTALU_ITINS_P, 1>; defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, SSE_INTALU_ITINS_P, 0>; +defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, + SSE_INTALU_ITINS_P, HasBWI, 0>; defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, @@ -3002,24 +3195,97 @@ defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; -defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, - EVEX_CD8<64, CD8VF>, VEX_W; + +multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, + SDNode OpNode, bit IsCommutable = 0> { -defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTMUL_ITINS_P, 1>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; + defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + 
v16i32_info, v8i64_info, IsCommutable>, + EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + v8i32x_info, v4i64x_info, IsCommutable>, + EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W; + defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + v4i32x_info, v2i64x_info, IsCommutable>, + EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; + } +} -def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))), - (VPMULUDQZrr VR512:$src1, VR512:$src2)>; +defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, + X86pmuldq, 1>,T8PD; +defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, + X86pmuludq, 1>; -def : Pat<(v8i64 (int_x86_avx512_mask_pmulu_dq_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), - (VPMULUDQZrr VR512:$src1, VR512:$src2)>; -def : Pat<(v8i64 (int_x86_avx512_mask_pmul_dq_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), - (VPMULDQZrr VR512:$src1, VR512:$src2)>; +multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { + let mayLoad = 1 in { + defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), + OpcodeStr, + "${src2}"##_Src.BroadcastStr##", $src1", + "$src1, ${src2}"##_Src.BroadcastStr, + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Src.VT (X86VBroadcast + (_Src.ScalarLdFrag addr:$src2))))))>, + EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>; + } +} + +multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, + EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; + let mayLoad = 1 in { + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), + (bitconvert (_Src.LdFrag addr:$src2))))>, + EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>; + } +} + +multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info, + v32i16_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info, + v32i16_info>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info, + v16i16x_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info, + v16i16x_info>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info, + v8i16x_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info, + v8i16x_info>, EVEX_V128; + } +} +multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info, + v64i8_info>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info, + v32i8x_info>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info, + v16i8x_info>, EVEX_V128; + } +} +let Predicates = [HasBWI] in { + defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD; + defm VPACKUSDW : 
avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD; + defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W; + defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; +} defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; @@ -3094,16 +3360,16 @@ multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt, d>, EVEX_4V; } -defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64, +defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, loadv8f64, VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64, +defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, loadv8f64, VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64, +defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, loadv8f64, VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64, +defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, loadv8f64, VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3123,16 +3389,16 @@ multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode, IIC_SSE_UNPCK>, EVEX_4V; } defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, - VR512, memopv16i32, i512mem>, EVEX_V512, + VR512, loadv16i32, i512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, - VR512, memopv8i64, i512mem>, EVEX_V512, + VR512, loadv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, - VR512, memopv16i32, i512mem>, EVEX_V512, + VR512, loadv16i32, i512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, - VR512, memopv8i64, i512mem>, EVEX_V512, + VR512, loadv8i64, i512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - PSHUFD @@ -3142,14 +3408,14 @@ multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), + (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>, EVEX; def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), + (ins x86memop:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, @@ -3157,7 +3423,7 @@ multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, (i8 imm:$src2))))]>, EVEX; } -defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, +defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, loadv16i32, i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; 
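// Editorial sketch, not part of the patch above: how the NAME paste operator
// used in avx512_binop_all and avx512_packs_all_* composes record names across
// the 512/256/128-bit variants. The identifiers SketchOp, sketch_rm,
// sketch_all_sizes and SVPACK are invented for illustration only.
class SketchOp<string asm> {
  string AsmString = asm;
}

multiclass sketch_rm<string mnemonic> {
  def rr : SketchOp<mnemonic # "\t$dst, $src1, $src2">;
  def rm : SketchOp<mnemonic # "\t$dst, $src1, [mem]">;
}

multiclass sketch_all_sizes<string mnemonic> {
  // NAME is replaced by the prefix of the instantiating defm ("SVPACK" below),
  // so these lines expand to SVPACKZrr/rm, SVPACKZ256rr/rm and SVPACKZ128rr/rm.
  defm NAME#Z    : sketch_rm<mnemonic>;
  defm NAME#Z256 : sketch_rm<mnemonic>;
  defm NAME#Z128 : sketch_rm<mnemonic>;
}

defm SVPACK : sketch_all_sizes<"vpackssdw">;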
//===----------------------------------------------------------------------===// @@ -3171,32 +3437,99 @@ defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, SSE_INTALU_ITINS_P, HasAVX512, 1>; defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, - SSE_INTALU_ITINS_P, HasAVX512, 1>; + SSE_INTALU_ITINS_P, HasAVX512, 0>; //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic //===----------------------------------------------------------------------===// +multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode, SDNode VecNode, OpndItins itins, + bit IsCommutable> { -multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - SizeItins itins> { - defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32X, - f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG, - EVEX_CD8<32, CD8VT1>; - defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64X, - f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG, - EVEX_CD8<64, CD8VT1>; -} + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT)), + itins.rr, IsCommutable>; -let isCommutable = 1 in { -defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>; -defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>; -defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>; -defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>; + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT)), + itins.rm, IsCommutable>; + let isCodeGenOnly = 1, isCommutable = IsCommutable, + Predicates = [HasAVX512] in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], + itins.rr>; + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))], itins.rr>; + } } -let isCommutable = 0 in { -defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>; -defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>; + +multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins itins, bit IsCommutable> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$rc)), itins.rr, IsCommutable>, + EVEX_B, EVEX_RC; } +multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins itins, bit IsCommutable> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, 
SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} + +multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} +defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>; +defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>; +defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>; +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>; +defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>; +defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>; multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, bit IsCommutable> { @@ -3219,7 +3552,26 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, }//let mayLoad = 1 } -multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86VectorVTInfo _, bit IsCommutable> { + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>, + EVEX_4V, EVEX_B, EVEX_RC; +} + + +multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86VectorVTInfo _, bit IsCommutable> { + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>, + EVEX_4V, EVEX_B; +} + +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, bit IsCommutable = 0> { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, IsCommutable>, EVEX_V512, PS, @@ -3245,67 +3597,121 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, } } -defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>; -defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>; -defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>; -defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>; -defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>; - -def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (bc_v16f32 (v16i32 
immAllZerosV)), - (i16 -1), FROUND_CURRENT)), - (VMAXPSZrr VR512:$src1, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_max_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)), - (i8 -1), FROUND_CURRENT)), - (VMAXPDZrr VR512:$src1, VR512:$src2)>; - -def : Pat<(v16f32 (int_x86_avx512_mask_min_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), - (i16 -1), FROUND_CURRENT)), - (VMINPSZrr VR512:$src1, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)), - (i8 -1), FROUND_CURRENT)), - (VMINPDZrr VR512:$src1, VR512:$src2)>; +multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { + defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { + defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, + avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, + avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, + avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, + avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>, + avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>, + avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>; +let Predicates = [HasDQI] in { + defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, 1>; + defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, 0>; + defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, 1>; + defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>; +} + //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// -multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt> { - def rr : AVX512PI<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], - SSEPackedInt>, EVEX_4V; - def rm : AVX512PI<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), - (bitconvert (memop_frag addr:$src2))))], SSEPackedInt>, EVEX_4V; +multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, + EVEX_4V; + let mayLoad = 1 in + defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), + (ins _.RC:$src1, 
_.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))>, + EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>; } -defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem, - memopv16i32, X86testm, v16i32>, T8PD, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, - memopv8i64, X86testm, v8i64>, T8PD, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; +multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; +} +multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; -let Predicates = [HasCDI] in { -defm VPTESTNMDZ : avx512_vptest<0x27, "vptestnmd", VK16, VR512, f512mem, - memopv16i32, X86testnm, v16i32>, T8XS, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPTESTNMQZ : avx512_vptest<0x27, "vptestnmq", VK8, VR512, f512mem, - memopv8i64, X86testnm, v8i64>, T8XS, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + } +} + +multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, + avx512vl_i32_info>; + defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, VEX_W; +} + +multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in { + defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>, + EVEX_V512, VEX_W; + defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>, + EVEX_V512; + } + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>, + EVEX_V256, VEX_W; + defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>, + EVEX_V128, VEX_W; + defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>, + EVEX_V256; + defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>, + EVEX_V128; + } } +multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr, + SDNode OpNode> : + avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>, + avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>; + +defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD; +defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS; + def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))), (COPY_TO_REGCLASS (VPTESTMDZrr VR512:$src1, VR512:$src2), GR16)>; @@ -3320,69 +3726,130 @@ def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1), multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm ri : 
AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), - (ins _.RC:$src1, i8imm:$src2), OpcodeStr, + (ins _.RC:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; + SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; + let mayLoad = 1 in defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), - (ins _.MemOp:$src1, i8imm:$src2), OpcodeStr, + (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode (_.MemOpFrag addr:$src1), (i8 imm:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i8 imm:$src2))), + SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; +} + +multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let mayLoad = 1 in + defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", + (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))), + SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V, EVEX_B; } multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { // src2 is always 128-bit defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, VR128X:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; + SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, i128mem:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, (bc_frag (memopv2i64 addr:$src2)))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, EVEX_4V; + (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), + SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, + EVEX_4V; } multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { - defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, _>, EVEX_V512; + ValueType SrcVT, PatFrag bc_frag, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info512>, EVEX_V512, + EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info256>, EVEX_V256, + EVEX_CD8<VTInfo.info256.EltSize, CD8VH>; + defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info128>, EVEX_V128, + EVEX_CD8<VTInfo.info128.EltSize, CD8VF>; + } } -multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, string OpcodeStr, - SDNode OpNode> { +multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw, + string OpcodeStr, SDNode OpNode> { defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32, - v16i32_info>, EVEX_CD8<32, CD8VQ>; + avx512vl_i32_info, HasAVX512>; defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64, - v8i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; + avx512vl_i64_info, HasAVX512>, VEX_W; + defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16, + 
avx512vl_i16_info, HasBWI>; +} + +multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, EVEX_V256; + defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, EVEX_V128; + } } -defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, - v16i32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, - v8i64_info>, EVEX_V512, - EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_shift_rmi_w<bits<8> opcw, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v32i16_info>, EVEX_V512; + let Predicates = [HasVLX, HasBWI] in { + defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v16i16x_info>, EVEX_V256; + defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v8i16x_info>, EVEX_V128; + } +} -defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, - v16i32_info>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, - v8i64_info>, EVEX_V512, - EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +} + +defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>, + avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>; + +defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, + avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>; + +defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, + avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>; -defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai, - v16i32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai, - v8i64_info>, EVEX_V512, - EVEX_CD8<64, CD8VF>, VEX_W; +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>; -defm VPSLL : avx512_shift_types<0xF2, 0xF3, "vpsll", X86vshl>; -defm VPSRA : avx512_shift_types<0xE2, 0xE2, "vpsra", X86vsra>; -defm VPSRL : avx512_shift_types<0xD2, 0xD3, "vpsrl", X86vsrl>; +defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; //===-------------------------------------------------------------------===// // Variable Bit Shifts @@ -3393,30 
+3860,72 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, (_.MemOpFrag addr:$src2))), - " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V; + (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))), + SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>; } +multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))), + SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; +} multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, AVX512VLVectorVTInfo _> { - defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + let Predicates = [HasAVX512] in + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + } } multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, - avx512vl_i32_info>, EVEX_CD8<32, CD8VQ>; + avx512vl_i32_info>; defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, - avx512vl_i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; + avx512vl_i64_info>, VEX_W; +} + +multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>, + EVEX_V512, VEX_W; + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>, + EVEX_V256, VEX_W; + defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>, + EVEX_V128, VEX_W; + } } -defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>; -defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>; -defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>; +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, + avx512_var_shift_w<0x12, "vpsllvw", shl>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, + avx512_var_shift_w<0x11, "vpsravw", sra>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, + avx512_var_shift_w<0x10, "vpsrlvw", srl>; +defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; +defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; //===----------------------------------------------------------------------===// // AVX-512 - MOVDDUP @@ -3433,7 +3942,7 @@ def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; } -defm VMOVDDUPZ : 
avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>, +defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))), (VMOVDDUPZrm addr:$src)>; @@ -3454,17 +3963,17 @@ multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, } defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v16f32, VR512, memopv16f32, f512mem>, EVEX_V512, + v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v16f32, VR512, memopv16f32, f512mem>, EVEX_V512, + v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, EVEX_CD8<32, CD8VF>; def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))), +def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))), (VMOVSHDUPZrm addr:$src)>; def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))), +def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))), (VMOVSLDUPZrm addr:$src)>; //===----------------------------------------------------------------------===// @@ -3516,28 +4025,51 @@ multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), - OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), - (OpNode _.RC:$src1, _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (OpNode _.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, AVX512FMA3Base, EVEX_B; } } // Constraints = "$src1 = $dst" +let Constraints = "$src1 = $dst" in { +// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. 
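(Editorial aside, not part of the patch: the new rb variants below add an explicit static rounding-mode operand, encoded via EVEX.b/EVEX_RC. From C or C++ these forms are normally reached through the *_round_* intrinsics; a minimal sketch, assuming an AVX-512F toolchain and compiling with -mavx512f:)

  #include <immintrin.h>
  // Fused multiply-add with an explicit static rounding mode; the rounding
  // argument must also suppress exceptions ({rz-sae} in assembly syntax).
  __m512 fmadd_rz(__m512 a, __m512 b, __m512 c) {
    return _mm512_fmadd_round_ps(a, b, c,
                                 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
  }
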
+multiclass avx512_fma3_round_rrb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, + SDPatternOperator OpNode> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; + } +} // Constraints = "$src1 = $dst" + +multiclass avx512_fma3_round_forms<bits<8> opc213, string OpcodeStr, + X86VectorVTInfo VTI, SDPatternOperator OpNode> { + defm v213r : avx512_fma3_round_rrb<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), + VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>; +} + multiclass avx512_fma3p_forms<bits<8> opc213, bits<8> opc231, string OpcodeStr, X86VectorVTInfo VTI, SDPatternOperator OpNode> { defm v213r : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>; - defm v231r : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix), VTI>, EVEX_CD8<VTI.EltSize, CD8VF>; } multiclass avx512_fma3p<bits<8> opc213, bits<8> opc231, string OpcodeStr, - SDPatternOperator OpNode> { + SDPatternOperator OpNode, + SDPatternOperator OpNodeRnd> { let ExeDomain = SSEPackedSingle in { defm NAME##PSZ : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v16f32_info, OpNode>, EVEX_V512; + v16f32_info, OpNode>, + avx512_fma3_round_forms<opc213, OpcodeStr, + v16f32_info, OpNodeRnd>, EVEX_V512; defm NAME##PSZ256 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, v8f32x_info, OpNode>, EVEX_V256; defm NAME##PSZ128 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, @@ -3545,20 +4077,24 @@ let ExeDomain = SSEPackedSingle in { } let ExeDomain = SSEPackedDouble in { defm NAME##PDZ : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v8f64_info, OpNode>, EVEX_V512, VEX_W; + v8f64_info, OpNode>, + avx512_fma3_round_forms<opc213, OpcodeStr, v8f64_info, + OpNodeRnd>, EVEX_V512, VEX_W; defm NAME##PDZ256 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v4f64x_info, OpNode>, EVEX_V256, VEX_W; + v4f64x_info, OpNode>, + EVEX_V256, VEX_W; defm NAME##PDZ128 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v2f64x_info, OpNode>, EVEX_V128, VEX_W; + v2f64x_info, OpNode>, + EVEX_V128, VEX_W; } } -defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd>; -defm VFMSUB : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub>; -defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub>; -defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd>; -defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd>; -defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub>; +defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd, X86FmaddRnd>; +defm VFMSUB : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; let Constraints = "$src1 = $dst" in { multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -3567,7 +4103,7 @@ multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode, def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src3, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"), - [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, 
(_.MemOpFrag addr:$src2), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3)))]>; def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src3, _.ScalarMemOp:$src2), @@ -3580,26 +4116,29 @@ multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode, } } // Constraints = "$src1 = $dst" - -multiclass avx512_fma3p_m132_f<bits<8> opc, - string OpcodeStr, - SDNode OpNode> { +multiclass avx512_fma3p_m132_f<bits<8> opc, string OpcodeStr, SDNode OpNode> { let ExeDomain = SSEPackedSingle in { defm NAME##PSZ : avx512_fma3p_m132<opc, OpcodeStr##ps, - OpNode,v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; + OpNode,v16f32_info>, EVEX_V512, + EVEX_CD8<32, CD8VF>; defm NAME##PSZ256 : avx512_fma3p_m132<opc, OpcodeStr##ps, - OpNode, v8f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VF>; + OpNode, v8f32x_info>, EVEX_V256, + EVEX_CD8<32, CD8VF>; defm NAME##PSZ128 : avx512_fma3p_m132<opc, OpcodeStr##ps, - OpNode, v4f32x_info>, EVEX_V128, EVEX_CD8<32, CD8VF>; + OpNode, v4f32x_info>, EVEX_V128, + EVEX_CD8<32, CD8VF>; } let ExeDomain = SSEPackedDouble in { defm NAME##PDZ : avx512_fma3p_m132<opc, OpcodeStr##pd, - OpNode, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VF>; + OpNode, v8f64_info>, EVEX_V512, + VEX_W, EVEX_CD8<32, CD8VF>; defm NAME##PDZ256 : avx512_fma3p_m132<opc, OpcodeStr##pd, - OpNode, v4f64x_info>, EVEX_V256, VEX_W, EVEX_CD8<32, CD8VF>; + OpNode, v4f64x_info>, EVEX_V256, + VEX_W, EVEX_CD8<32, CD8VF>; defm NAME##PDZ128 : avx512_fma3p_m132<opc, OpcodeStr##pd, - OpNode, v2f64x_info>, EVEX_V128, VEX_W, EVEX_CD8<32, CD8VF>; + OpNode, v2f64x_info>, EVEX_V128, + VEX_W, EVEX_CD8<32, CD8VF>; } } @@ -3610,7 +4149,6 @@ defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>; defm VFNMADD132 : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>; defm VFNMSUB132 : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>; - // Scalar FMA let Constraints = "$src1 = $dst" in { multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -3633,7 +4171,6 @@ multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpVT (OpNode RC:$src2, RC:$src1, (mem_frag addr:$src3))))]>; } - } // Constraints = "$src1 = $dst" defm VFMADDSSZ : avx512_fma3s_rm<0xA9, "vfmadd213ss", X86Fmadd, FR32X, @@ -3670,6 +4207,7 @@ let hasSideEffects = 0 in { EVEX_4V; } // hasSideEffects = 0 } + let Predicates = [HasAVX512] in { defm VCVTSI2SSZ : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}">, XS, VEX_LIG, EVEX_CD8<32, CD8VT1>; @@ -3951,12 +4489,12 @@ let hasSideEffects = 0 in { } defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround, - memopv8f64, f512mem, v8f32, v8f64, + loadv8f64, f512mem, v8f32, v8f64, SSEPackedSingle>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, - memopv4f64, f256mem, v8f64, v8f32, + loadv4f64, f256mem, v8f64, v8f32, SSEPackedDouble>, EVEX_V512, PS, EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), @@ -3975,27 +4513,27 @@ def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), //===----------------------------------------------------------------------===// defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp, - memopv8i64, i512mem, v16f32, v16i32, + loadv8i64, i512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp, - memopv4i64, i256mem, v8f64, v8i32, + 
loadv4i64, i256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, - memopv16f32, f512mem, v16i32, v16f32, + loadv16f32, f512mem, v16i32, v16f32, SSEPackedSingle>, EVEX_V512, XS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, - memopv8f64, f512mem, v8i32, v8f64, + loadv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint, - memopv16f32, f512mem, v16i32, v16f32, + loadv16f32, f512mem, v16i32, v16f32, SSEPackedSingle>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -4005,7 +4543,7 @@ def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src), (VCVTTPS2UDQZrr VR512:$src)>; defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint, - memopv8f64, f512mem, v8i32, v8f64, + loadv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PS, VEX_W, EVEX_CD8<64, CD8VF>; @@ -4015,12 +4553,12 @@ def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src), (VCVTTPD2UDQZrr VR512:$src)>; defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, - memopv4i64, f256mem, v8f64, v8i32, + loadv4i64, f256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, - memopv16i32, f512mem, v16f32, v16i32, + loadv16i32, f512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, XD, EVEX_CD8<32, CD8VF>; @@ -4075,10 +4613,10 @@ let hasSideEffects = 0 in { } defm VCVTPS2DQZ : avx512_vcvt_fp2int<0x5B, "vcvtps2dq", VR512, VR512, - memopv16f32, f512mem, SSEPackedSingle>, PD, + loadv16f32, f512mem, SSEPackedSingle>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VCVTPD2DQZ : avx512_vcvt_fp2int<0xE6, "vcvtpd2dq", VR512, VR256X, - memopv8f64, f512mem, SSEPackedDouble>, XD, VEX_W, + loadv8f64, f512mem, SSEPackedDouble>, XD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2dq_512 (v16f32 VR512:$src), @@ -4090,10 +4628,10 @@ def : Pat <(v8i32 (int_x86_avx512_mask_cvtpd2dq_512 (v8f64 VR512:$src), (VCVTPD2DQZrrb VR512:$src, imm:$rc)>; defm VCVTPS2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtps2udq", VR512, VR512, - memopv16f32, f512mem, SSEPackedSingle>, + loadv16f32, f512mem, SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; defm VCVTPD2UDQZ : avx512_vcvt_fp2int<0x79, "vcvtpd2udq", VR512, VR256X, - memopv8f64, f512mem, SSEPackedDouble>, VEX_W, + loadv8f64, f512mem, SSEPackedDouble>, VEX_W, PS, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat <(v16i32 (int_x86_avx512_mask_cvtps2udq_512 (v16f32 VR512:$src), @@ -4127,12 +4665,12 @@ multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC, multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC, X86MemOperand x86memop> { def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), - (ins srcRC:$src1, i32i8imm:$src2), + (ins srcRC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; let hasSideEffects = 0, mayStore = 1 in def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2), + (ins x86memop:$dst, srcRC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; } @@ -4299,9 +4837,9 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, 
(outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, - "$src2, $src1", "$src1, $src2", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; + (i32 FROUND_NO_EXC))>, EVEX_B; defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, @@ -4333,9 +4871,8 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, - "$src", "$src", - (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), - "{sae}">, EVEX_B; + "{sae}, $src", "$src, {sae}", + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B; defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", @@ -4523,107 +5060,6 @@ let Predicates = [HasAVX512] in { } -multiclass avx512_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - PatFrag mem_frag32, PatFrag mem_frag64, - Intrinsic V4F32Int, Intrinsic V2F64Int, - CD8VForm VForm> { -let ExeDomain = SSEPackedSingle in { - // Intrinsic operation, reg. - // Vector intrinsic operation, reg - def PSr : AVX512AIi8<opcps, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>; - - // Vector intrinsic operation, mem - def PSm : AVX512AIi8<opcps, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>, - EVEX_CD8<32, VForm>; -} // ExeDomain = SSEPackedSingle - -let ExeDomain = SSEPackedDouble in { - // Vector intrinsic operation, reg - def PDr : AVX512AIi8<opcpd, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>; - - // Vector intrinsic operation, mem - def PDm : AVX512AIi8<opcpd, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>, - EVEX_CD8<64, VForm>; -} // ExeDomain = SSEPackedDouble -} - -multiclass avx512_fp_binop_rm<bits<8> opcss, bits<8> opcsd, - string OpcodeStr, - Intrinsic F32Int, - Intrinsic F64Int> { -let ExeDomain = GenericDomain in { - // Operation, reg. - let hasSideEffects = 0 in - def SSr : AVX512AIi8<opcss, MRMSrcReg, - (outs FR32X:$dst), (ins FR32X:$src1, FR32X:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>; - - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in - def SSr_Int : AVX512AIi8<opcss, MRMSrcReg, - (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2, imm:$src3))]>; - - // Intrinsic operation, mem. - def SSm : AVX512AIi8<opcss, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F32Int VR128X:$src1, - sse_load_f32:$src2, imm:$src3))]>, - EVEX_CD8<32, CD8VT1>; - - // Operation, reg. 
- let hasSideEffects = 0 in - def SDr : AVX512AIi8<opcsd, MRMSrcReg, - (outs FR64X:$dst), (ins FR64X:$src1, FR64X:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, VEX_W; - - // Intrinsic operation, reg. - let isCodeGenOnly = 1 in - def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg, - (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2, imm:$src3))]>, - VEX_W; - - // Intrinsic operation, mem. - def SDm : AVX512AIi8<opcsd, MRMSrcMem, - (outs VR128X:$dst), (ins VR128X:$src1, sdmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2, imm:$src3))]>, - VEX_W, EVEX_CD8<64, CD8VT1>; -} // ExeDomain = GenericDomain -} - multiclass avx512_rndscale<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, PatFrag mem_frag, Domain d> { @@ -4631,23 +5067,22 @@ let ExeDomain = d in { // Intrinsic operation, reg. // Vector intrinsic operation, reg def r : AVX512AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX; // Vector intrinsic operation, mem def m : AVX512AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX; } // ExeDomain } - defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, - memopv16f32, SSEPackedSingle>, EVEX_V512, + loadv16f32, SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), @@ -4657,7 +5092,7 @@ def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, - memopv8f64, SSEPackedDouble>, EVEX_V512, + loadv8f64, SSEPackedDouble>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), @@ -4665,50 +5100,72 @@ def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), FROUND_CURRENT)), (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; -multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, - Operand x86memop, RegisterClass RC, Domain d> { -let ExeDomain = d in { - def r : AVX512AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX_4V; +multiclass +avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { - def m : AVX512AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX_4V; -} // ExeDomain + let ExeDomain = _.ExeDomain in { + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + 
"{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}", + (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; + + let mayLoad = 1 in + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScale (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + } + let Predicates = [HasAVX512] in { + def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>; + def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>; + def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>; + + def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x1))), _.FRC)>; + def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x3))), _.FRC)>; + def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0xc))), _.FRC)>; + } } -defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X, - SSEPackedSingle>, EVEX_CD8<32, CD8VT1>; - -defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X, - SSEPackedDouble>, EVEX_CD8<64, CD8VT1>; - -def : Pat<(ffloor FR32X:$src), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>; -def : Pat<(f64 (ffloor FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>; -def : Pat<(f32 (fnearbyint FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>; -def : Pat<(f64 (fnearbyint FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>; -def : Pat<(f32 (fceil FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>; -def : Pat<(f64 (fceil FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>; -def : Pat<(f32 (frint FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>; -def : Pat<(f64 (frint FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>; -def : Pat<(f32 (ftrunc FR32X:$src)), - (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>; -def : Pat<(f64 (ftrunc FR64X:$src)), - (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>; +defm VRNDSCALESS : 
avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; + +let Predicates = [HasAVX512] in { def : Pat<(v16f32 (ffloor VR512:$src)), (VRNDSCALEPSZr VR512:$src, (i32 0x1))>; def : Pat<(v16f32 (fnearbyint VR512:$src)), @@ -4730,7 +5187,7 @@ def : Pat<(v8f64 (frint VR512:$src)), (VRNDSCALEPDZr VR512:$src, (i32 0x4))>; def : Pat<(v8f64 (ftrunc VR512:$src)), (VRNDSCALEPDZr VR512:$src, (i32 0x3))>; - +} //------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- @@ -4812,151 +5269,224 @@ def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; -multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode, - PatFrag mem_frag, X86MemOperand x86memop, - ValueType OpVT, ValueType InVT> { +multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, + X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{ - def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), - (ins SrcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX; + defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>, + EVEX; - def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), - (ins KRC:$mask, SrcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), - []>, EVEX, EVEX_K; + let mayLoad = 1 in { + defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins x86memop:$src), OpcodeStr ,"$src", "$src", + (DestInfo.VT (LdFrag addr:$src))>, + EVEX; + } +} - def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), - (ins KRC:$mask, SrcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; +multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasBWI] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128; - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), - (ins x86memop:$src), - !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>, - EVEX; - - def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), - (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr,"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), - []>, - EVEX, EVEX_K; - - def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), - (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr,"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), - []>, - EVEX, EVEX_KZ; + defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info, + v16i8x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256; + } + let Predicates = [HasBWI] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v32i16_info, + v32i8x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_BD<bits<8> opc, string 
OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, + v16i8x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info, + v16i8x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v16i8x_info, i16mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v16i8x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, + v8i16x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, + v8i16x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info, + v16i16x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v8i16x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v8i16x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v8i16x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512; } } -defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, - EVEX_CD8<8, CD8VQ>; -defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, - EVEX_CD8<8, CD8VO>; -defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext, - memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, - EVEX_CD8<16, CD8VH>; -defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext, - memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, - EVEX_CD8<16, CD8VQ>; -defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext, - memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, - EVEX_CD8<32, CD8VH>; - -defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512, - EVEX_CD8<8, CD8VQ>; -defm 
VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512, - EVEX_CD8<8, CD8VO>; -defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext, - memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512, - EVEX_CD8<16, CD8VH>; -defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext, - memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512, - EVEX_CD8<16, CD8VQ>; -defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext, - memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512, - EVEX_CD8<32, CD8VH>; +multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { + + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v4i32x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v4i32x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v8i32x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; + } +} + +defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">; +defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">; +defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">; +defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">; +defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">; +defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">; + + +defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">; +defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">; +defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">; +defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">; +defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">; +defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">; //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations -multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand memop> { -let mayLoad = 1, - Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb), - (ins RC:$src1, KRC:$mask, memop:$src2), +multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag GatherNode> { + let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb), + (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2), !strconcat(OpcodeStr, "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; + [(set _.RC:$dst, _.KRCWM:$mask_wb, + (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask, + vectoraddr:$src2))]>, EVEX, EVEX_K, + EVEX_CD8<_.EltSize, CD8VT1>; } let ExeDomain = SSEPackedDouble in { -defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", v8f64_info, vy64xmem, + mgatherv8i32>, EVEX_V512, VEX_W; +defm VGATHERQPDZ : avx512_gather<0x93, 
"vgatherqpd", v8f64_info, vz64mem, + mgatherv8i64>, EVEX_V512, VEX_W; } let ExeDomain = SSEPackedSingle in { -defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", v16f32_info, vz32mem, + mgatherv16i32>, EVEX_V512; +defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", v8f32x_info, vz64mem, + mgatherv8i64>, EVEX_V512; } -defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", v8i64_info, vy64xmem, + mgatherv8i32>, EVEX_V512, VEX_W; +defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", v16i32_info, vz32mem, + mgatherv16i32>, EVEX_V512; -defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512, vz64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X, vz64mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", v8i64_info, vz64mem, + mgatherv8i64>, EVEX_V512, VEX_W; +defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", v8i32x_info, vz64mem, + mgatherv8i64>, EVEX_V512; + +multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag ScatterNode> { -multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand memop> { let mayStore = 1, Constraints = "$mask = $mask_wb" in - def mr : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb), - (ins memop:$dst, KRC:$mask, RC:$src2), + + def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb), + (ins memop:$dst, _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src), + _.KRCWM:$mask, vectoraddr:$dst))]>, + EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; } let ExeDomain = SSEPackedDouble in { -defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", v8f64_info, vy64xmem, + mscatterv8i32>, EVEX_V512, VEX_W; +defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", v8f64_info, vz64mem, + mscatterv8i64>, EVEX_V512, VEX_W; } let ExeDomain = SSEPackedSingle in { -defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", v16f32_info, vz32mem, + mscatterv16i32>, EVEX_V512; +defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", v8f32x_info, vz64mem, + mscatterv8i64>, EVEX_V512; } -defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPSCATTERDQZ : avx512_scatter<0xA0, 
"vpscatterdq", v8i64_info, vy64xmem, + mscatterv8i32>, EVEX_V512, VEX_W; +defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", v16i32_info, vz32mem, + mscatterv16i32>, EVEX_V512; -defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; +defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", v8i64_info, vz64mem, + mscatterv8i64>, EVEX_V512, VEX_W; +defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", v8i32x_info, vz64mem, + mscatterv8i64>, EVEX_V512; // prefetch multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, @@ -5021,14 +5551,14 @@ multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string OpcodeStr, PatFrag mem_frag, Domain d> { def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), + (ins RC:$src1, RC:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, @@ -5036,26 +5566,26 @@ multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, EVEX_4V, Sched<[WriteShuffle]>; } -defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32, +defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", loadv16f32, SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64, +defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", loadv8f64, SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>; def : Pat<(v16i32 (X86Shufp VR512:$src1, - (memopv16i32 addr:$src2), (i8 imm:$imm))), + (loadv16i32 addr:$src2), (i8 imm:$imm))), (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>; def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>; def : Pat<(v8i64 (X86Shufp VR512:$src1, - (memopv8i64 addr:$src2), (i8 imm:$imm))), + (loadv8i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; multiclass avx512_valign<X86VectorVTInfo _> { defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, i8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), "valign"##_.Suffix, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86VAlign _.RC:$src2, _.RC:$src1, @@ -5068,7 +5598,7 @@ multiclass avx512_valign<X86VectorVTInfo _> { let mayLoad = 1 in def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), !strconcat("valign"##_.Suffix, "\t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}"), @@ -5156,14 +5686,17 @@ multiclass avx512_conflict<bits<8> opc, string OpcodeStr, RegisterClass RC, RegisterClass KRC, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { + let hasSideEffects = 0 in { def rr : 
AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"), []>, EVEX; + let mayLoad = 1 in def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"), []>, EVEX; + let mayLoad = 1 in def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86scalar_mop:$src), !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, @@ -5174,11 +5707,13 @@ multiclass avx512_conflict<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; + let mayLoad = 1 in def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; + let mayLoad = 1 in def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, x86scalar_mop:$src), !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, @@ -5192,17 +5727,20 @@ multiclass avx512_conflict<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K; + let mayLoad = 1 in def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K; + let mayLoad = 1 in def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2), !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"), []>, EVEX, EVEX_K, EVEX_B; - } + } + } } let Predicates = [HasCDI] in { @@ -5249,11 +5787,11 @@ def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, (VPLZCNTQrrk VR512:$src1, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; -def : Pat<(v16i32 (ctlz (memopv16i32 addr:$src))), +def : Pat<(v16i32 (ctlz (loadv16i32 addr:$src))), (VPLZCNTDrm addr:$src)>; def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), (VPLZCNTDrr VR512:$src)>; -def : Pat<(v8i64 (ctlz (memopv8i64 addr:$src))), +def : Pat<(v8i64 (ctlz (loadv8i64 addr:$src))), (VPLZCNTQrm addr:$src)>; def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), (VPLZCNTQrr VR512:$src)>; @@ -5263,7 +5801,14 @@ def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; def : Pat<(store VK1:$src, addr:$dst), - (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>; + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(store VK8:$src, addr:$dst), + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), (truncstore node:$val, node:$ptr), [{ @@ -5274,7 +5819,7 @@ def : Pat<(truncstorei1 GR8:$src, addr:$dst), (MOV8mr addr:$dst, GR8:$src)>; multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { -def rr : AVX512XS8I<opc, MRMDestReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), +def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; } @@ -5303,6 +5848,35 @@ multiclass avx512_convert_mask_to_vector<string OpcodeStr> { defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; +multiclass convert_vector_to_mask_common<bits<8> 
opc, X86VectorVTInfo _, string OpcodeStr > { +def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.KRC:$dst, (trunc (_.VT _.RC:$src)))]>, EVEX; +} + +multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { +let Predicates = [prd] in + defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>, + EVEX_V256; + defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>, + EVEX_V128; + } +} + +defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m", + avx512vl_i8_info, HasBWI>; +defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m", + avx512vl_i16_info, HasBWI>, VEX_W; +defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m", + avx512vl_i32_info, HasDQI>; +defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", + avx512vl_i64_info, HasDQI>, VEX_W; + //===----------------------------------------------------------------------===// // AVX-512 - COMPRESS and EXPAND // diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index 78efc4d..5e19ad4 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1216,10 +1216,10 @@ def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), let isCompare = 1 in { let Defs = [EFLAGS] in { let isCommutable = 1 in { - def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; - def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; - def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>; - def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>; + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>; } // isCommutable def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index 880c982..912a0fb 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -17,12 +17,12 @@ def GetLo32XForm : SDNodeXForm<imm, [{ // Transformation function: get the low 32 bits. - return getI32Imm((unsigned)N->getZExtValue()); + return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N)); }]>; def GetLo8XForm : SDNodeXForm<imm, [{ // Transformation function: get the low 8 bits. - return getI8Imm((uint8_t)N->getZExtValue()); + return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); }]>; @@ -475,59 +475,54 @@ def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), //===----------------------------------------------------------------------===// // Conditional Move Pseudo Instructions -// X86 doesn't have 8-bit conditional moves. Use a customInserter to -// emit control flow. An alternative to this is to mark i8 SELECT as Promote, -// however that requires promoting the operands, and can induce additional -// i8 register pressure. 
-let usesCustomInserter = 1, Uses = [EFLAGS] in { -def CMOV_GR8 : I<0, Pseudo, - (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond), - "#CMOV_GR8 PSEUDO!", - [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2, - imm:$cond, EFLAGS))]>; - -let Predicates = [NoCMov] in { -def CMOV_GR32 : I<0, Pseudo, - (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond), - "#CMOV_GR32* PSEUDO!", - [(set GR32:$dst, - (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>; -def CMOV_GR16 : I<0, Pseudo, - (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond), - "#CMOV_GR16* PSEUDO!", - [(set GR16:$dst, - (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>; -} // Predicates = [NoCMov] - -// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no -// SSE1. -let Predicates = [FPStackf32] in -def CMOV_RFP32 : I<0, Pseudo, - (outs RFP32:$dst), - (ins RFP32:$src1, RFP32:$src2, i8imm:$cond), - "#CMOV_RFP32 PSEUDO!", - [(set RFP32:$dst, - (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond, - EFLAGS))]>; -// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no -// SSE2. -let Predicates = [FPStackf64] in -def CMOV_RFP64 : I<0, Pseudo, - (outs RFP64:$dst), - (ins RFP64:$src1, RFP64:$src2, i8imm:$cond), - "#CMOV_RFP64 PSEUDO!", - [(set RFP64:$dst, - (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond, - EFLAGS))]>; -def CMOV_RFP80 : I<0, Pseudo, - (outs RFP80:$dst), - (ins RFP80:$src1, RFP80:$src2, i8imm:$cond), - "#CMOV_RFP80 PSEUDO!", - [(set RFP80:$dst, - (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond, - EFLAGS))]>; -} // UsesCustomInserter = 1, Uses = [EFLAGS] +// CMOV* - Used to implement the SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. +multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> { + def CMOV#NAME : I<0, Pseudo, + (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), + "#CMOV_"#NAME#" PSEUDO!", + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + EFLAGS)))]>; +} +let usesCustomInserter = 1, Uses = [EFLAGS] in { + // X86 doesn't have 8-bit conditional moves. Use a customInserter to + // emit control flow. An alternative to this is to mark i8 SELECT as Promote, + // however that requires promoting the operands, and can induce additional + // i8 register pressure. + defm _GR8 : CMOVrr_PSEUDO<GR8, i8>; + + let Predicates = [NoCMov] in { + defm _GR32 : CMOVrr_PSEUDO<GR32, i32>; + defm _GR16 : CMOVrr_PSEUDO<GR16, i16>; + } // Predicates = [NoCMov] + + // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no + // SSE1/SSE2. 
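(Editorial aside: a minimal, purely illustrative sketch of what the custom inserter effectively produces for these CMOV_* pseudos when no usable cmov/fcmov exists — the select is expanded into a branch diamond rather than a conditional move:)

  // Source-level equivalent of the expansion: branch on the condition code
  // and fall through to one of the two copies instead of using cmov.
  double select_via_branch(bool cond, double t, double f) {
    if (cond)     // conditional branch on the EFLAGS-derived condition
      return t;   // copy of the "true" operand
    return f;     // copy of the "false" operand
  }
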
+ let Predicates = [FPStackf32] in + defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>; + + let Predicates = [FPStackf64] in + defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>; + + defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>; + + defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; + defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; + defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; + defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; + defm _V8F32 : CMOVrr_PSEUDO<VR256, v8f32>; + defm _V4F64 : CMOVrr_PSEUDO<VR256, v4f64>; + defm _V4I64 : CMOVrr_PSEUDO<VR256, v4i64>; + defm _V8I64 : CMOVrr_PSEUDO<VR512, v8i64>; + defm _V8F64 : CMOVrr_PSEUDO<VR512, v8f64>; + defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>; + defm _V8I1 : CMOVrr_PSEUDO<VK8, v8i1>; + defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>; + defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>; + defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>; +} // usesCustomInserter = 1, Uses = [EFLAGS] //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions @@ -863,79 +858,6 @@ def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), "#ACQUIRE_MOV PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; -//===----------------------------------------------------------------------===// -// Conditional Move Pseudo Instructions. -//===----------------------------------------------------------------------===// - -// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after -// instruction selection into a branch sequence. -let Uses = [EFLAGS], usesCustomInserter = 1 in { - def CMOV_FR32 : I<0, Pseudo, - (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond), - "#CMOV_FR32 PSEUDO!", - [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond, - EFLAGS))]>; - def CMOV_FR64 : I<0, Pseudo, - (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond), - "#CMOV_FR64 PSEUDO!", - [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond, - EFLAGS))]>; - def CMOV_V4F32 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V4F32 PSEUDO!", - [(set VR128:$dst, - (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V2F64 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V2F64 PSEUDO!", - [(set VR128:$dst, - (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V2I64 : I<0, Pseudo, - (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), - "#CMOV_V2I64 PSEUDO!", - [(set VR128:$dst, - (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8F32 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V8F32 PSEUDO!", - [(set VR256:$dst, - (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V4F64 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V4F64 PSEUDO!", - [(set VR256:$dst, - (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V4I64 : I<0, Pseudo, - (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), - "#CMOV_V4I64 PSEUDO!", - [(set VR256:$dst, - (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8I64 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V8I64 PSEUDO!", - [(set VR512:$dst, - (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V8F64 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V8F64 
PSEUDO!", - [(set VR512:$dst, - (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, - EFLAGS)))]>; - def CMOV_V16F32 : I<0, Pseudo, - (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), - "#CMOV_V16F32 PSEUDO!", - [(set VR512:$dst, - (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, - EFLAGS)))]>; -} - //===----------------------------------------------------------------------===// // DAG Pattern Matching Rules @@ -1065,12 +987,12 @@ def : Pat<(X86tcret (load addr:$dst), imm:$off), Requires<[Not64BitMode, IsNotPIC]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), - (TCRETURNdi texternalsym:$dst, imm:$off)>, - Requires<[Not64BitMode]>; + (TCRETURNdi tglobaladdr:$dst, imm:$off)>, + Requires<[NotLP64]>; def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), (TCRETURNdi texternalsym:$dst, imm:$off)>, - Requires<[Not64BitMode]>; + Requires<[NotLP64]>; def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, @@ -1084,11 +1006,11 @@ def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[IsLP64]>; def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), (TCRETURNdi64 texternalsym:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[IsLP64]>; // Normal calls, with various flavors of addresses. def : Pat<(X86call (i32 tglobaladdr:$dst)), @@ -1142,11 +1064,12 @@ defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; // zextload bool -> zextload byte -def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; -def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; -def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>; +def : Pat<(zextloadi16i1 addr:$src), (AND16ri (MOVZX16rm8 addr:$src), (i16 1))>; +def : Pat<(zextloadi32i1 addr:$src), (AND32ri (MOVZX32rm8 addr:$src), (i32 1))>; def : Pat<(zextloadi64i1 addr:$src), - (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + (SUBREG_TO_REG (i64 0), + (AND32ri (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; // extload bool -> extload byte // When extloading from 16-bit and smaller memory locations into 64-bit @@ -1314,7 +1237,11 @@ def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst), // with implicit zero-extension instead of a 64-bit and if the immediate has at // least 32 bits of leading zeros. If in addition the last 32 bits can be // represented with a sign extension of a 8 bit constant, use that. +// This can also reduce instruction size by eliminating the need for the REX +// prefix. +// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32. +let AddedComplexity = 1 in { def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), (SUBREG_TO_REG (i64 0), @@ -1330,8 +1257,13 @@ def : Pat<(and GR64:$src, i64immZExt32:$imm), (EXTRACT_SUBREG GR64:$src, sub_32bit), (i32 (GetLo32XForm imm:$imm))), sub_32bit)>; +} // AddedComplexity = 1 +// AddedComplexity is needed due to the increased complexity on the +// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all +// the MOVZX patterns keeps thems together in DAGIsel tables. 
+let AddedComplexity = 1 in { // r & (2^16-1) ==> movz def : Pat<(and GR32:$src1, 0xffff), (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; @@ -1354,6 +1286,7 @@ def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), sub_32bit)>; // r & (2^16-1) ==> movz +let AddedComplexity = 1 in // Give priority over i64immZExt32. def : Pat<(and GR64:$src, 0xffff), (SUBREG_TO_REG (i64 0), (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), @@ -1372,6 +1305,7 @@ def : Pat<(and GR16:$src1, 0xff), (EXTRACT_SUBREG (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, Requires<[In64BitMode]>; +} // AddedComplexity = 1 // sext_inreg patterns @@ -1563,8 +1497,12 @@ def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; // Helper imms that check if a mask doesn't change significant shift bits. -def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>; -def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>; +def immShift32 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 5; +}]>; +def immShift64 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 6; +}]>; // Shift amount is implicitly masked. multiclass MaskedShiftAmountPats<SDNode frag, string name> { diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index 7baff19..6ab961f 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -240,13 +240,13 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, // mcinst. def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), (ins i32imm_pcrel:$dst), - "jmp\t$dst # TAILCALL", + "jmp\t$dst", [], IIC_JMP_REL>; def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. let mayLoad = 1 in def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), - "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>; } @@ -278,18 +278,6 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; } -let isCall = 1, isCodeGenOnly = 1 in - // __chkstk(MSVC): clobber R10, R11 and EFLAGS - // ___chkstk_ms(Mingw64): clobber R10, R11 and EFLAGS - // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP. 
- let Defs = [RAX, R10, R11, RSP, EFLAGS], - Uses = [RSP] in { - def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, - (outs), (ins i64i32imm_pcrel:$dst), - "call{q}\t$dst", [], IIC_CALL_RI>, - Requires<[IsWin64]>, Sched<[WriteJump]>; - } - let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1, SchedRW = [WriteJump] in { @@ -302,13 +290,25 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, def TCRETURNmi64 : PseudoI<(outs), (ins i64mem_TC:$dst, i32imm:$offset), []>; - def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i64i32imm_pcrel:$dst), - "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>; + def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), + "jmp\t$dst", [], IIC_JMP_REL>; def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; let mayLoad = 1 in def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + // Win64 wants jumps leaving the function to have a REX_W prefix. + let hasREX_WPrefix = 1 in { + def TAILJMPd64_REX : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i64i32imm_pcrel:$dst), + "rex64 jmp\t$dst", [], IIC_JMP_REL>; + def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + let mayLoad = 1 in + def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + } } diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index eea4f17..c4b2d6d 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -139,11 +139,11 @@ def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), "movs{lq|xd}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>, - Sched<[WriteALU]>; + Sched<[WriteALU]>, Requires<[In64BitMode]>; def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), "movs{lq|xd}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>, - Sched<[WriteALULd]>; + Sched<[WriteALULd]>, Requires<[In64BitMode]>; // movzbq and movzwq encodings for the disassembler def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index 2993e42..7cc3b59 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -183,19 +183,24 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode, FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W; +// These patterns use the 123 ordering, instead of 213, even though +// they match the intrinsic to the 213 version of the instruction. +// This is because src1 is tied to dest, and the scalar intrinsics +// require the pass-through values to come from the first source +// operand, not the second. 
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SSr213r") - (COPY_TO_REGCLASS $src2, FR32), (COPY_TO_REGCLASS $src1, FR32), + (COPY_TO_REGCLASS $src2, FR32), (COPY_TO_REGCLASS $src3, FR32)), VR128)>; def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), (COPY_TO_REGCLASS (!cast<Instruction>(NAME#"SDr213r") - (COPY_TO_REGCLASS $src2, FR64), (COPY_TO_REGCLASS $src1, FR64), + (COPY_TO_REGCLASS $src2, FR64), (COPY_TO_REGCLASS $src3, FR64)), VR128)>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index e1abf26..0dd05d8 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -350,21 +350,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>; let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. -def CMOVB_F : FPI<0xDA, MRM0r, (outs RST:$op), (ins), +def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op), "fcmovb\t{$op, %st(0)|st(0), $op}">; -def CMOVBE_F : FPI<0xDA, MRM2r, (outs RST:$op), (ins), +def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op), "fcmovbe\t{$op, %st(0)|st(0), $op}">; -def CMOVE_F : FPI<0xDA, MRM1r, (outs RST:$op), (ins), +def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op), "fcmove\t{$op, %st(0)|st(0), $op}">; -def CMOVP_F : FPI<0xDA, MRM3r, (outs RST:$op), (ins), +def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op), "fcmovu\t{$op, %st(0)|st(0), $op}">; -def CMOVNB_F : FPI<0xDB, MRM0r, (outs RST:$op), (ins), +def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op), "fcmovnb\t{$op, %st(0)|st(0), $op}">; -def CMOVNBE_F: FPI<0xDB, MRM2r, (outs RST:$op), (ins), +def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op), "fcmovnbe\t{$op, %st(0)|st(0), $op}">; -def CMOVNE_F : FPI<0xDB, MRM1r, (outs RST:$op), (ins), +def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op), "fcmovne\t{$op, %st(0)|st(0), $op}">; -def CMOVNP_F : FPI<0xDB, MRM3r, (outs RST:$op), (ins), +def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op), "fcmovnu\t{$op, %st(0)|st(0), $op}">; } // Predicates = [HasCMov] @@ -636,12 +636,12 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), "fxsave\t$dst", [], IIC_FXSAVE>, TB; def FXSAVE64 : RI<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsave{q|64}\t$dst", [], IIC_FXSAVE>, TB, + "fxsave64\t$dst", [], IIC_FXSAVE>, TB, Requires<[In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), "fxrstor\t$src", [], IIC_FXRSTOR>, TB; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor{q|64}\t$src", [], IIC_FXRSTOR>, TB, + "fxrstor64\t$src", [], IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; } // SchedRW diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td index ba23878..331faf2 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -34,23 +34,27 @@ def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>; def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>; def MRM6m : Format<30>; def MRM7m : Format<31>; def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>; -def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>; -def MRM_C9 : Format<38>; def MRM_CA : Format<39>; def MRM_CB : Format<40>; -def MRM_CF : Format<41>; def MRM_D0 : Format<42>; def MRM_D1 : Format<43>; 
-def MRM_D4 : Format<44>; def MRM_D5 : Format<45>; def MRM_D6 : Format<46>; -def MRM_D7 : Format<47>; def MRM_D8 : Format<48>; def MRM_D9 : Format<49>; -def MRM_DA : Format<50>; def MRM_DB : Format<51>; def MRM_DC : Format<52>; -def MRM_DD : Format<53>; def MRM_DE : Format<54>; def MRM_DF : Format<55>; -def MRM_E0 : Format<56>; def MRM_E1 : Format<57>; def MRM_E2 : Format<58>; -def MRM_E3 : Format<59>; def MRM_E4 : Format<60>; def MRM_E5 : Format<61>; -def MRM_E8 : Format<62>; def MRM_E9 : Format<63>; def MRM_EA : Format<64>; -def MRM_EB : Format<65>; def MRM_EC : Format<66>; def MRM_ED : Format<67>; -def MRM_EE : Format<68>; def MRM_F0 : Format<69>; def MRM_F1 : Format<70>; -def MRM_F2 : Format<71>; def MRM_F3 : Format<72>; def MRM_F4 : Format<73>; -def MRM_F5 : Format<74>; def MRM_F6 : Format<75>; def MRM_F7 : Format<76>; -def MRM_F8 : Format<77>; def MRM_F9 : Format<78>; def MRM_FA : Format<79>; -def MRM_FB : Format<80>; def MRM_FC : Format<81>; def MRM_FD : Format<82>; -def MRM_FE : Format<83>; def MRM_FF : Format<84>; +def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C5 : Format<37>; +def MRM_C6 : Format<38>; def MRM_C7 : Format<39>; def MRM_C8 : Format<40>; +def MRM_C9 : Format<41>; def MRM_CA : Format<42>; def MRM_CB : Format<43>; +def MRM_CC : Format<44>; def MRM_CD : Format<45>; def MRM_CE : Format<46>; +def MRM_CF : Format<47>; def MRM_D0 : Format<48>; def MRM_D1 : Format<49>; +def MRM_D2 : Format<50>; def MRM_D3 : Format<51>; def MRM_D4 : Format<52>; +def MRM_D5 : Format<53>; def MRM_D6 : Format<54>; def MRM_D7 : Format<55>; +def MRM_D8 : Format<56>; def MRM_D9 : Format<57>; def MRM_DA : Format<58>; +def MRM_DB : Format<59>; def MRM_DC : Format<60>; def MRM_DD : Format<61>; +def MRM_DE : Format<62>; def MRM_DF : Format<63>; def MRM_E0 : Format<64>; +def MRM_E1 : Format<65>; def MRM_E2 : Format<66>; def MRM_E3 : Format<67>; +def MRM_E4 : Format<68>; def MRM_E5 : Format<69>; def MRM_E6 : Format<70>; +def MRM_E7 : Format<71>; def MRM_E8 : Format<72>; def MRM_E9 : Format<73>; +def MRM_EA : Format<74>; def MRM_EB : Format<75>; def MRM_EC : Format<76>; +def MRM_ED : Format<77>; def MRM_EE : Format<78>; def MRM_EF : Format<79>; +def MRM_F0 : Format<80>; def MRM_F1 : Format<81>; def MRM_F2 : Format<82>; +def MRM_F3 : Format<83>; def MRM_F4 : Format<84>; def MRM_F5 : Format<85>; +def MRM_F6 : Format<86>; def MRM_F7 : Format<87>; def MRM_F8 : Format<88>; +def MRM_F9 : Format<89>; def MRM_FA : Format<90>; def MRM_FB : Format<91>; +def MRM_FC : Format<92>; def MRM_FD : Format<93>; def MRM_FE : Format<94>; +def MRM_FF : Format<95>; // ImmType - This specifies the immediate type used by an instruction. 
This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -422,8 +426,9 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, // SI - SSE 1 & 2 scalar instructions class SI<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern, InstrItinClass itin = NoItinerary> - : I<o, F, outs, ins, asm, pattern, itin> { + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, itin, d> { let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX], !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], @@ -437,12 +442,29 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm, asm)); } -// SIi8 - SSE 1 & 2 scalar instructions +// SI - SSE 1 & 2 scalar intrinsics - vex form available on AVX512 +class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} +// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512 class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin> { let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], - !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], [UseSSE2]))); @@ -742,6 +764,14 @@ class AVX512BIi8Base : PD { Domain ExeDomain = SSEPackedInt; ImmType ImmT = Imm8; } +class AVX512PSIi8Base : PS { + Domain ExeDomain = SSEPackedSingle; + ImmType ImmT = Imm8; +} +class AVX512PDIi8Base : PD { + Domain ExeDomain = SSEPackedDouble; + ImmType ImmT = Imm8; +} class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 7069bd6..79d213c 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -12,10 +12,23 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// MMX specific DAG Nodes. +//===----------------------------------------------------------------------===// + +// Low word of MMX to GPR. +def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, + [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; +// GPR to low word of MMX. 
+def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>; + +//===----------------------------------------------------------------------===// // MMX Pattern Fragments //===----------------------------------------------------------------------===// def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>; +def load_mvmmx : PatFrag<(ops node:$ptr), + (x86mmx (MMX_X86movw2d (load node:$ptr)))>; def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; //===----------------------------------------------------------------------===// @@ -134,14 +147,21 @@ def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>; def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>; def X86CmpMaskCC : - SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<0>, SDTCisVec<1>, - SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; +def X86CmpMaskCCRound : + SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; -def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; -def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; +def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; def X86vshl : SDNode<"X86ISD::VSHL", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -160,16 +180,21 @@ def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>; def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; +def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>; +def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisVec<1>, - SDTCisSameAs<2, 1>]>>; + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCVecEltisVT<0, i1>, + SDTCisSameNumEltsAs<0, 1>]>>; def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisVec<1>, - SDTCisSameAs<2, 1>]>>; + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCVecEltisVT<0, i1>, + SDTCisSameNumEltsAs<0, 1>]>>; def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", @@ -201,12 +226,19 @@ def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; +def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. 
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>; + def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>; def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisVec<0>, SDTCisInt<2>]>; def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, SDTCisVec<0>, SDTCisInt<3>]>; +def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; @@ -247,8 +279,10 @@ def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; +def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSubVecOfVec<1, 0>]>, []>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; -def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, @@ -258,6 +292,13 @@ def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; +def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; +def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; +def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; +def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; + def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; @@ -265,12 +306,20 @@ def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; +def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>; +def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>; +def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound>; +def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>; +def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>; +def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>; + def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; +def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -346,6 +395,15 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; +// These are needed to match a scalar load that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. 
+def loadf32_128 : PatFrag<(ops node:$ptr), + (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>; +def loadf64_128 : PatFrag<(ops node:$ptr), + (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>; + // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ @@ -437,17 +495,15 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; -// 256-bit memop pattern fragments -// NOTE: all 256-bit integer vector loads are promoted to v4i64 -def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; -def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; -def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; +// These are needed to match a scalar memop that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. +def memopfsf32_128 : PatFrag<(ops node:$ptr), + (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>; +def memopfsf64_128 : PatFrag<(ops node:$ptr), + (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>; -// 512-bit memop pattern fragments -def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>; -def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>; -def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>; -def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. 
@@ -484,6 +540,52 @@ def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), return false; }]>; +def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v8i32 || + Mgt->getBasePtr().getValueType() == MVT::v8i32); + return false; +}]>; + +def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v8i64 || + Mgt->getBasePtr().getValueType() == MVT::v8i64); + return false; +}]>; +def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v16i32 || + Mgt->getBasePtr().getValueType() == MVT::v16i32); + return false; +}]>; + +def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v8i32 || + Sc->getBasePtr().getValueType() == MVT::v8i32); + return false; +}]>; + +def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v8i64 || + Sc->getBasePtr().getValueType() == MVT::v8i64); + return false; +}]>; +def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v16i32 || + Sc->getBasePtr().getValueType() == MVT::v16i32); + return false; +}]>; + // 128-bit bitconvert pattern fragments def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; @@ -522,7 +624,7 @@ def fp32imm0 : PatLeaf<(f32 fpimm), [{ def I8Imm : SDNodeXForm<imm, [{ // Transformation function: get the low 8 bits. - return getI8Imm((uint8_t)N->getZExtValue()); + return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); }]>; def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>; @@ -533,31 +635,31 @@ def FROUND_CURRENT : ImmLeaf<i32, [{ // BYTE_imm - Transform bit immediates into byte immediates. def BYTE_imm : SDNodeXForm<imm, [{ // Transformation function: imm >> 3 - return getI32Imm(N->getZExtValue() >> 3); + return getI32Imm(N->getZExtValue() >> 3, SDLoc(N)); }]>; // EXTRACT_get_vextract128_imm xform function: convert extract_subvector index // to VEXTRACTF128/VEXTRACTI128 imm. def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ - return getI8Imm(X86::getExtractVEXTRACT128Immediate(N)); + return getI8Imm(X86::getExtractVEXTRACT128Immediate(N), SDLoc(N)); }]>; // INSERT_get_vinsert128_imm xform function: convert insert_subvector index to // VINSERTF128/VINSERTI128 imm. 
def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{ - return getI8Imm(X86::getInsertVINSERT128Immediate(N)); + return getI8Imm(X86::getInsertVINSERT128Immediate(N), SDLoc(N)); }]>; // EXTRACT_get_vextract256_imm xform function: convert extract_subvector index // to VEXTRACTF64x4 imm. def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{ - return getI8Imm(X86::getExtractVEXTRACT256Immediate(N)); + return getI8Imm(X86::getExtractVEXTRACT256Immediate(N), SDLoc(N)); }]>; // INSERT_get_vinsert256_imm xform function: convert insert_subvector index to // VINSERTF64x4 imm. def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ - return getI8Imm(X86::getInsertVINSERT256Immediate(N)); + return getI8Imm(X86::getInsertVINSERT256Immediate(N), SDLoc(N)); }]>; def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), @@ -587,3 +689,55 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, return X86::isVINSERT256Index(N); }], INSERT_get_vinsert256_imm>; +def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 16; + return false; +}]>; + +def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 32; + return false; +}]>; + +def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 64; + return false; +}]>; + +def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + return isa<MaskedLoadSDNode>(N); +}]>; + +def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 16; + return false; +}]>; + +def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 32; + return false; +}]>; + +def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 64; + return false; +}]>; + +def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return isa<MaskedStoreSDNode>(N); +}]>; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b6b8ae..43decf7 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -91,7 +91,7 @@ enum { TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT }; -struct X86OpTblEntry { +struct X86MemoryFoldTableEntry { uint16_t RegOp; uint16_t MemOp; uint16_t Flags; @@ -104,9 +104,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) : X86GenInstrInfo( (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), - Subtarget(STI), RI(STI) { + Subtarget(STI), RI(STI.getTargetTriple()) { - static const X86OpTblEntry OpTbl2Addr[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, { X86::ADC32ri8, X86::ADC32mi8, 0 }, { X86::ADC32rr, X86::ADC32mr, 0 }, @@ -269,17 +269,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::XOR8rr, X86::XOR8mr, 0 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) { - unsigned RegOp = OpTbl2Addr[i].RegOp; - unsigned MemOp = OpTbl2Addr[i].MemOp; - unsigned Flags = OpTbl2Addr[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) { + unsigned RegOp = MemoryFoldTable2Addr[i].RegOp; + unsigned MemOp = MemoryFoldTable2Addr[i].MemOp; + unsigned Flags = MemoryFoldTable2Addr[i].Flags; AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, RegOp, MemOp, // Index 0, folded load and store, no alignment requirement. Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } - static const X86OpTblEntry OpTbl0[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, @@ -333,6 +333,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, + { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, + { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, @@ -351,10 +353,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, + { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, + // AVX 128-bit versions of foldable instructions { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -367,6 +371,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, + { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE }, + { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE }, + // AVX 256-bit foldable instructions { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, @@ -374,6 +381,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, @@ -386,6 +394,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, { 
X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions (256-bit versions) { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, @@ -397,6 +406,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions (128-bit versions) { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, @@ -407,18 +417,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE } + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }, + + // F16C foldable instructions + { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE }, + { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } }; - for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { - unsigned RegOp = OpTbl0[i].RegOp; - unsigned MemOp = OpTbl0[i].MemOp; - unsigned Flags = OpTbl0[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) { + unsigned RegOp = MemoryFoldTable0[i].RegOp; + unsigned MemOp = MemoryFoldTable0[i].MemOp; + unsigned Flags = MemoryFoldTable0[i].Flags; AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, RegOp, MemOp, TB_INDEX_0 | Flags); } - static const X86OpTblEntry OpTbl1[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::CMP16rr, X86::CMP16rm, 0 }, { X86::CMP32rr, X86::CMP32rm, 0 }, { X86::CMP64rr, X86::CMP64rm, 0 }, @@ -445,10 +459,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, + { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 }, { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, + { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, @@ -488,13 +504,33 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, + { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, + { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, + { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, + { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, + { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, + { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 }, + { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 }, + { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 }, + { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 }, + { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 }, + { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 }, + { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 }, + { X86::PMOVZXBQrr, X86::PMOVZXBQrm, 
TB_ALIGN_16 }, + { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 }, + { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 }, + { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 }, + { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 }, { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, + { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, - { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 }, + { X86::RCPSSr, X86::RCPSSm, 0 }, + { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 }, + { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, + { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, - { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 }, { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, @@ -510,6 +546,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, + + // MMX version of foldable instructions + { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 }, + { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, + { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 }, + { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 }, + { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 }, + { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, + { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 }, + { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 }, + { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 }, + { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 }, + + // 3DNow! version of foldable instructions + { X86::PF2IDrr, X86::PF2IDrm, 0 }, + { X86::PF2IWrr, X86::PF2IWrm, 0 }, + { X86::PFRCPrr, X86::PFRCPrm, 0 }, + { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 }, + { X86::PI2FDrr, X86::PI2FDrm, 0 }, + { X86::PI2FWrr, X86::PI2FWrm, 0 }, + { X86::PSWAPDrr, X86::PSWAPDrm, 0 }, + // AVX 128-bit versions of foldable instructions { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, @@ -527,10 +585,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, + { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, + { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 }, { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, @@ -541,8 +601,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, - { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 }, - { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 }, + { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, + { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, @@ -550,51 +610,147 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, + { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, + { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, 
+ { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, + { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, + { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, + { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 }, + { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 }, + { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 }, + { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 }, + { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 }, + { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 }, + { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 }, + { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 }, + { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 }, + { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 }, + { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 }, + { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 }, { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, + { X86::VPTESTrr, X86::VPTESTrm, 0 }, { X86::VRCPPSr, X86::VRCPPSm, 0 }, - { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 }, + { X86::VROUNDPDr, X86::VROUNDPDm, 0 }, + { X86::VROUNDPSr, X86::VROUNDPSm, 0 }, { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, - { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 }, { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, + { X86::VTESTPDrr, X86::VTESTPDrm, 0 }, + { X86::VTESTPSrr, X86::VTESTPSrm, 0 }, { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, - { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, // AVX 256-bit foldable instructions + { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, + { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 }, { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, + { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, + { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, + { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, + { X86::VPTESTYrr, X86::VPTESTYrm, 0 }, { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, - { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, + { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 }, + { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 }, { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, - { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, + { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 }, + { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 }, // AVX2 foldable instructions + + // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the + // VBROADCASTS{SD}rm memory instructions were available from AVX1. + // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction + // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions + // so they don't need an equivalent limitation. 
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, + { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 }, + { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 }, + { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 }, + { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 }, + { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 }, + { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 }, + { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 }, + { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 }, + { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, + { X86::VPERMQYri, X86::VPERMQYmi, 0 }, + { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 }, + { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 }, + { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, + { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, + { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, + { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 }, + { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 }, + { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 }, + { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, + { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, + { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, + { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 }, { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, + // XOP foldable instructions + { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 }, + { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 }, + { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 }, + { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 }, + { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 }, + { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 }, + { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 }, + { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 }, + { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 }, + { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 }, + { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 }, + { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 }, + { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 }, + { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 }, + { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 }, + { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 }, + { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 }, + { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 }, + { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 }, + { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 }, + { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 }, + { X86::VPROTBri, X86::VPROTBmi, 0 }, + { X86::VPROTBrr, X86::VPROTBmr, 0 }, + { X86::VPROTDri, X86::VPROTDmi, 0 }, + { X86::VPROTDrr, X86::VPROTDmr, 0 }, + { X86::VPROTQri, X86::VPROTQmi, 0 }, + { X86::VPROTQrr, X86::VPROTQmr, 0 }, + { X86::VPROTWri, X86::VPROTWmi, 0 }, + { X86::VPROTWrr, X86::VPROTWmr, 0 }, + { X86::VPSHABrr, X86::VPSHABmr, 0 }, + { X86::VPSHADrr, X86::VPSHADmr, 0 }, + { X86::VPSHAQrr, X86::VPSHAQmr, 0 }, + { X86::VPSHAWrr, X86::VPSHAWmr, 0 }, + { X86::VPSHLBrr, X86::VPSHLBmr, 0 }, + { X86::VPSHLDrr, X86::VPSHLDmr, 0 }, + { X86::VPSHLQrr, X86::VPSHLQmr, 0 }, + { X86::VPSHLWrr, X86::VPSHLWmr, 0 }, + // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, @@ -661,6 +817,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + // AVX-512 foldable instructions (256-bit versions) { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, { 
X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, @@ -674,6 +831,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + // AVX-512 foldable instructions (256-bit versions) { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, @@ -687,24 +845,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + // F16C foldable instructions + { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, + { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, + // AES foldable instructions { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, - { X86::VAESIMCrr, X86::VAESIMCrm, TB_ALIGN_16 }, - { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 } + { X86::VAESIMCrr, X86::VAESIMCrm, 0 }, + { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { - unsigned RegOp = OpTbl1[i].RegOp; - unsigned MemOp = OpTbl1[i].MemOp; - unsigned Flags = OpTbl1[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) { + unsigned RegOp = MemoryFoldTable1[i].RegOp; + unsigned MemOp = MemoryFoldTable1[i].MemOp; + unsigned Flags = MemoryFoldTable1[i].Flags; AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, RegOp, MemOp, // Index 1, folded load Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } - static const X86OpTblEntry OpTbl2[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::ADC32rr, X86::ADC32rm, 0 }, { X86::ADC64rr, X86::ADC64rm, 0 }, { X86::ADD16rr, X86::ADD16rm, 0 }, @@ -717,7 +879,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, { X86::ADDSDrr, X86::ADDSDrm, 0 }, + { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 }, { X86::ADDSSrr, X86::ADDSSrm, 0 }, + { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 }, { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, { X86::AND16rr, X86::AND16rm, 0 }, @@ -784,10 +948,21 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, { X86::CMPSDrr, X86::CMPSDrm, 0 }, { X86::CMPSSrr, X86::CMPSSrm, 0 }, + { X86::CRC32r32r32, X86::CRC32r32m32, 0 }, + { X86::CRC32r64r64, X86::CRC32r64m64, 0 }, { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, { X86::DIVSDrr, X86::DIVSDrm, 0 }, + { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 }, { X86::DIVSSrr, X86::DIVSSrm, 0 }, + { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 }, + { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, + { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, + + // FIXME: We should not be folding Fs* scalar loads into vector + // instructions because the vector instructions require vector-sized + // loads. Lowering should create vector-sized instructions (the Fv* + // variants below) to allow load folding. 
{ X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, @@ -796,6 +971,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, + + { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, + { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, + { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 }, + { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 }, + { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 }, + { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 }, + { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 }, + { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 }, { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, @@ -814,16 +998,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, { X86::MAXSDrr, X86::MAXSDrm, 0 }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, { X86::MAXSSrr, X86::MAXSSrm, 0 }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, { X86::MINSDrr, X86::MINSDrm, 0 }, + { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, { X86::MINSSrr, X86::MINSSrm, 0 }, + { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, { X86::MULSDrr, X86::MULSDrm, 0 }, + { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 }, { X86::MULSSrr, X86::MULSSrm, 0 }, + { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 }, { X86::OR16rr, X86::OR16rm, 0 }, { X86::OR32rr, X86::OR32rm, 0 }, { X86::OR64rr, X86::OR64rm, 0 }, @@ -847,7 +1037,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, + { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 }, { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, + { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 }, { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, @@ -862,7 +1054,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, - { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 }, + { X86::PINSRBrr, X86::PINSRBrm, 0 }, + { X86::PINSRDrr, X86::PINSRDrm, 0 }, + { X86::PINSRQrr, X86::PINSRQrm, 0 }, + { X86::PINSRWrri, X86::PINSRWrmi, 0 }, { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, @@ -900,8 +1095,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, + { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 }, { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, + { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 }, + { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 }, { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, 
TB_ALIGN_16 }, @@ -923,7 +1121,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, { X86::SUBSDrr, X86::SUBSDrm, 0 }, + { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, + { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 }, // FIXME: TEST*rr -> swapped operand of TEST*mr. { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, @@ -935,6 +1135,98 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::XOR8rr, X86::XOR8rm, 0 }, { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, + + // MMX version of foldable instructions + { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, + { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, + { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 }, + { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, + { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, + { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, + { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, + { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, + { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, + { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, + { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, + { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, + { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 }, + { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, + { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, + { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, + { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, + { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, + { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, + { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, + { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, + { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, + { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, + { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 }, + { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 }, + { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 }, + { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 }, + { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 }, + { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 }, + { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 }, + { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 }, + { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, + { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, + { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, + { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, + { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, + { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 }, + { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, + { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, + { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, + { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, + { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, + { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, + { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 }, + { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 }, + { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 }, + { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 }, + { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 }, + { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 }, + { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 }, + { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 }, + { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 }, + { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, + { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, + { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, + { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, + { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, + { 
X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 }, + { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, + { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, + { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, + { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, + { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, + { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, + { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, + { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, + { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 }, + { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 }, + { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 }, + { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, + + // 3DNow! version of foldable instructions + { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 }, + { X86::PFACCrr, X86::PFACCrm, 0 }, + { X86::PFADDrr, X86::PFADDrm, 0 }, + { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 }, + { X86::PFCMPGErr, X86::PFCMPGErm, 0 }, + { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 }, + { X86::PFMAXrr, X86::PFMAXrm, 0 }, + { X86::PFMINrr, X86::PFMINrm, 0 }, + { X86::PFMULrr, X86::PFMULrm, 0 }, + { X86::PFNACCrr, X86::PFNACCrm, 0 }, + { X86::PFPNACCrr, X86::PFPNACCrm, 0 }, + { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 }, + { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 }, + { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 }, + { X86::PFSUBrr, X86::PFSUBrm, 0 }, + { X86::PFSUBRrr, X86::PFSUBRrm, 0 }, + { X86::PMULHRWrr, X86::PMULHRWrm, 0 }, + // AVX 128-bit versions of foldable instructions { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, @@ -948,13 +1240,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, + { X86::VRCPSSr, X86::VRCPSSm, 0 }, + { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, + { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, + { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, + { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, + { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, + { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 }, { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, @@ -972,15 +1271,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, + { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 }, { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, - { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 }, - { X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 }, - { X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 }, - { X86::VFsANDPSrr, X86::VFsANDPSrm, TB_ALIGN_16 }, - { X86::VFsORPDrr, X86::VFsORPDrm, TB_ALIGN_16 }, - { X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 }, - { X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 }, - { X86::VFsXORPSrr, X86::VFsXORPSrm, TB_ALIGN_16 }, + { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 }, + { X86::VDPPDrri, X86::VDPPDrmi, 0 }, + { X86::VDPPSrri, X86::VDPPSrmi, 0 }, + // Do not fold VFs* loads because there are no scalar load variants for + // these instructions. When folded, the load is required to be 128-bits, so + // the load size would not match. 
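The comment above and the earlier Fs* FIXME describe the same hazard: these opcodes operate on a full 128-bit register, so a folded memory operand implies a 128-bit load even when the underlying object is only a 4- or 8-byte scalar. Elsewhere in this file, foldMemoryOperandImpl also applies a plain size comparison before fusing a load from a stack slot; a simplified standalone sketch of that check (parameter names are illustrative):

// Reject a fold when the memory object (e.g. a spill slot) is narrower than
// the register class the memory form would load; sizes are in bytes.
// (Simplified: the real code can also narrow some 64-bit GPR loads to MOV32rm.)
bool foldedLoadSizeIsSafe(unsigned ObjectSize, unsigned RegClassSize) {
  if (ObjectSize == 0)
    return true;                       // unknown size: left to other checks
  return ObjectSize >= RegClassSize;   // 4-byte f32 slot vs. 16-byte XMM load -> false
}

The VFv* entries that follow are the vector-sized forms that this commit marks as foldable.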
+ { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 }, + { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 }, + { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 }, + { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 }, + { X86::VFvORPDrr, X86::VFvORPDrm, 0 }, + { X86::VFvORPSrr, X86::VFvORPSrm, 0 }, + { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 }, + { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 }, { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, @@ -990,16 +1296,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, + { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, + { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, { X86::VMINPDrr, X86::VMINPDrm, 0 }, { X86::VMINPSrr, X86::VMINPSrm, 0 }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, + { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, + { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, { X86::VMULPSrr, X86::VMULPSrm, 0 }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, + { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 }, { X86::VMULSSrr, X86::VMULSSrm, 0 }, + { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 }, { X86::VORPDrr, X86::VORPDrm, 0 }, { X86::VORPSrr, X86::VORPSrm, 0 }, { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, @@ -1019,7 +1331,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDrr, X86::VPANDrm, 0 }, { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, + { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 }, { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, + { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 }, { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, @@ -1036,6 +1350,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, + { X86::VPINSRBrr, X86::VPINSRBrm, 0 }, + { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, + { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, @@ -1074,8 +1391,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, + { X86::VPSUBQrr, X86::VPSUBQrm, 0 }, { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, + { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 }, + { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 }, { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, @@ -1091,13 +1411,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, + { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, + { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 }, { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, { X86::VXORPDrr, X86::VXORPDrm, 0 }, { X86::VXORPSrr, X86::VXORPSrm, 0 }, + // AVX 256-bit foldable instructions { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, @@ -1115,6 +1438,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { 
X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, + { X86::VDPPSYrri, X86::VDPPSYrmi, 0 }, { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, @@ -1141,6 +1465,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, + // AVX2 foldable instructions { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, @@ -1162,6 +1487,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, + { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 }, { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, @@ -1173,9 +1499,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, - { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, - { X86::VPERMQYri, X86::VPERMQYmi, 0 }, { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, @@ -1230,8 +1554,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, + { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 }, { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, + { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 }, + { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 }, { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, @@ -1242,41 +1569,81 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, { X86::VPXORYrr, X86::VPXORYrm, 0 }, - // FIXME: add AVX 256-bit foldable instructions // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 }, - { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 }, - { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 }, - { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 }, - { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 }, - { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 }, - { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 }, - { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 }, - { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 }, - { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 }, - { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 }, - { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 }, - { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 }, - { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 }, - { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 }, - { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 }, - { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 }, - { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 }, - { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 }, - { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 }, - { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 }, - { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 }, - { X86::VFMADDSUBPS4rr, 
X86::VFMADDSUBPS4mr, TB_ALIGN_16 }, - { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 }, - { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 }, - { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 }, - { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 }, - { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 }, - { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 }, - { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 }, + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 }, + { X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 }, + { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, 0 }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, 0 }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, 0 }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, 0 }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, 0 }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, 0 }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, 0 }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, 0 }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, 0 }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, 0 }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, 0 }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, 0 }, + + // XOP foldable instructions + { X86::VPCMOVrr, X86::VPCMOVmr, 0 }, + { X86::VPCMOVrrY, X86::VPCMOVmrY, 0 }, + { X86::VPCOMBri, X86::VPCOMBmi, 0 }, + { X86::VPCOMDri, X86::VPCOMDmi, 0 }, + { X86::VPCOMQri, X86::VPCOMQmi, 0 }, + { X86::VPCOMWri, X86::VPCOMWmi, 0 }, + { X86::VPCOMUBri, X86::VPCOMUBmi, 0 }, + { X86::VPCOMUDri, X86::VPCOMUDmi, 0 }, + { X86::VPCOMUQri, X86::VPCOMUQmi, 0 }, + { X86::VPCOMUWri, X86::VPCOMUWmi, 0 }, + { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 }, + { X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 }, + { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 }, + { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 }, + { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 }, + { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 }, + { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 }, + { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 }, + { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 }, + { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 }, + { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 }, + { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 }, + { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 }, + { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 }, + { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 }, + { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 }, + { X86::VPPERMrr, X86::VPPERMmr, 0 }, + { X86::VPROTBrr, X86::VPROTBrm, 0 }, + { X86::VPROTDrr, X86::VPROTDrm, 0 }, + { X86::VPROTQrr, X86::VPROTQrm, 0 }, + { X86::VPROTWrr, X86::VPROTWrm, 0 }, + { X86::VPSHABrr, X86::VPSHABrm, 0 }, + { X86::VPSHADrr, X86::VPSHADrm, 0 }, + { X86::VPSHAQrr, X86::VPSHAQrm, 0 }, + { X86::VPSHAWrr, X86::VPSHAWrm, 0 }, + { X86::VPSHLBrr, 
X86::VPSHLBrm, 0 }, + { X86::VPSHLDrr, X86::VPSHLDrm, 0 }, + { X86::VPSHLQrr, X86::VPSHLQrm, 0 }, + { X86::VPSHLWrr, X86::VPSHLWrm, 0 }, // BMI/BMI2 foldable instructions { X86::ANDN32rr, X86::ANDN32rm, 0 }, @@ -1345,10 +1712,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, - { X86::VAESDECLASTrr, X86::VAESDECLASTrm, TB_ALIGN_16 }, - { X86::VAESDECrr, X86::VAESDECrm, TB_ALIGN_16 }, - { X86::VAESENCLASTrr, X86::VAESENCLASTrm, TB_ALIGN_16 }, - { X86::VAESENCrr, X86::VAESENCrm, TB_ALIGN_16 }, + { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 }, + { X86::VAESDECrr, X86::VAESDECrm, 0 }, + { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 }, + { X86::VAESENCrr, X86::VAESENCrm, 0 }, // SHA foldable instructions { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, @@ -1357,20 +1724,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 }, { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 }, - { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }, + { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { - unsigned RegOp = OpTbl2[i].RegOp; - unsigned MemOp = OpTbl2[i].MemOp; - unsigned Flags = OpTbl2[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) { + unsigned RegOp = MemoryFoldTable2[i].RegOp; + unsigned MemOp = MemoryFoldTable2[i].MemOp; + unsigned Flags = MemoryFoldTable2[i].Flags; AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, RegOp, MemOp, // Index 2, folded load Flags | TB_INDEX_2 | TB_FOLDED_LOAD); } - static const X86OpTblEntry OpTbl3[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, @@ -1511,6 +1878,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 }, { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 }, { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 }, + + // XOP foldable instructions + { X86::VPCMOVrr, X86::VPCMOVrm, 0 }, + { X86::VPCMOVrrY, X86::VPCMOVrmY, 0 }, + { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 }, + { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 }, + { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 }, + { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 }, + { X86::VPPERMrr, X86::VPPERMrm, 0 }, + // AVX-512 VPERMI instructions with 3 source operands. 
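The FMA4 and XOP opcodes show why there are several tables: VPCMOVrr, for example, has an entry in the index-2 table further up (folding source 2 gives VPCMOVmr) and another in the index-3 table just above (folding source 3 gives VPCMOVrm), and the caller picks the table that matches the operand it wants to fold. A minimal sketch of that lookup, reusing the simplified FoldMap alias from the earlier sketch (not the LLVM DenseMap types):

#include <cstdint>
#include <unordered_map>
#include <utility>

using FoldMap = std::unordered_map<unsigned, std::pair<unsigned, uint16_t>>;

// Fold operand OpNum of RegOpcode into memory, if a table entry exists.
// Returns the memory opcode, or 0 if the fold is not supported.
// (Simplified: no two-address table, no MOV32r0 special case.)
unsigned getFoldedOpcode(unsigned RegOpcode, unsigned OpNum,
                         const FoldMap Tables[5]) {
  if (OpNum >= 5)
    return 0;
  const FoldMap &Table = Tables[OpNum];   // OpNum 2 -> VPCMOVmr, OpNum 3 -> VPCMOVrm
  auto It = Table.find(RegOpcode);
  return It == Table.end() ? 0 : It->second.first;
}

The AVX-512 VPERMI entries announced just above follow right after this sketch.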
{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, @@ -1566,17 +1943,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { - unsigned RegOp = OpTbl3[i].RegOp; - unsigned MemOp = OpTbl3[i].MemOp; - unsigned Flags = OpTbl3[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) { + unsigned RegOp = MemoryFoldTable3[i].RegOp; + unsigned MemOp = MemoryFoldTable3[i].MemOp; + unsigned Flags = MemoryFoldTable3[i].Flags; AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, RegOp, MemOp, // Index 3, folded load Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } - static const X86OpTblEntry OpTbl4[] = { + static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { // AVX-512 foldable instructions { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, @@ -1618,10 +1995,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } }; - for (unsigned i = 0, e = array_lengthof(OpTbl4); i != e; ++i) { - unsigned RegOp = OpTbl4[i].RegOp; - unsigned MemOp = OpTbl4[i].MemOp; - unsigned Flags = OpTbl4[i].Flags; + for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) { + unsigned RegOp = MemoryFoldTable4[i].RegOp; + unsigned MemOp = MemoryFoldTable4[i].MemOp; + unsigned Flags = MemoryFoldTable4[i].Flags; AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, RegOp, MemOp, // Index 4, folded load @@ -1699,7 +2076,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { if (MI->getOpcode() == getCallFrameSetupOpcode() || MI->getOpcode() == getCallFrameDestroyOpcode()) { unsigned StackAlign = TFI->getStackAlignment(); - int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * + int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign; SPAdj -= MI->getOperand(1).getImm(); @@ -1709,8 +2086,8 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { else return -SPAdj; } - - // To know whether a call adjusts the stack, we need information + + // To know whether a call adjusts the stack, we need information // that is bound to the following ADJCALLSTACKUP pseudo. // Look for the next ADJCALLSTACKUP that follows the call. if (MI->isCall()) { @@ -1733,7 +2110,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { // Currently handle only PUSHes we can reasonably expect to see // in call sequences switch (MI->getOpcode()) { - default: + default: return 0; case X86::PUSH32i8: case X86::PUSH32r: @@ -1744,7 +2121,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { } } -/// isFrameOperand - Return true and the FrameIndex if the specified +/// Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const { @@ -1871,8 +2248,7 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, return 0; } -/// regIsPICBase - Return true if register is PIC base (i.e.g defined by -/// X86::MOVPC32r. +/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r. static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { // Don't waste compile time scanning use-def chains of physregs. 
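The getSPAdjust hunk above rounds the immediate carried by the call-frame setup/destroy pseudo up to the stack alignment with the usual add-then-truncate idiom; a small standalone restatement (values in the comment are only a worked example):

// Round Bytes up to the next multiple of Align, as getSPAdjust does for the
// call frame size: (20 + 16 - 1) / 16 * 16 == 32.
unsigned roundUpToStackAlign(unsigned Bytes, unsigned Align) {
  return (Bytes + Align - 1) / Align * Align;
}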
if (!TargetRegisterInfo::isVirtualRegister(BaseReg)) @@ -2068,8 +2444,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); } -/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that -/// is not marked dead. +/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. static bool hasLiveCondCodeDef(MachineInstr *MI) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); @@ -2081,8 +2456,7 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) { return false; } -/// getTruncatedShiftCount - check whether the shift count for a machine operand -/// is non-zero. +/// Check whether the shift count for a machine operand is non-zero. inline static unsigned getTruncatedShiftCount(MachineInstr *MI, unsigned ShiftAmtOperandIdx) { // The shift count is six bits with the REX.W prefix and five bits without. @@ -2091,7 +2465,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI, return Imm & ShiftCountMask; } -/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate +/// Check whether the given shift count is appropriate /// can be represented by a LEA instruction. inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address @@ -2173,10 +2547,9 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, return true; } -/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when -/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting -/// to a 32-bit superregister and then truncating back down to a 16-bit -/// subregister. +/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit +/// LEA to form 3-address code by promoting to a 32-bit superregister and then +/// truncating back down to a 16-bit subregister. MachineInstr * X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, @@ -2283,7 +2656,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, return ExtMI; } -/// convertToThreeAddress - This method must be implemented by targets that +/// This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target /// may be able to convert a two-address instruction into a true /// three-address instruction on demand. This allows the X86 target (for @@ -2558,8 +2931,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// commuteInstruction - We have a few instructions that must be hacked on to -/// commute them. +/// We have a few instructions that must be hacked on to commute them. /// MachineInstr * X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { @@ -2627,6 +2999,71 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI->getOperand(3).setImm(Mask ^ Imm); return TargetInstrInfo::commuteInstruction(MI, NewMI); } + case X86::PCLMULQDQrr: + case X86::VPCLMULQDQrr:{ + // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] + // SRC2 64bits = Imm[4] ? 
SRC2[127:64] : SRC2[63:0] + unsigned Imm = MI->getOperand(3).getImm(); + unsigned Src1Hi = Imm & 0x01; + unsigned Src2Hi = Imm & 0x10; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + return TargetInstrInfo::commuteInstruction(MI, NewMI); + default: + return nullptr; + } + } + case X86::VPCOMBri: case X86::VPCOMUBri: + case X86::VPCOMDri: case X86::VPCOMUDri: + case X86::VPCOMQri: case X86::VPCOMUQri: + case X86::VPCOMWri: case X86::VPCOMUWri: { + // Flip comparison mode immediate (if necessary). + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: Imm = 0x02; break; // LT -> GT + case 0x01: Imm = 0x03; break; // LE -> GE + case 0x02: Imm = 0x00; break; // GT -> LT + case 0x03: Imm = 0x01; break; // GE -> LE + case 0x04: // EQ + case 0x05: // NE + case 0x06: // FALSE + case 0x07: // TRUE + default: + break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: @@ -2711,20 +3148,26 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { - case X86::BLENDPDrri: - case X86::BLENDPSrri: - case X86::PBLENDWrri: - case X86::VBLENDPDrri: - case X86::VBLENDPSrri: - case X86::VBLENDPDYrri: - case X86::VBLENDPSYrri: - case X86::VPBLENDDrri: - case X86::VPBLENDDYrri: - case X86::VPBLENDWrri: - case X86::VPBLENDWYrri: - SrcOpIdx1 = 1; - SrcOpIdx2 = 2; - return true; + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + SrcOpIdx1 = 1; + SrcOpIdx2 = 2; + return true; + } + return false; + } case X86::VFMADDPDr231r: case X86::VFMADDPSr231r: case X86::VFMADDSDr231r: @@ -2779,7 +3222,7 @@ static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { } } -/// getCondFromSETOpc - return condition code of a SET opcode. +/// Return condition code of a SET opcode. static X86::CondCode getCondFromSETOpc(unsigned Opc) { switch (Opc) { default: return X86::COND_INVALID; @@ -2802,7 +3245,7 @@ static X86::CondCode getCondFromSETOpc(unsigned Opc) { } } -/// getCondFromCmovOpc - return condition code of a CMov opcode. 
+/// Return condition code of a CMov opcode. X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { switch (Opc) { default: return X86::COND_INVALID; @@ -2879,7 +3322,7 @@ unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { } } -/// GetOppositeBranchCondition - Return the inverse of the specified condition, +/// Return the inverse of the specified condition, /// e.g. turning COND_E to COND_NE. X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { switch (CC) { @@ -2903,9 +3346,8 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { } } -/// getSwappedCondition - assume the flags are set by MI(a,b), return -/// the condition code if we modify the instructions such that flags are -/// set by MI(b,a). +/// Assuming the flags are set by MI(a,b), return the condition code if we +/// modify the instructions such that flags are set by MI(b,a). static X86::CondCode getSwappedCondition(X86::CondCode CC) { switch (CC) { default: return X86::COND_INVALID; @@ -2922,7 +3364,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) { } } -/// getSETFromCond - Return a set opcode for the given condition and +/// Return a set opcode for the given condition and /// whether it has memory operand. unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { static const uint16_t Opc[16][2] = { @@ -2948,7 +3390,7 @@ unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { return Opc[CC][HasMemoryOperand ? 1 : 0]; } -/// getCMovFromCond - Return a cmov opcode for the given condition, +/// Return a cmov opcode for the given condition, /// register size in bytes, and operand type. unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, bool HasMemoryOperand) { @@ -3271,7 +3713,7 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); } -/// isHReg - Test if the given register is a physical h register. +/// Test if the given register is a physical h register. 
static bool isHReg(unsigned Reg) { return X86::GR8_ABCD_HRegClass.contains(Reg); } @@ -3543,11 +3985,9 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = (MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) @@ -3582,11 +4022,9 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = (MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); @@ -3682,7 +4120,7 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, return false; } -/// isRedundantFlagInstr - check whether the first instruction, whose only +/// Check whether the first instruction, whose only /// purpose is to update flags, can be made redundant. /// CMPrr can be made redundant by SUBrr if the operands are the same. /// This function can be extended later on. @@ -3725,7 +4163,7 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, return false; } -/// isDefConvertible - check whether the definition can be converted +/// Check whether the definition can be converted /// to remove a comparison against zero. inline static bool isDefConvertible(MachineInstr *MI) { switch (MI->getOpcode()) { @@ -3811,8 +4249,7 @@ inline static bool isDefConvertible(MachineInstr *MI) { } } -/// isUseDefConvertible - check whether the use can be converted -/// to remove a comparison against zero. +/// Check whether the use can be converted to remove a comparison against zero. static X86::CondCode isUseDefConvertible(MachineInstr *MI) { switch (MI->getOpcode()) { default: return X86::COND_INVALID; @@ -3831,7 +4268,7 @@ static X86::CondCode isUseDefConvertible(MachineInstr *MI) { } } -/// optimizeCompareInstr - Check if there exists an earlier instruction that +/// Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. bool X86InstrInfo:: @@ -4122,7 +4559,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, return true; } -/// optimizeLoadInstr - Try to remove the load by folding it to a register +/// Try to remove the load by folding it to a register /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the /// instructions in-between do not load or store, and have no side effects. 
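The commuteInstruction and findCommutedOpIndices changes a little further up all come down to rewriting an immediate so that the two sources can be swapped: PCLMULQDQ swaps its two half-select bits, the packed FP compares are only commuted for the four symmetric predicates, and the XOP VPCOM compares exchange LT with GT and LE with GE. The same transforms restated as small standalone helpers (function names are mine, not LLVM's):

#include <cstdint>

// PCLMULQDQ: imm bit 0 selects the high/low half of src1, bit 4 of src2;
// swapping the sources swaps those two bits.
uint8_t commutePclmulImm(uint8_t Imm) {
  return uint8_t(((Imm & 0x01) << 4) | ((Imm & 0x10) >> 4));
}

// CMPPS/CMPPD/VCMPP*: only EQ (0), UNORD (3), NEQ (4) and ORD (7) compare
// symmetrically, so only those predicates may be commuted.
bool isCommutableFCmpImm(uint8_t Imm) {
  switch (Imm & 0x7) {
  case 0x00: case 0x03: case 0x04: case 0x07: return true;
  default: return false;
  }
}

// XOP VPCOM*: LT <-> GT and LE <-> GE flip; EQ, NE, FALSE and TRUE stay put.
uint8_t commuteVPCOMImm(uint8_t Imm) {
  switch (Imm & 0x7) {
  case 0x00: return 0x02;  // LT -> GT
  case 0x01: return 0x03;  // LE -> GE
  case 0x02: return 0x00;  // GT -> LT
  case 0x03: return 0x01;  // GE -> LE
  default:   return Imm & 0x7;
  }
}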
@@ -4142,7 +4579,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); bool SawStore = false; - if (!DefMI->isSafeToMove(this, nullptr, SawStore)) + if (!DefMI->isSafeToMove(nullptr, SawStore)) return nullptr; // Collect information about virtual register operands of MI. @@ -4166,9 +4603,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; // Check whether we can fold the def into SrcOperandId. - SmallVector<unsigned, 8> Ops; - Ops.push_back(SrcOperandId); - MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); + MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI); if (FoldMI) { FoldAsLoadDefReg = 0; return FoldMI; @@ -4177,9 +4612,9 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; } -/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr -/// instruction with two undef reads of the register being defined. This is -/// used for mapping: +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two undef reads of the register being defined. +/// This is used for mapping: /// %xmm4 = V_SET0 /// to: /// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef> @@ -4263,7 +4698,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, - const SmallVectorImpl<MachineOperand> &MOs, + ArrayRef<MachineOperand> MOs, MachineInstr *MI, const TargetInstrInfo &TII) { // Create the base instruction with the memory operand as the first part. @@ -4290,9 +4725,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, return MIB; } -static MachineInstr *FuseInst(MachineFunction &MF, - unsigned Opcode, unsigned OpNo, - const SmallVectorImpl<MachineOperand> &MOs, +static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, + unsigned OpNo, ArrayRef<MachineOperand> MOs, MachineInstr *MI, const TargetInstrInfo &TII) { // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), @@ -4316,7 +4750,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, } static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, - const SmallVectorImpl<MachineOperand> &MOs, + ArrayRef<MachineOperand> MOs, MachineInstr *MI) { MachineFunction &MF = *MI->getParent()->getParent(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode)); @@ -4329,23 +4763,22 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, return MIB.addImm(0); } -MachineInstr* -X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr *MI, unsigned i, - const SmallVectorImpl<MachineOperand> &MOs, - unsigned Size, unsigned Align, - bool AllowCommute) const { +MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + unsigned Size, unsigned Align, + bool AllowCommute) const { const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; - // Atom favors register form of call. So, we do not fold loads into calls - // when X86Subtarget is Atom. + // For CPUs that favor the register form of a call, + // do not fold loads into calls. 
if (isCallRegIndirect && - (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) { + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) return nullptr; - } unsigned NumOps = MI->getDesc().getNumOperands(); bool isTwoAddr = NumOps > 1 && @@ -4361,13 +4794,13 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. - if (isTwoAddr && NumOps >= 2 && i < 2 && + if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI->getOperand(0).isReg() && MI->getOperand(1).isReg() && MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; - } else if (i == 0) { // If operand 0 + } else if (OpNum == 0) { if (MI->getOpcode() == X86::MOV32r0) { NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI); if (NewMI) @@ -4375,13 +4808,13 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, } OpcodeTablePtr = &RegOp2MemOpTable0; - } else if (i == 1) { + } else if (OpNum == 1) { OpcodeTablePtr = &RegOp2MemOpTable1; - } else if (i == 2) { + } else if (OpNum == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; - } else if (i == 3) { + } else if (OpNum == 3) { OpcodeTablePtr = &RegOp2MemOpTable3; - } else if (i == 4) { + } else if (OpNum == 4) { OpcodeTablePtr = &RegOp2MemOpTable4; } @@ -4397,7 +4830,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return nullptr; bool NarrowToMOV32rm = false; if (Size) { - unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize(); + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. @@ -4416,7 +4849,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, if (isTwoAddrFold) NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this); else - NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this); + NewMI = FuseInst(MF, Opcode, OpNum, MOs, MI, *this); if (NarrowToMOV32rm) { // If this is the special case where we use a MOV32rm to load a 32-bit @@ -4435,7 +4868,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // If the instruction and target operand are commutable, commute the // instruction and try again. if (AllowCommute) { - unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2; + unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; @@ -4493,11 +4926,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // No fusion if (PrintFailedFusing && !MI->isCopy()) - dbgs() << "We failed to fuse operand " << i << " in " << *MI; + dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI; return nullptr; } -/// hasPartialRegUpdate - Return true for all instructions that only update +/// Return true for all instructions that only update /// the first 32 or 64-bits of the destination register and leave the rest /// unmodified. 
This can be used to avoid folding loads if the instructions /// only update part of the destination register, and the non-updated part is @@ -4559,7 +4992,7 @@ static bool hasPartialRegUpdate(unsigned Opcode) { return false; } -/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle +/// Inform the ExeDepsFix pass how many idle /// instructions we would like before a partial register update. unsigned X86InstrInfo:: getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, @@ -4698,17 +5131,16 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, MI->addRegisterKilled(Reg, TRI, true); } -MachineInstr* -X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, - int FrameIndex) const { +MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + ArrayRef<unsigned> Ops, + int FrameIndex) const { // Check switch flag if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) return nullptr; @@ -4718,10 +5150,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) - Alignment = std::min(Alignment, MF.getTarget() - .getSubtargetImpl() - ->getFrameLowering() - ->getStackAlignment()); + Alignment = + std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -4742,10 +5172,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, } else if (Ops.size() != 1) return nullptr; - SmallVector<MachineOperand,4> MOs; - MOs.push_back(MachineOperand::CreateFI(FrameIndex)); - return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, - Size, Alignment, /*AllowCommute=*/true); + return foldMemoryOperandImpl(MF, MI, Ops[0], + MachineOperand::CreateFI(FrameIndex), Size, + Alignment, /*AllowCommute=*/true); } static bool isPartialRegisterLoad(const MachineInstr &LoadMI, @@ -4767,9 +5196,9 @@ static bool isPartialRegisterLoad(const MachineInstr &LoadMI, return false; } -MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, +MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops, + ArrayRef<unsigned> Ops, MachineInstr *LoadMI) const { // If loading from a FrameIndex, fold directly from the FrameIndex. unsigned NumOps = LoadMI->getDesc().getNumOperands(); @@ -4785,8 +5214,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && + if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) return nullptr; @@ -4893,8 +5321,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return nullptr; // Folding a normal load. Just copy the load's address operands. 
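The "address operands" mentioned in the comment above (and appended just below) are x86's standard five-operand memory reference; X86::AddrNumOperands is 5, and the order matches the MIOperandInfo used for memory operands elsewhere in this diff. A descriptive sketch of that layout (struct and field names are mine):

#include <cstdint>

// One x86 memory reference: base + scale*index + disp, plus a segment.
// Folded loads and stores carry these five operands in exactly this order,
// with a frame index standing in for the base when the access is a spill slot.
struct X86MemRef {
  unsigned BaseReg;    // or a frame index
  unsigned ScaleAmt;   // 1, 2, 4 or 8
  unsigned IndexReg;   // 0 when there is no index
  int64_t  Disp;       // signed displacement
  unsigned SegmentReg; // usually 0 (no segment override)
};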
- for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) - MOs.push_back(LoadMI->getOperand(i)); + MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands, + LoadMI->operands_begin() + NumOps); break; } } @@ -4902,9 +5330,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, /*Size=*/0, Alignment, /*AllowCommute=*/true); } - bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - const SmallVectorImpl<unsigned> &Ops) const { + ArrayRef<unsigned> Ops) const { // Check switch flag if (NoFusing) return 0; @@ -4941,7 +5368,7 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; - } else if (OpNum == 0) { // If operand 0 + } else if (OpNum == 0) { if (Opc == X86::MOV32r0) return true; @@ -5157,7 +5584,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, } if (Load) BeforeOps.push_back(SDValue(Load, 0)); - std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps)); + BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end()); SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps); NewNodes.push_back(NewNode); @@ -5184,7 +5611,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, NewNodes.push_back(Store); // Preserve memory reference information. - cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second); + cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second); } return true; @@ -5539,7 +5966,7 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass); } -/// getGlobalBaseReg - Return a virtual register initialized with the +/// Return a virtual register initialized with the /// the global base register value. Output instructions required to /// initialize the register in the function entry block, if necessary. /// @@ -5572,6 +5999,7 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, @@ -5587,6 +6015,7 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, + { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, @@ -5672,7 +6101,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { MI->setDesc(get(table[Domain-1])); } -/// getNoopForMachoTarget - Return the noop instruction to use for a noop. +/// Return the noop instruction to use for a noop. 
void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } @@ -5684,7 +6113,7 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { void X86InstrInfo::getUnconditionalBranch( MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { Branch.setOpcode(X86::JMP_1); - Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); + Branch.addOperand(MCOperand::createExpr(BranchTarget)); } // This code must remain in sync with getJumpInstrTableEntryBound in this class! @@ -5789,7 +6218,7 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, } namespace { - /// CGBR - Create Global Base Reg pass. This initializes the PIC + /// Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. struct CGBR : public MachineFunctionPass { static char ID; @@ -5798,10 +6227,11 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override { const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF.getTarget()); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); // Don't do anything if this is 64-bit as 64-bit PIC // uses RIP relative addressing. - if (TM->getSubtarget<X86Subtarget>().is64Bit()) + if (STI.is64Bit()) return false; // Only emit a global base reg in PIC mode. @@ -5820,10 +6250,10 @@ namespace { MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc DL = FirstMBB.findDebugLoc(MBBI); MachineRegisterInfo &RegInfo = MF.getRegInfo(); - const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const X86InstrInfo *TII = STI.getInstrInfo(); unsigned PC; - if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) + if (STI.isPICStyleGOT()) PC = RegInfo.createVirtualRegister(&X86::GR32RegClass); else PC = GlobalBaseReg; @@ -5834,7 +6264,7 @@ namespace { // If we're using vanilla 'GOT' PIC style, we should use relative addressing // not to pc, but to _GLOBAL_OFFSET_TABLE_ external. - if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) { + if (STI.isPICStyleGOT()) { // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg) .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_", @@ -5915,10 +6345,9 @@ namespace { MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I, unsigned TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); - const X86TargetMachine *TM = - static_cast<const X86TargetMachine *>(&MF->getTarget()); - const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit(); - const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); + const bool is64Bit = STI.is64Bit(); + const X86InstrInfo *TII = STI.getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to RAX/EAX. MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), @@ -5936,10 +6365,9 @@ namespace { // inserting a copy instruction after I. Returns the new instruction. MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); - const X86TargetMachine *TM = - static_cast<const X86TargetMachine *>(&MF->getTarget()); - const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit(); - const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); + const bool is64Bit = STI.is64Bit(); + const X86InstrInfo *TII = STI.getInstrInfo(); // Create a virtual register for the TLS base address. 
MachineRegisterInfo &RegInfo = MF->getRegInfo(); diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index 4d15467..0dd8101 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -305,23 +305,21 @@ public: /// folding and return true, otherwise it should return false. If it folds /// the instruction, it is likely that the MachineInstruction the iterator /// references has been changed. - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, int FrameIndex) const override; /// foldMemoryOperand - Same as the previous version except it allows folding /// of any load and store from / to any address, not just from a specific /// stack slot. - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, - const SmallVectorImpl<unsigned> &Ops, - MachineInstr* LoadMI) const override; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineInstr *LoadMI) const override; /// canFoldMemoryOperand - Returns true if the specified load / store is /// folding is possible. - bool canFoldMemoryOperand(const MachineInstr*, - const SmallVectorImpl<unsigned> &) const override; + bool canFoldMemoryOperand(const MachineInstr *, + ArrayRef<unsigned>) const override; /// unfoldMemoryOperand - Separate a single instruction which folded a load or /// a store or a load and a store into two or more instruction. If this is @@ -406,10 +404,9 @@ public: void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; - MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, - MachineInstr* MI, + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned OpNum, - const SmallVectorImpl<MachineOperand> &MOs, + ArrayRef<MachineOperand> MOs, unsigned Size, unsigned Alignment, bool AllowCommute) const; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 53715dc..70c2027520 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -383,25 +383,19 @@ def brtarget8 : Operand<OtherVT>; } -// Special parsers to detect mode to disambiguate. +// Special parser to detect 16-bit mode to select 16-bit displacement. def X86AbsMem16AsmOperand : AsmOperandClass { let Name = "AbsMem16"; let RenderMethod = "addAbsMemOperands"; let SuperClasses = [X86AbsMemAsmOperand]; } -def X86AbsMem32AsmOperand : AsmOperandClass { - let Name = "AbsMem32"; - let RenderMethod = "addAbsMemOperands"; - let SuperClasses = [X86AbsMemAsmOperand]; -} - // Branch targets have OtherVT type and print as pc-relative values. 
let OperandType = "OPERAND_PCREL", PrintMethod = "printPCRelImm" in { let ParserMatchClass = X86AbsMem16AsmOperand in def brtarget16 : Operand<OtherVT>; -let ParserMatchClass = X86AbsMem32AsmOperand in +let ParserMatchClass = X86AbsMemAsmOperand in def brtarget32 : Operand<OtherVT>; } @@ -539,7 +533,7 @@ def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64", X86MemOffs64_64AsmOperand>; def SSECC : Operand<i8> { - let PrintMethod = "printSSECC"; + let PrintMethod = "printSSEAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } @@ -548,17 +542,23 @@ def i8immZExt3 : ImmLeaf<i8, [{ }]>; def AVXCC : Operand<i8> { - let PrintMethod = "printAVXCC"; + let PrintMethod = "printSSEAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } def i8immZExt5 : ImmLeaf<i8, [{ return Imm >= 0 && Imm < 32; }]>; -// AVX-512 uses a 32-bit immediate in their intrinsics -def i32immZExt5 : ImmLeaf<i32, [{ - return Imm >= 0 && Imm < 32; -}]>; + +def AVX512ICC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def XOPCC : Operand<i8> { + let PrintMethod = "printXOPCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} class ImmSExtAsmOperandClass : AsmOperandClass { let SuperClasses = [ImmAsmOperand]; @@ -572,10 +572,13 @@ def X86GR32orGR64AsmOperand : AsmOperandClass { def GR32orGR64 : RegisterOperand<GR32> { let ParserMatchClass = X86GR32orGR64AsmOperand; } - +def AVX512RCOperand : AsmOperandClass { + let Name = "AVX512RC"; +} def AVX512RC : Operand<i32> { let PrintMethod = "printRoundingControl"; let OperandType = "OPERAND_IMMEDIATE"; + let ParserMatchClass = AVX512RCOperand; } // Sign-extended immediate classes. We don't need to define the full lattice @@ -613,6 +616,14 @@ def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { ImmSExti64i32AsmOperand]; } +// Unsigned immediate used by SSE/AVX instructions +// [0, 0xFF] +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmUnsignedi8AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi8"; + let RenderMethod = "addImmOperands"; +} + // A couple of more descriptive operand definitions. // 16-bits but only 8 bits are significant. def i16i8imm : Operand<i16> { @@ -631,6 +642,27 @@ def i64i32imm : Operand<i64> { let OperandType = "OPERAND_IMMEDIATE"; } +// 64-bits but only 8 bits are significant. +def i64i8imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// Unsigned 8-bit immediate used by SSE/AVX instructions. +def u8imm : Operand<i8> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 32-bit immediate but only 8-bits are significant and they are unsigned. +// Used by some SSE/AVX instructions that use intrinsics. +def i32u8imm : Operand<i32> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + // 64-bits but only 32 bits are significant, and those bits are treated as being // pc relative. def i64i32imm_pcrel : Operand<i64> { @@ -639,12 +671,6 @@ def i64i32imm_pcrel : Operand<i64> { let OperandType = "OPERAND_PCREL"; } -// 64-bits but only 8 bits are significant. 
-def i64i8imm : Operand<i64> { - let ParserMatchClass = ImmSExti64i8AsmOperand; - let OperandType = "OPERAND_IMMEDIATE"; -} - def lea64_32mem : Operand<i32> { let PrintMethod = "printanymem"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); @@ -690,6 +716,8 @@ def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; +def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>; + //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. def HasCMov : Predicate<"Subtarget->hasCMov()">; @@ -720,14 +748,19 @@ def HasAVX512 : Predicate<"Subtarget->hasAVX512()">, def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; -def HasCDI : Predicate<"Subtarget->hasCDI()">; -def HasPFI : Predicate<"Subtarget->hasPFI()">; -def HasERI : Predicate<"Subtarget->hasERI()">; -def HasDQI : Predicate<"Subtarget->hasDQI()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">, + AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; +def HasPFI : Predicate<"Subtarget->hasPFI()">, + AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; +def HasERI : Predicate<"Subtarget->hasERI()">, + AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">; +def HasDQI : Predicate<"Subtarget->hasDQI()">, + AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">; def NoDQI : Predicate<"!Subtarget->hasDQI()">; -def HasBWI : Predicate<"Subtarget->hasBWI()">; +def HasBWI : Predicate<"Subtarget->hasBWI()">, + AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; def HasVLX : Predicate<"Subtarget->hasVLX()">, - AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">; + AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; @@ -750,10 +783,8 @@ def HasHLE : Predicate<"Subtarget->hasHLE()">; def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; -def HasSGX : Predicate<"Subtarget->hasSGX()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; -def HasSMAP : Predicate<"Subtarget->hasSMAP()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; @@ -771,6 +802,9 @@ def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, def In32BitMode : Predicate<"Subtarget->is32Bit()">, AssemblerPredicate<"Mode32Bit", "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; +def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; +def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; @@ -823,11 +857,11 @@ def X86_COND_E_OR_NE : ImmLeaf<i8, [{ return (Imm == X86::COND_E) || (Imm == X86::COND_NE); }]>; -let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. 
- def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; - def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; - def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; -} + +def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; +def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; +def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; + def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; @@ -2191,11 +2225,11 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in { def CountTrailingOnes : SDNodeXForm<imm, [{ // Count the trailing ones in the immediate. - return getI8Imm(CountTrailingOnes_64(N->getZExtValue())); + return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N)); }]>; def BZHIMask : ImmLeaf<i64, [{ - return isMask_64(Imm) && (CountTrailingOnes_64(Imm) > 32); + return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32); }]>; let Predicates = [HasBMI2] in { @@ -2385,6 +2419,16 @@ let Predicates = [HasTBM] in { } // HasTBM //===----------------------------------------------------------------------===// +// Memory Instructions +// + +def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflushopt\t$src", []>, PD; +def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD; +def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD; + + +//===----------------------------------------------------------------------===// // Subsystems. //===----------------------------------------------------------------------===// @@ -2537,6 +2581,12 @@ def : MnemonicAlias<"fnstsww", "fnstsw", "att">; def : MnemonicAlias<"fucomip", "fucompi", "att">; def : MnemonicAlias<"fwait", "wait">; +def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; +def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; +def : MnemonicAlias<"xsaveq", "xsave64", "att">; +def : MnemonicAlias<"xrstorq", "xrstor64", "att">; +def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; + class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, string VariantName> diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index 5b36427..eaa7894 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -125,9 +125,9 @@ let Constraints = "$src1 = $dst" in { (bitconvert (load_mmx addr:$src2))))], itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), - (ins VR64:$src1, i32i8imm:$src2), + (ins VR64:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>, + [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>, Sched<[WriteVecShift]>; } } @@ -170,12 +170,12 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, /// PALIGN MMX instructions (require SSSE3). 
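For reference, the ImmLeaf predicates above reduce to small value checks: the iNNimmSExt8 leaves accept immediates that survive a round trip through int8_t (so the sign-extended imm8 encodings apply), and BZHIMask accepts 2^N-1 masks with more than 32 trailing ones, i.e. masks that need the 64-bit BZHI form. A plain C++ sketch of that intent (helper names invented, not the TableGen code):

#include <cstdint>

static bool isSExt8Immediate(int64_t Imm) {
  return Imm == static_cast<int8_t>(Imm);   // same test as the ImmLeaf bodies
}

static bool isMask64(uint64_t V) {          // 0b0...01...1 and nonzero
  return V != 0 && ((V + 1) & V) == 0;
}

static unsigned countTrailingOnes64(uint64_t V) {
  unsigned N = 0;
  while (V & 1) { ++N; V >>= 1; }
  return N;
}

static bool isBZHI64Mask(uint64_t Imm) {
  // More than 32 low ones cannot be expressed by the 32-bit BZHI pattern.
  return isMask64(Imm) && countTrailingOnes64(Imm) > 32;
}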
multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, VR64:$src2, i8imm:$src3), + (ins VR64:$src1, VR64:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, Sched<[WriteShuffle]>; def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i64mem:$src2, i8imm:$src3), + (ins VR64:$src1, i64mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, @@ -223,20 +223,26 @@ def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), [(set VR64:$dst, (x86mmx (scalar_to_vector GR32:$src)))], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; -let canFoldAsLoad = 1 in def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (scalar_to_vector (loadi32 addr:$src))))], IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; + +let Predicates = [HasMMX] in { + let AddedComplexity = 15 in + def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)), + (MMX_MOVD64rr GR32:$src)>; + let AddedComplexity = 20 in + def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))), + (MMX_MOVD64rm addr:$src)>; +} + let mayStore = 1 in def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>, Sched<[WriteStore]>; -// Low word of MMX to GPR. -def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, - [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, @@ -248,6 +254,11 @@ def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), [(set VR64:$dst, (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), + (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>; + // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as // movd. 
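The GPR<->MMX moves in this hunk are what the classic conversion intrinsics typically lower to. A hedged usage sketch assuming GCC/Clang on x86-64 with -mmmx and mmintrin.h; the helper names are invented, and the instruction mapping in the comments is the usual selection rather than a guarantee:

#include <mmintrin.h>
#include <cstdint>

__m64 widen32(int32_t X) {
  return _mm_cvtsi32_si64(X);   // usually a movd r32 -> mm
}

int32_t narrow32(__m64 V) {
  return _mm_cvtsi64_si32(V);   // usually a movd mm -> r32
}

__m64 widen64(int64_t X) {      // x86-64 only
  return _mm_cvtsi64_m64(X);    // usually the 64-bit movd/movq r64 -> mm
}

// Note: real MMX code must issue _mm_empty() before using x87 floating point.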
@@ -268,6 +279,12 @@ def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), } } // SchedRW +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem, + (outs i64mem:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>; + let SchedRW = [WriteLoad] in { let canFoldAsLoad = 1 in def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), @@ -453,6 +470,13 @@ defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, MMX_SHIFT_ITINS>; +def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLDrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLQrm VR64:$src1, addr:$src2)>; + defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_mmx_psll_w, int_x86_mmx_pslli_w, MMX_SHIFT_ITINS>; @@ -463,6 +487,13 @@ defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_mmx_psll_q, int_x86_mmx_pslli_q, MMX_SHIFT_ITINS>; +def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLDrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLQrm VR64:$src1, addr:$src2)>; + defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_mmx_psra_w, int_x86_mmx_psrai_w, MMX_SHIFT_ITINS>; @@ -470,6 +501,11 @@ defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_mmx_psra_d, int_x86_mmx_psrai_d, MMX_SHIFT_ITINS>; +def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRAWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRADrm VR64:$src1, addr:$src2)>; + // Comparison Instructions defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, MMX_INTALU_ITINS>; @@ -518,13 +554,13 @@ defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, MMX_PSHUF_ITINS>; def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, - (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2), + (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))], IIC_MMX_PSHUF>, Sched<[WriteShuffle]>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, - (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2), + (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), @@ -559,27 +595,27 @@ let Constraints = "$src1 = $dst" in { // Extract / Insert def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, - (outs GR32orGR64:$dst), (ins VR64:$src1, i32i8imm:$src2), + (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, - (iPTR imm:$src2)))], + imm:$src2))], IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = "$src1 = $dst" in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), - (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3), + (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, 
$src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, - GR32orGR64:$src2, (iPTR imm:$src3)))], + GR32orGR64:$src2, imm:$src3))], IIC_MMX_PINSRW>, Sched<[WriteShuffle]>; def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, (outs VR64:$dst), - (ins VR64:$src1, i16mem:$src2, i32i8imm:$src3), + (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), - (iPTR imm:$src3)))], + imm:$src3))], IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td index 47c5dc5..84119ad 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSGX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td @@ -17,8 +17,8 @@ // ENCLS - Execute an Enclave System Function of Specified Leaf Number def ENCLS : I<0x01, MRM_CF, (outs), (ins), - "encls", []>, TB, Requires<[HasSGX]>; + "encls", []>, TB; // ENCLU - Execute an Enclave User Function of Specified Leaf Number def ENCLU : I<0x01, MRM_D7, (outs), (ins), - "enclu", []>, TB, Requires<[HasSGX]>; + "enclu", []>, TB; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index fca0c44..d3b401e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -241,21 +241,20 @@ def SSE_INTALU_ITINS_BLEND_P : OpndItins< /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, - OpndItins itins, - bit Is2Addr = 1> { + Domain d, OpndItins itins, bit Is2Addr = 1> { let isCommutable = 1 in { def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>, + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>, Sched<[itins.Sched]>; } def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>, + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -263,24 +262,23 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, string asm, string SSEVer, string FPSizeStr, Operand memopr, ComplexPattern mem_cpat, - OpndItins itins, - bit Is2Addr = 1> { + Domain d, OpndItins itins, bit Is2Addr = 1> { let isCodeGenOnly = 1 in { - def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (!cast<Intrinsic>( !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, RC:$src2))], itins.rr>, + RC:$src1, RC:$src2))], itins.rr, d>, Sched<[itins.Sched]>; - def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), + def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, 
$src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) - RC:$src1, mem_cpat:$src2))], itins.rm>, + RC:$src1, mem_cpat:$src2))], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -548,13 +546,13 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, string base_opc, - string asm_opr> { + string asm_opr, Domain d = GenericDomain> { def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), !strconcat(base_opc, asm_opr), [(set VR128:$dst, (vt (OpNode VR128:$src1, (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>; + IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>; // For the disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in @@ -565,49 +563,55 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, } multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, - X86MemOperand x86memop, string OpcodeStr> { + X86MemOperand x86memop, string OpcodeStr, + Domain d = GenericDomain> { // AVX defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>, VEX_4V, VEX_LIG; def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, VEX, VEX_LIG, Sched<[WriteStore]>; // SSE1 & 2 let Constraints = "$src1 = $dst" in { defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, - "\t{$src2, $dst|$dst, $src2}">; + "\t{$src2, $dst|$dst, $src2}", d>; } def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, Sched<[WriteStore]>; } // Loading from memory automatically zeroing upper bits. 
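To make the "zeroing upper bits" comment concrete: a movss/movsd load writes lane 0 and clears the remaining lanes, while the register-register form merges lane 0 into the destination. Sketch in terms of the standard intrinsics (xmmintrin.h/emmintrin.h, SSE/SSE2); the helper names are invented and this is background, not code from the change:

#include <xmmintrin.h>
#include <emmintrin.h>

__m128 load_scalar(const float *P) {
  return _mm_load_ss(P);          // movss mem -> xmm: upper three lanes are 0
}

__m128 merge_scalar(__m128 A, __m128 B) {
  return _mm_move_ss(A, B);       // movss reg -> reg: lanes 1..3 come from A
}

__m128d load_scalar_d(const double *P) {
  return _mm_load_sd(P);          // movsd mem -> xmm: upper lane is 0
}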
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_pat, string OpcodeStr> { + PatFrag mem_pat, string OpcodeStr, + Domain d = GenericDomain> { def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], - IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>; + IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>; def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], - IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>; + IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>; } -defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS; -defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD; +defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", + SSEPackedSingle>, XS; +defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", + SSEPackedDouble>, XD; let canFoldAsLoad = 1, isReMaterializable = 1 in { - defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS; + defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss", + SSEPackedSingle>, XS; let AddedComplexity = 20 in - defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD; + defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd", + SSEPackedDouble>, XD; } // Patterns @@ -637,9 +641,6 @@ let Predicates = [UseAVX] in { // Represent the same patterns above but in the form they appear for // 256-bit types - def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, - (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; @@ -647,9 +648,6 @@ let Predicates = [UseAVX] in { (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; } - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), @@ -787,7 +785,7 @@ let Predicates = [UseSSE2] in { (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem - // is during lowering, where it's not possible to recognize the fold cause + // is during lowering, where it's not possible to recognize the fold because // it has two uses through a bitcast. One use disappears at isel time and the // fold opportunity reappears. def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), @@ -1333,7 +1331,7 @@ let Predicates = [HasAVX] in { (VMOVHPSrm VR128:$src1, addr:$src2)>; // VMOVHPD patterns - + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. 
One use disappears at isel time @@ -1853,14 +1851,14 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>, + IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))], - IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>, + IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { @@ -1938,14 +1936,14 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>, + IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], - IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>, + IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, @@ -2348,11 +2346,11 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [], + (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [], IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; let mayLoad = 1 in def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [], + (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [], IIC_SSE_ALU_F32S_RM>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -2487,7 +2485,8 @@ let Defs = [EFLAGS] in { multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Operand CC, Intrinsic Int, string asm, string asm_alt, Domain d, ImmLeaf immLeaf, - OpndItins itins = SSE_ALU_F32P> { + PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> { + let isCommutable = 1 in def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))], @@ -2495,17 +2494,18 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Sched<[WriteFAdd]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), immLeaf:$cc))], + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))], itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. 
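Background on the two syntaxes handled here: cmpps/cmppd exist both as condition-specific mnemonics (e.g. cmpltps) and as a generic form that takes an explicit imm8 comparison code, which is what the asm-parser-only *_alt forms that follow accept. Intrinsics-level sketch (xmmintrin.h for SSE; _mm_cmp_ps requires AVX); helper names invented:

#include <immintrin.h>

__m128 less_than_sse(__m128 A, __m128 B) {
  return _mm_cmplt_ps(A, B);               // cmpltps == cmpps with imm 1
}

#ifdef __AVX__
__m128 less_than_avx(__m128 A, __m128 B) {
  return _mm_cmp_ps(A, B, _CMP_LT_OS);     // vcmpps with an explicit imm8
}
#endif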
let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : PIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>; + let mayLoad = 1 in def rmi_alt : PIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [], itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; } @@ -2514,61 +2514,61 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle, i8immZExt5>, PS, VEX_4V; + SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V; defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble, i8immZExt5>, PD, VEX_4V; + SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V; defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle, i8immZExt5>, PS, VEX_4V, VEX_L; + SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble, i8immZExt5>, PD, VEX_4V, VEX_L; + SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedSingle, i8immZExt5, SSE_ALU_F32P>, PS; + SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS; defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedDouble, i8immZExt5, SSE_ALU_F64P>, PD; + SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD; } let Predicates = [HasAVX] in { def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)), (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)), (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; -def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)), (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; -def : 
Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)), (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; } let Predicates = [UseSSE1] in { def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)), (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; } let Predicates = [UseSSE2] in { def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)), (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; } @@ -2581,12 +2581,12 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, Domain d> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, + (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffleLd, ReadAfterLd]>; def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + (ins RC:$src1, RC:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffle]>; @@ -2742,24 +2742,6 @@ let Predicates = [HasAVX1Only] in { (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; } -let Predicates = [HasAVX] in { - // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the - // problem is during lowering, where it's not possible to recognize the load - // fold cause it has two uses through a bitcast. One use disappears at isel - // time and the fold opportunity reappears. - def : Pat<(v2f64 (X86Movddup VR128:$src)), - (VUNPCKLPDrr VR128:$src, VR128:$src)>; -} - -let Predicates = [UseSSE2] in { - // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the - // problem is during lowering, where it's not possible to recognize the load - // fold cause it has two uses through a bitcast. One use disappears at isel - // time and the fold opportunity reappears. - def : Pat<(v2f64 (X86Movddup VR128:$src)), - (UNPCKLPDrr VR128:$src, VR128:$src)>; -} - //===----------------------------------------------------------------------===// // SSE 1 & 2 - Extract Floating-Point Sign mask //===----------------------------------------------------------------------===// @@ -2880,40 +2862,73 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, // SSE 1 & 2 - Logical Instructions //===----------------------------------------------------------------------===// -/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops -/// -multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, - SDNode OpNode, OpndItins itins> { +// Multiclass for scalars using the X86 logical operation aliases for FP. 
+multiclass sse12_fp_packed_scalar_logical_alias< + bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>, + PS, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>, + PD, VEX_4V; + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, + f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, + f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD; + } +} + +let isCodeGenOnly = 1 in { + defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; + defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for, + SSE_BIT_ITINS_P>; + defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; + + let isCommutable = 0 in + defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn, + SSE_BIT_ITINS_P>; +} + +// Multiclass for vectors using the X86 logical operation aliases for FP. +multiclass sse12_fp_packed_vector_logical_alias< + bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { + let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>, + VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, PS, VEX_4V; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>, + VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, PD, VEX_4V; + } let Constraints = "$src1 = $dst" in { - defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, - f32, f128mem, memopfsf32, SSEPackedSingle, itins>, + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>, PS; - defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, - f64, f128mem, memopfsf64, SSEPackedDouble, itins>, + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>, PD; } } -// Alias bitwise logical operations using SSE logical ops on packed FP values. 
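The reason the backend wants FP-typed AND/OR/XOR nodes at all: fabs, fneg and copysign are typically lowered to sign-bit arithmetic, and these Fs*/Fv* aliases let that arithmetic be selected as andps/andnps/orps/xorps in the floating-point execution domain. A plain C++ sketch of the underlying bit manipulation (not the compiler's code; helper names invented):

#include <cstdint>
#include <cstring>

static float fneg_bits(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;                 // what an xorps with the sign-bit mask does
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}

static float fabs_bits(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7FFFFFFFu;                 // what an andps with the clear-sign mask does
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;
}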
let isCodeGenOnly = 1 in { - defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand, + defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand, SSE_BIT_ITINS_P>; - defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for, + defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for, SSE_BIT_ITINS_P>; - defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor, + defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor, SSE_BIT_ITINS_P>; let isCommutable = 0 in - defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn, + defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn, SSE_BIT_ITINS_P>; } @@ -3037,15 +3052,19 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), - OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG; + OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>, + XS, VEX_4V, VEX_LIG; defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), - OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG; + OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>, + XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), - OpNode, FR32, f32mem, itins.s>, XS; + OpNode, FR32, f32mem, SSEPackedSingle, + itins.s>, XS; defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), - OpNode, FR64, f64mem, itins.d>, XD; + OpNode, FR64, f64mem, SSEPackedDouble, + itins.d>, XD; } } @@ -3053,18 +3072,18 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, SizeItins itins> { defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, - itins.s, 0>, XS, VEX_4V, VEX_LIG; + SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG; defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, - itins.d, 0>, XD, VEX_4V, VEX_LIG; + SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, - itins.s>, XS; + SSEPackedSingle, itins.s>, XS; defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, - itins.d>, XD; + SSEPackedDouble, itins.d>, XD; } } @@ -3098,10 +3117,9 @@ let isCodeGenOnly = 1 in { } // Patterns used to select SSE scalar fp arithmetic instructions from -// a scalar fp operation followed by a blend. +// either: // -// These patterns know, for example, how to select an ADDSS from a -// float add plus vector insert. 
+// (1) a scalar fp operation followed by a blend // // The effect is that the backend no longer emits unnecessary vector // insert instructions immediately after SSE scalar fp instructions @@ -3113,218 +3131,14 @@ let isCodeGenOnly = 1 in { // return A; // } // -// previously we generated: +// Previously we generated: // addss %xmm0, %xmm1 // movss %xmm1, %xmm0 // -// we now generate: +// We now generate: // addss %xmm1, %xmm0 - -let Predicates = [UseSSE1] in { - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; -} - -let Predicates = [UseSSE2] in { - // SSE2 patterns to select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; -} - -let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is - // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When - // selecting SSE scalar single-precision fp arithmetic instructions, make - // sure that we correctly match them. 
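Both shapes described in the comment above express "operate on lane 0, keep the remaining lanes of A", and the point of these patterns is that either shape selects to a single addss/subss/mulss/divss instead of a packed or scalar op followed by a movss/blend. Intrinsics-level sketch of the two shapes (xmmintrin.h, SSE only; helper names invented, not code from the change):

#include <xmmintrin.h>

// Shape (1): scalar op on lane 0, result re-inserted into the vector.
__m128 add_lane0_scalar(__m128 A, __m128 B) {
  float A0 = _mm_cvtss_f32(A);
  float B0 = _mm_cvtss_f32(B);
  return _mm_move_ss(A, _mm_set_ss(A0 + B0));
}

// Shape (2): packed op, lane 0 of the result merged back into A.
__m128 add_lane0_packed(__m128 A, __m128 B) {
  return _mm_move_ss(A, _mm_add_ps(A, B));
}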
- - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 
0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; -} - -let Predicates = [HasAVX] in { - // The following patterns select AVX Scalar single/double precision fp - // arithmetic instructions. - - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (iPTR 0))), - (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv - (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract 
(v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv - (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (v2f64 VR128:$dst), (i8 2))), - (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; -} - -// Patterns used to select SSE scalar fp arithmetic instructions from -// a vector packed single/double fp operation followed by a vector insert. +// +// (2) a vector packed single/double fp operation followed by a vector insert // // The effect is that the backend converts the packed fp instruction // followed by a vector insert into a single SSE scalar fp instruction. @@ -3335,160 +3149,137 @@ let Predicates = [HasAVX] in { // return (__m128) {c[0], a[1], a[2], a[3]}; // } // -// previously we generated: +// Previously we generated: // addps %xmm0, %xmm1 // movss %xmm1, %xmm0 // -// we now generate: +// We now generate: // addss %xmm1, %xmm0 -let Predicates = [UseSSE1] in { - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (MULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; -} +// TODO: Some canonicalization in lowering would simplify the number of +// patterns we have to try to match. +multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [UseSSE1] in { + // extracted scalar math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + } -let Predicates = [UseSSE2] in { - // SSE2 patterns to select scalar double-precision fp arithmetic instructions - // from a packed double-precision fp instruction plus movsd. 
- - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (MULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; -} + // With SSE 4.1, blendi is preferred to movsd, so match that too. + let Predicates = [UseSSE41] in { + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; -let Predicates = [UseSSE41] in { - // With SSE4.1 we may see these operations using X86Blendi rather than - // X86Movs{s,d}. - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (MULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (MULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; - - def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (MULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>; + + } + + // Repeat everything for AVX, except for the movss + scalar combo... + // because that one shouldn't occur with AVX codegen? 
+ let Predicates = [HasAVX] in { + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + } } -let Predicates = [HasAVX] in { - // The following patterns select AVX Scalar single/double precision fp - // arithmetic instructions from a packed single precision fp instruction - // plus movss/movsd. - - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; - - // Also handle X86Blendi-based patterns. 
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; - - def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; - def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), - (v2f64 VR128:$dst), (i8 2))), - (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; +defm : scalar_math_f32_patterns<fadd, "ADD">; +defm : scalar_math_f32_patterns<fsub, "SUB">; +defm : scalar_math_f32_patterns<fmul, "MUL">; +defm : scalar_math_f32_patterns<fdiv, "DIV">; + +multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [UseSSE2] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } + + // With SSE 4.1, blendi is preferred to movsd, so match those too. + let Predicates = [UseSSE41] in { + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } + + // Repeat everything for AVX. 
+ let Predicates = [HasAVX] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } } +defm : scalar_math_f64_patterns<fadd, "ADD">; +defm : scalar_math_f64_patterns<fsub, "SUB">; +defm : scalar_math_f64_patterns<fmul, "MUL">; +defm : scalar_math_f64_patterns<fdiv, "DIV">; + + /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to /// represent the associated intrinsic operation. This form is unlike the @@ -3535,56 +3326,104 @@ def SSE_RCPS : OpndItins< >; } -/// sse1_fp_unop_s - SSE1 unops in scalar form +/// sse_fp_unop_s - SSE1 unops in scalar form /// For the non-AVX defs, we need $src1 to be tied to $dst because /// the HW instructions are 2 operand / destructive. -multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { -let Predicates = [HasAVX], hasSideEffects = 0 in { - def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), - (ins FR32:$src1, FR32:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; - let mayLoad = 1 in { - def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1,f32mem:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - let isCodeGenOnly = 1 in - def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; +multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType vt, ValueType ScalarVT, + X86MemOperand x86memop, Operand vec_memop, + ComplexPattern mem_cpat, Intrinsic Intr, + SDNode OpNode, Domain d, OpndItins itins, + Predicate target, string Suffix> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>, + Requires<[target]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>, + Requires<[target, OptForSize]>; + + let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, 
"\t{$src2, $dst|$dst, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } + } + + let Predicates = [target] in { + def : Pat<(vt (OpNode mem_cpat:$src)), + (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>; + // These are unary operations, but they are modeled as having 2 source operands + // because the high elements of the destination are unchanged in SSE. + def : Pat<(Intr VR128:$src), + (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; + def : Pat<(Intr (load addr:$src)), + (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) + addr:$src), VR128))>; + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; } } - def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>; - // For scalar unary operations, fold a load into the operation - // only in OptForSize mode. It eliminates an instruction, but it also - // eliminates a whole-register clobber (the load), so it introduces a - // partial register update condition. - def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, - Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; - let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { - def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [], itins.rr>, Sched<[itins.Sched]>; - let mayLoad = 1, hasSideEffects = 0 in - def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"), - [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; +multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType vt, ValueType ScalarVT, + X86MemOperand x86memop, Operand vec_memop, + ComplexPattern mem_cpat, + Intrinsic Intr, SDNode OpNode, Domain d, + OpndItins itins, string Suffix> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins.rr, d>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1 in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[itins.Sched.Folded]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, vec_memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } + } + + let Predicates = [UseAVX] in { + def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) + (ScalarVT (IMPLICIT_DEF)), RC:$src)>; + + def : Pat<(vt (OpNode mem_cpat:$src)), + 
(!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), + mem_cpat:$src)>; + + } + let Predicates = [HasAVX] in { + def : Pat<(Intr VR128:$src), + (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)), + VR128:$src)>; + + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } + let Predicates = [UseAVX, OptForSize] in + def : Pat<(ScalarVT (OpNode (load addr:$src))), + (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), + addr:$src)>; } /// sse1_fp_unop_p - SSE1 unops in packed form. @@ -3623,93 +3462,6 @@ let Predicates = [HasAVX] in { Sched<[itins.Sched.Folded]>; } -/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms. -multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, - Intrinsic V4F32Int, Intrinsic V8F32Int, - OpndItins itins> { -let isCodeGenOnly = 1 in { -let Predicates = [HasAVX] in { - def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat("v", OpcodeStr, - "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int VR128:$src))], - itins.rr>, VEX, Sched<[itins.Sched]>; - def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - !strconcat("v", OpcodeStr, - "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))], - itins.rm>, VEX, Sched<[itins.Sched.Folded]>; - def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat("v", OpcodeStr, - "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V8F32Int VR256:$src))], - itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; - def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), - (ins f256mem:$src), - !strconcat("v", OpcodeStr, - "ps\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))], - itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; -} - - def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int VR128:$src))], - itins.rr>, Sched<[itins.Sched]>; - def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))], - itins.rm>, Sched<[itins.Sched.Folded]>; -} // isCodeGenOnly = 1 -} - -/// sse2_fp_unop_s - SSE2 unops in scalar form. 
-multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F64Int, OpndItins itins> { -let Predicates = [HasAVX], hasSideEffects = 0 in { - def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), - (ins FR64:$src1, FR64:$src2), - !strconcat("v", OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; - let mayLoad = 1 in { - def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), - (ins FR64:$src1,f64mem:$src2), - !strconcat("v", OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - let isCodeGenOnly = 1 in - def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, sdmem:$src2), - !strconcat("v", OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - } -} - - def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>, - Sched<[itins.Sched]>; - // See the comments in sse1_fp_unop_s for why this is OptForSize. - def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD, - Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>; -let isCodeGenOnly = 1 in { - def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>, - Sched<[itins.Sched]>; - def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src), - !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>, - Sched<[itins.Sched.Folded]>; -} -} - /// sse2_fp_unop_p - SSE2 unops in vector forms. multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { @@ -3746,92 +3498,83 @@ let Predicates = [HasAVX] in { Sched<[itins.Sched.Folded]>; } +multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, + ssmem, sse_load_f32, + !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, + SSEPackedSingle, itins, UseSSE1, "SS">, XS; + defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, + f32mem, ssmem, sse_load_f32, + !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, + SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG; +} + +multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, + sdmem, sse_load_f64, + !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), + OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; + defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, + f64mem, sdmem, sse_load_f64, + !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), + OpNode, SSEPackedDouble, itins, "SD">, + XD, VEX_4V, VEX_LIG; +} + // Square root. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, - SSE_SQRTSD>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. 
Note that these typically require refinement // in order to obtain suitable precision. defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>, - sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>; + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>; defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, - sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, - int_x86_avx_rcp_ps_256, SSE_RCPP>; + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>; -let Predicates = [UseAVX] in { - def : Pat<(f32 (fsqrt FR32:$src)), - (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; - def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - def : Pat<(f64 (fsqrt FR64:$src)), - (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>; - def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - - def : Pat<(f32 (X86frsqrt FR32:$src)), - (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; - def : Pat<(f32 (X86frsqrt (load addr:$src))), - (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; - - def : Pat<(f32 (X86frcp FR32:$src)), - (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; - def : Pat<(f32 (X86frcp (load addr:$src))), - (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[HasAVX, OptForSize]>; -} -let Predicates = [UseAVX] in { - def : Pat<(int_x86_sse_sqrt_ss VR128:$src), - (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR32)), - VR128)>; - def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), - (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; +// There is no f64 version of the reciprocal approximation instructions. - def : Pat<(int_x86_sse2_sqrt_sd VR128:$src), - (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR64)), - VR128)>; - def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), - (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; -} +// TODO: We should add *scalar* op patterns for these just like we have for +// the binops above. If the binop and unop patterns could all be unified +// that would be even better. -let Predicates = [HasAVX] in { - def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), - (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR32)), - VR128)>; - def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src), - (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; - - def : Pat<(int_x86_sse_rcp_ss VR128:$src), - (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128:$src, FR32)), - VR128)>; - def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src), - (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; -} - -// These are unary operations, but they are modeled as having 2 source operands -// because the high elements of the destination are unchanged in SSE. 
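(Side note, not from the patch: the comment above says the reciprocal approximations "typically require refinement in order to obtain suitable precision" -- RCPPS/RSQRTPS only guarantee roughly 12 bits of relative accuracy. A common refinement is one Newton-Raphson step; a hedged C++ sketch of what that looks like for RCPPS. The backend patterns here only select the instruction; any refinement is generated separately.)

// One Newton-Raphson step on top of RCPPS: x1 = x0 * (2 - a*x0).
#include <immintrin.h>
#include <cstdio>

static __m128 rcp_refined(__m128 a) {
  __m128 x0 = _mm_rcp_ps(a);                           // ~12-bit approximation
  return _mm_mul_ps(x0, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x0)));
}

int main() {
  __m128 a = _mm_set1_ps(3.0f);
  float raw[4], ref[4];
  _mm_storeu_ps(raw, _mm_rcp_ps(a));
  _mm_storeu_ps(ref, rcp_refined(a));
  printf("rcpps: %.9f  refined: %.9f  exact: %.9f\n",
         raw[0], ref[0], 1.0f / 3.0f);
  return 0;
}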
-let Predicates = [UseSSE1] in { - def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), - (RSQRTSSr_Int VR128:$src, VR128:$src)>; - def : Pat<(int_x86_sse_rcp_ss VR128:$src), - (RCPSSr_Int VR128:$src, VR128:$src)>; - def : Pat<(int_x86_sse_sqrt_ss VR128:$src), - (SQRTSSr_Int VR128:$src, VR128:$src)>; +multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix, + SDNode Move, ValueType VT, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), + (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // With SSE 4.1, blendi is preferred to movs*, so match that too. + let Predicates = [UseSSE41] in { + def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), + (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // Repeat for AVX versions of the instructions. + let Predicates = [HasAVX] in { + def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), + (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + + def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), + (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } } -// There is no f64 version of the reciprocal approximation instructions. +defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd, + v2f64, UseSSE2>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Non-temporal stores @@ -3910,13 +3653,30 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), PS, Requires<[HasSSE2]>; } // SchedRW = [WriteStore] +let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; +} + let Predicates = [HasAVX, NoVLX] in { def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), - (VMOVNTPSmr addr:$dst, VR128:$src)>; + (VMOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; } def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), - (MOVNTPSmr addr:$dst, VR128:$src)>; + (MOVNTDQmr addr:$dst, VR128:$src)>; +def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; +def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; } // AddedComplexity @@ -3945,7 +3705,7 @@ let SchedRW = [WriteLoad] in { // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], - IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>; + IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>; } let SchedRW = [WriteNop] in { @@ -3960,7 +3720,7 @@ let SchedRW = [WriteFence] in { // Load, store, and memory fence def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>, - TB, Requires<[HasSSE1]>; + PS, Requires<[HasSSE1]>; def LFENCE : I<0xAE, MRM_E8, (outs), (ins), 
"lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>, TB, Requires<[HasSSE2]>; @@ -4184,7 +3944,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, string OpcodeStr, SDNode OpNode, SDNode OpNode2, RegisterClass RC, ValueType DstVT, ValueType SrcVT, PatFrag bc_frag, - ShiftOpndItins itins, + PatFrag ld_frag, ShiftOpndItins itins, bit Is2Addr = 1> { // src2 is always 128-bit def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), @@ -4200,10 +3960,10 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (DstVT (OpNode RC:$src1, - (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>, + (bc_frag (ld_frag addr:$src2)))))], itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), + (ins RC:$src1, u8imm:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), @@ -4309,93 +4069,93 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, // SSE2 - Packed Integer Logical Instructions //===---------------------------------------------------------------------===// -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, - VR128, v2i64, v2i64, bc_v2i64, + VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, - VR128, v2i64, v2i64, bc_v2i64, + VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { // 128-bit logical shifts. 
def VPSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>, + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; def VPSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>, + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. } } // Predicates = [HasAVX] -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR256, v16i16, v8i16, bc_v8i16, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, - VR256, v8i32, v4i32, bc_v4i32, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, - VR256, v4i64, v2i64, bc_v2i64, + VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR256, v16i16, v8i16, bc_v8i16, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, - VR256, v8i32, v4i32, bc_v4i32, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, - VR256, v4i64, v2i64, bc_v2i64, + VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR256, v16i16, v8i16, bc_v8i16, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, - VR256, v8i32, v4i32, bc_v4i32, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { // 256-bit logical shifts. def VPSLLDQYri : PDIi8<0x73, MRM7r, - (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR256:$dst, - (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>, + (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; def VPSRLDQYri : PDIi8<0x73, MRM3r, - (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2), + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR256:$dst, - (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>, + (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. 
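(For reference, not part of the change: the (V)PSLLDQ/(V)PSRLDQ definitions above now match the X86vshldq/X86vshrdq nodes directly instead of the old psll_dq_bs/psrl_dq_bs intrinsics. These are whole-register byte shifts that shift in zeros; in C they correspond to _mm_slli_si128/_mm_srli_si128. Sketch below is purely illustrative.)

// PSLLDQ / PSRLDQ shift the whole XMM register by N *bytes*, filling with zeros.
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i v = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
                            9, 10, 11, 12, 13, 14, 15, 16);
  __m128i left  = _mm_slli_si128(v, 4);  // PSLLDQ $4 -> {0,0,0,0, 1,2,...,12}
  __m128i right = _mm_srli_si128(v, 4);  // PSRLDQ $4 -> {5,6,...,16, 0,0,0,0}

  unsigned char l[16], r[16];
  _mm_storeu_si128((__m128i *)l, left);
  _mm_storeu_si128((__m128i *)r, right);
  printf("left[0]=%u left[4]=%u right[0]=%u right[15]=%u\n",
         l[0], l[4], r[0], r[15]);       // 0 1 5 0
  return 0;
}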
} @@ -4403,85 +4163,58 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { let Constraints = "$src1 = $dst" in { defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, - VR128, v2i64, v2i64, bc_v2i64, + VR128, v2i64, v2i64, bc_v2i64, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, - VR128, v2i64, v2i64, bc_v2i64, + VR128, v2i64, v2i64, bc_v2i64, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, bc_v8i16, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, SSE_INTSHIFT_ITINS_P>; defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, - VR128, v4i32, v4i32, bc_v4i32, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, SSE_INTSHIFT_ITINS_P>; -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { // 128-bit logical shifts. def PSLLDQri : PDIi8<0x73, MRM7r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "pslldq\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))], - IIC_SSE_INTSHDQ_P_RI>; + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; def PSRLDQri : PDIi8<0x73, MRM3r, - (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "psrldq\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))], - IIC_SSE_INTSHDQ_P_RI>; + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; // PSRADQri doesn't exist in SSE[1-3]. } } // Constraints = "$src1 = $dst" let Predicates = [HasAVX] in { - def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), - (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), - (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - - // Shift up / down and insert zero's. 
- def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), - (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; - def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), - (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; -} - -let Predicates = [HasAVX2] in { - def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2), - (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; - def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2), - (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; } let Predicates = [UseSSE2] in { - def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), - (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), - (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; - - // Shift up / down and insert zero's. - def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))), - (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; - def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))), - (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; } //===---------------------------------------------------------------------===// @@ -4510,14 +4243,14 @@ multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, SDNode OpNode> { let Predicates = [HasAVX] in { def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, @@ -4528,14 +4261,14 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, i8imm:$src2), + (ins VR256:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), - (ins i256mem:$src1, i8imm:$src2), + (ins i256mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, @@ -4546,14 +4279,14 @@ let Predicates = [HasAVX2] in { let Predicates = [UseSSE2] in { def ri : Ii8<0x70, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>; def mi : Ii8<0x70, MRMSrcMem, - (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), + (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, @@ -4589,7 +4322,7 @@ let Predicates = [UseSSE2] in { let ExeDomain = SSEPackedInt in { multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, - bit Is2Addr = 1> { + PatFrag ld_frag, bit Is2Addr = 1> { def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -4607,7 +4340,7 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, 
$src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (OutVT (OpNode VR128:$src1, - (bc_frag (memopv2i64 addr:$src2)))))]>, + (bc_frag (ld_frag addr:$src2)))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } @@ -4626,13 +4359,13 @@ multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OutVT (OpNode VR256:$src1, - (bc_frag (memopv4i64 addr:$src2)))))]>, + (bc_frag (loadv4i64 addr:$src2)))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, - bit Is2Addr = 1> { + PatFrag ld_frag, bit Is2Addr = 1> { def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -4650,7 +4383,7 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (OutVT (OpNode VR128:$src1, - (bc_frag (memopv2i64 addr:$src2)))))]>, + (bc_frag (ld_frag addr:$src2)))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } @@ -4669,20 +4402,20 @@ multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OutVT (OpNode VR256:$src1, - (bc_frag (memopv4i64 addr:$src2)))))]>, + (bc_frag (loadv4i64 addr:$src2)))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, - bc_v8i16, 0>, VEX_4V; + bc_v8i16, loadv2i64, 0>, VEX_4V; defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, - bc_v4i32, 0>, VEX_4V; + bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, - bc_v8i16, 0>, VEX_4V; + bc_v8i16, loadv2i64, 0>, VEX_4V; defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, - bc_v4i32, 0>, VEX_4V; + bc_v4i32, loadv2i64, 0>, VEX_4V; } let Predicates = [HasAVX2] in { @@ -4699,16 +4432,16 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, - bc_v8i16>; + bc_v8i16, memopv2i64>; defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, - bc_v4i32>; + bc_v4i32, memopv2i64>; defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, - bc_v8i16>; + bc_v8i16, memopv2i64>; let Predicates = [HasSSE41] in defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, - bc_v4i32>; + bc_v4i32, memopv2i64>; } } // ExeDomain = SSEPackedInt @@ -4718,7 +4451,8 @@ let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedInt in { multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, - SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> { + SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag, + bit Is2Addr = 1> { def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -4732,8 +4466,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (OpNode VR128:$src1, - (bc_frag (memopv2i64 - addr:$src2))))], + (bc_frag (ld_frag addr:$src2))))], IIC_SSE_UNPCK>, Sched<[WriteShuffleLd, ReadAfterLd]>; } @@ -4749,28 +4482,28 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set 
VR256:$dst, (OpNode VR256:$src1, - (bc_frag (memopv4i64 addr:$src2))))]>, + (bc_frag (loadv4i64 addr:$src2))))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, - bc_v16i8, 0>, VEX_4V; + bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, - bc_v8i16, 0>, VEX_4V; + bc_v8i16, loadv2i64, 0>, VEX_4V; defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, - bc_v4i32, 0>, VEX_4V; + bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, - bc_v2i64, 0>, VEX_4V; + bc_v2i64, loadv2i64, 0>, VEX_4V; defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, - bc_v16i8, 0>, VEX_4V; + bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, - bc_v8i16, 0>, VEX_4V; + bc_v8i16, loadv2i64, 0>, VEX_4V; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, - bc_v4i32, 0>, VEX_4V; + bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, - bc_v2i64, 0>, VEX_4V; + bc_v2i64, loadv2i64, 0>, VEX_4V; } let Predicates = [HasAVX2] in { @@ -4795,22 +4528,22 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, - bc_v16i8>; + bc_v16i8, memopv2i64>; defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, - bc_v8i16>; + bc_v8i16, memopv2i64>; defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, - bc_v4i32>; + bc_v4i32, memopv2i64>; defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, - bc_v2i64>; + bc_v2i64, memopv2i64>; defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, - bc_v16i8>; + bc_v16i8, memopv2i64>; defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, - bc_v8i16>; + bc_v8i16, memopv2i64>; defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, - bc_v4i32>; + bc_v4i32, memopv2i64>; defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, - bc_v2i64>; + bc_v2i64, memopv2i64>; } } // ExeDomain = SSEPackedInt @@ -4822,7 +4555,7 @@ let ExeDomain = SSEPackedInt in { multiclass sse2_pinsrw<bit Is2Addr = 1> { def rri : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, - GR32orGR64:$src2, i32i8imm:$src3), + GR32orGR64:$src2, u8imm:$src3), !if(Is2Addr, "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -4831,7 +4564,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { IIC_SSE_PINSRW>, Sched<[WriteShuffle]>; def rmi : Ii8<0xC4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, - i16mem:$src2, i32i8imm:$src3), + i16mem:$src2, u8imm:$src3), !if(Is2Addr, "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), @@ -4844,13 +4577,13 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { // Extract let Predicates = [HasAVX] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, - (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), imm:$src2))]>, PD, VEX, Sched<[WriteShuffle]>; def PEXTRWri : PDIi8<0xC5, MRMSrcReg, - (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2), + (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, 
$src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), imm:$src2))], IIC_SSE_PEXTRW>, @@ -4947,6 +4680,10 @@ def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; let isCodeGenOnly = 1 in def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "movq\t{$src, $dst|$dst, $src}", @@ -4968,6 +4705,10 @@ def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), [(set VR128:$dst, (v2i64 (scalar_to_vector GR64:$src)))], IIC_SSE_MOVDQ>, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; let isCodeGenOnly = 1 in def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", @@ -5054,6 +4795,15 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), IIC_SSE_MOVD_ToGP>; } //SchedRW +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs i64mem:$dst), + (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + //===---------------------------------------------------------------------===// // Bitcast FR64 <-> GR64 // @@ -5132,7 +4882,8 @@ let Predicates = [UseAVX] in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIrr GR32:$src)>; - // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. + // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. + // These instructions also write zeros in the high part of a 256-bit register. let AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (VMOVDI2PDIrm addr:$src)>; @@ -5140,6 +4891,9 @@ let Predicates = [UseAVX] in { (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>; } // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. 
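(Illustration of the guarantee the X86vzmovl/X86vzload patterns above rely on, as spelled out in the comment: MOVD/MOVQ into an XMM register zero every lane above the one written, and the VEX forms also zero the upper half of the YMM register. Sketch only, not backend code.)

#include <immintrin.h>
#include <cstdio>
#include <cstdint>

int main() {
  int32_t x = 42;
  int64_t y = -1;

  __m128i d = _mm_cvtsi32_si128(x);                  // (V)MOVD:      {42, 0, 0, 0}
  __m128i q = _mm_loadl_epi64((const __m128i *)&y);  // (V)MOVQ load: {-1, 0}

  int32_t dv[4];
  int64_t qv[2];
  _mm_storeu_si128((__m128i *)dv, d);
  _mm_storeu_si128((__m128i *)qv, q);
  printf("movd: %d %d %d %d\n", dv[0], dv[1], dv[2], dv[3]);          // 42 0 0 0
  printf("movq: %lld %lld\n", (long long)qv[0], (long long)qv[1]);    // -1 0
  return 0;
}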
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, @@ -5186,7 +4940,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", // Move Quadword Int to Packed Quadword Int // -let SchedRW = [WriteLoad] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in { def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -5198,12 +4952,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (v2i64 (scalar_to_vector (loadi64 addr:$src))))], IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix -} // SchedRW +} // ExeDomain, SchedRW //===---------------------------------------------------------------------===// // Move Packed Quadword Int to Quadword Int // -let SchedRW = [WriteStore] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), @@ -5214,7 +4968,7 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), [(store (i64 (vector_extract (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>; -} // SchedRW +} // ExeDomain, SchedRW // For disassembler only let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, @@ -5228,14 +4982,14 @@ def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), //===---------------------------------------------------------------------===// // Store / copy lower 64-bits of a XMM register. // -let Predicates = [UseAVX] in +let Predicates = [HasAVX] in def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), (VMOVPQI2QImr addr:$dst, VR128:$src)>; let Predicates = [UseSSE2] in def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), (MOVPQI2QImr addr:$dst, VR128:$src)>; -let isCodeGenOnly = 1, AddedComplexity = 20 in { +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in { def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -5251,13 +5005,16 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), (loadi64 addr:$src))))))], IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; -} +} // ExeDomain, isCodeGenOnly, AddedComplexity let Predicates = [UseAVX], AddedComplexity = 20 in { def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), (VMOVZQI2PQIrm addr:$src)>; def : Pat<(v2i64 (X86vzload addr:$src)), (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>; } let Predicates = [UseSSE2], AddedComplexity = 20 in { @@ -5277,7 +5034,7 @@ def : Pat<(v4i64 (X86vzload addr:$src)), // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in // IA32 document. movq xmm1, xmm2 does clear the high bits. 
// -let SchedRW = [WriteVecLogic] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { let AddedComplexity = 15 in def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -5290,9 +5047,9 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], IIC_SSE_MOVQ_RR>, XS, Requires<[UseSSE2]>; -} // SchedRW +} // ExeDomain, SchedRW -let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -5308,7 +5065,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), IIC_SSE_MOVDQ>, XS, Requires<[UseSSE2]>; } -} // isCodeGenOnly, SchedRW +} // ExeDomain, isCodeGenOnly, SchedRW let AddedComplexity = 20 in { let Predicates = [UseAVX] in { @@ -5387,10 +5144,10 @@ let Predicates = [UseSSE3] in { //===---------------------------------------------------------------------===// multiclass sse3_replicate_dfp<string OpcodeStr> { -let hasSideEffects = 0 in def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; + [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, @@ -5444,9 +5201,9 @@ let Predicates = [HasAVX] in { let Predicates = [UseAVX, OptForSize] in { def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), - (VMOVDDUPrm addr:$src)>; + (VMOVDDUPrm addr:$src)>; def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VMOVDDUPrm addr:$src)>; + (VMOVDDUPrm addr:$src)>; } let Predicates = [UseSSE3] in { @@ -5487,7 +5244,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, OpndItins itins, - bit Is2Addr = 1> { + PatFrag ld_frag, bit Is2Addr = 1> { def rr : I<0xD0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, @@ -5500,62 +5257,62 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>, + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V; + f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V; defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, SSE_ALU_F32P, 0>, XD, VEX_4V, VEX_L; + f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V; + f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V; defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, SSE_ALU_F64P, 0>, PD, VEX_4V, VEX_L; + f256mem, SSE_ALU_F64P, 
loadv4f64, 0>, PD, VEX_4V, VEX_L; } } let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { let ExeDomain = SSEPackedSingle in defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, - f128mem, SSE_ALU_F32P>, XD; + f128mem, SSE_ALU_F32P, memopv4f32>, XD; let ExeDomain = SSEPackedDouble in defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, - f128mem, SSE_ALU_F64P>, PD; + f128mem, SSE_ALU_F64P, memopv2f64>, PD; } // Patterns used to select 'addsub' instructions. let Predicates = [HasAVX] in { def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))), (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))), (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))), + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))), (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))), + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))), (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; } let Predicates = [UseSSE3] in { def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))), (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))), (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; } @@ -5565,7 +5322,8 @@ let Predicates = [UseSSE3] in { // Horizontal ops multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, - X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, + bit Is2Addr = 1> { def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), @@ -5577,11 +5335,12 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; } multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, - X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, + bit Is2Addr = 1> { def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, 
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), @@ -5593,41 +5352,45 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))], + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; } let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, - X86fhadd, 0>, VEX_4V; + X86fhadd, loadv4f32, 0>, VEX_4V; defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, - X86fhsub, 0>, VEX_4V; + X86fhsub, loadv4f32, 0>, VEX_4V; defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, 0>, VEX_4V, VEX_L; + X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L; defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, 0>, VEX_4V, VEX_L; + X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, - X86fhadd, 0>, VEX_4V; + X86fhadd, loadv2f64, 0>, VEX_4V; defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, - X86fhsub, 0>, VEX_4V; + X86fhsub, loadv2f64, 0>, VEX_4V; defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, 0>, VEX_4V, VEX_L; + X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L; defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, 0>, VEX_4V, VEX_L; + X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L; } } let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in { - defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>; - defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>; + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, + memopv4f32>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, + memopv4f32>; } let ExeDomain = SSEPackedDouble in { - defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>; - defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>; + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, + memopv2f64>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, + memopv2f64>; } } @@ -5637,8 +5400,8 @@ let Constraints = "$src1 = $dst" in { /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
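(For reference only: the SSE3 ADDSUB and horizontal add instructions whose memory operands are being threaded through ld_frag above have the following semantics. C++ intrinsics sketch, not part of the patch.)

// ADDSUBPS alternates subtract/add per lane; HADDPS adds adjacent pairs.
#include <pmmintrin.h>   // SSE3
#include <cstdio>

int main() {
  __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);

  __m128 as = _mm_addsub_ps(a, b);  // {1-10, 2+20, 3-30, 4+40} = {-9, 22, -27, 44}
  __m128 ha = _mm_hadd_ps(a, b);    // {1+2, 3+4, 10+20, 30+40} = {3, 7, 30, 70}

  float r1[4], r2[4];
  _mm_storeu_ps(r1, as);
  _mm_storeu_ps(r2, ha);
  printf("addsubps: %g %g %g %g\n", r1[0], r1[1], r1[2], r1[3]);
  printf("haddps:   %g %g %g %g\n", r2[0], r2[1], r2[2], r2[3]);
  return 0;
}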
-multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128> { +multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + PatFrag ld_frag> { def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -5650,7 +5413,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (IntId128 - (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>, + (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>; } @@ -5668,7 +5431,7 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (IntId256 - (bitconvert (memopv4i64 addr:$src))))]>, + (bitconvert (loadv4i64 addr:$src))))]>, Sched<[WriteVecALULd]>; } @@ -5683,12 +5446,12 @@ def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>; def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>; let Predicates = [HasAVX] in { - defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", - int_x86_ssse3_pabs_b_128>, VEX; - defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", - int_x86_ssse3_pabs_w_128>, VEX; - defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", - int_x86_ssse3_pabs_d_128>, VEX; + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128, + loadv2i64>, VEX; + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128, + loadv2i64>, VEX; + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128, + loadv2i64>, VEX; def : Pat<(xor (bc_v2i64 (v16i1sextv16i8)), @@ -5726,12 +5489,12 @@ let Predicates = [HasAVX2] in { (VPABSDrr256 VR256:$src)>; } -defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", - int_x86_ssse3_pabs_b_128>; -defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", - int_x86_ssse3_pabs_w_128>; -defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", - int_x86_ssse3_pabs_d_128>; +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128, + memopv2i64>; +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128, + memopv2i64>; +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128, + memopv2i64>; let Predicates = [HasSSSE3] in { def : Pat<(xor @@ -5803,7 +5566,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, /// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, OpndItins itins, - bit Is2Addr = 1> { + PatFrag ld_frag, bit Is2Addr = 1> { let isCommutable = 1 in def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -5819,7 +5582,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (memopv2i64 addr:$src2))))]>, + (bitconvert (ld_frag addr:$src2))))]>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -5868,17 +5631,17 @@ let isCommutable = 0 in { SSE_PSHUFB, 0>, VEX_4V; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, - SSE_PHADDSUBSW, 0>, VEX_4V; + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, - SSE_PHADDSUBSW, 0>, VEX_4V; + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", int_x86_ssse3_pmadd_ub_sw_128, - SSE_PMADD, 0>, VEX_4V; + SSE_PMADD, loadv2i64, 0>, VEX_4V; } defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", int_x86_ssse3_pmul_hr_sw_128, - SSE_PMULHRSW, 0>, VEX_4V; + SSE_PMULHRSW, loadv2i64, 0>, VEX_4V; } let ImmT = NoImm, Predicates = [HasAVX2] in { @@ -5943,16 +5706,17 @@ let isCommutable = 0 in { memopv2i64, i128mem, SSE_PSHUFB>; defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128, - SSE_PHADDSUBSW>; + SSE_PHADDSUBSW, memopv2i64>; defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128, - SSE_PHADDSUBSW>; + SSE_PHADDSUBSW, memopv2i64>; defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", - int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>; + int_x86_ssse3_pmadd_ub_sw_128, + SSE_PMADD, memopv2i64>; } defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", int_x86_ssse3_pmul_hr_sw_128, - SSE_PMULHRSW>; + SSE_PMULHRSW, memopv2i64>; } //===---------------------------------------------------------------------===// @@ -5962,7 +5726,7 @@ defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { let hasSideEffects = 0 in { def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -5970,7 +5734,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>; let mayLoad = 1 in def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -5982,13 +5746,13 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { let hasSideEffects = 0 in { def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i8imm:$src3), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[WriteShuffle]>; let mayLoad = 1 in def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i256mem:$src2, i8imm:$src3), + (ins VR256:$src1, i256mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[WriteShuffleLd, ReadAfterLd]>; @@ -6086,10 +5850,10 @@ multiclass 
SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, OpndItins SSEItins, OpndItins AVXItins, OpndItins AVX2Itins> { defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>; - let Predicates = [HasAVX] in + let Predicates = [HasAVX, NoVLX] in defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, VR128, VR128, AVXItins>, VEX; - let Predicates = [HasAVX2] in + let Predicates = [HasAVX2, NoVLX] in defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, VR256, VR128, AVX2Itins>, VEX, VEX_L; } @@ -6119,7 +5883,7 @@ defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>; defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>; // AVX2 Patterns -multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, SDNode ExtOp> { +multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> { // Register-Register patterns def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; @@ -6137,7 +5901,6 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, SDNode ExtOp> { (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; // On AVX2, we also support 256bit inputs. - // FIXME: remove these patterns when the old shuffle lowering goes away. def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))), (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))), @@ -6153,6 +5916,22 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, SDNode ExtOp> { def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))), (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + // Simple Register-Memory patterns + def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + // AVX2 Register-Memory patterns def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; @@ -6209,14 +5988,14 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, SDNode ExtOp> { (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; } -let Predicates = [HasAVX2] in { - defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", X86vsext>; - defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", X86vzext>; +let Predicates = [HasAVX2, NoVLX] in { + defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; } // SSE4.1/AVX patterns. 
-multiclass SS41I_pmovx_patterns<string OpcPrefix, SDNode ExtOp, - PatFrag ExtLoad16> { +multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, + SDNode ExtOp, PatFrag ExtLoad16> { def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), @@ -6232,6 +6011,21 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, SDNode ExtOp, def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; + def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BWrm) addr:$src)>; def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), @@ -6293,14 +6087,14 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, SDNode ExtOp, (!cast<I>(OpcPrefix#DQrm) addr:$src)>; } -let Predicates = [HasAVX] in { - defm : SS41I_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>; - defm : SS41I_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>; +let Predicates = [HasAVX, NoVLX] in { + defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; } let Predicates = [UseSSE41] in { - defm : SS41I_pmovx_patterns<"PMOVSX", X86vsext, extloadi32i16>; - defm : SS41I_pmovx_patterns<"PMOVZX", X86vzext, loadi16_anyext>; + defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>; } //===----------------------------------------------------------------------===// @@ -6310,7 +6104,7 @@ let Predicates = [UseSSE41] in { /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), @@ -6319,11 +6113,11 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins i8mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1), - imm:$src2)))), addr:$dst)]>; + imm:$src2)))), addr:$dst)]>; } let Predicates = [HasAVX] in @@ -6336,7 +6130,7 @@ defm PEXTRB : SS41I_extract8<0x14, "pextrb">; multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, 
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[WriteShuffle]>; @@ -6344,11 +6138,11 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins i16mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1), - imm:$src2)))), addr:$dst)]>; + imm:$src2)))), addr:$dst)]>; } let Predicates = [HasAVX] in @@ -6360,7 +6154,7 @@ defm PEXTRW : SS41I_extract16<0x15, "pextrw">; /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32:$dst, @@ -6368,7 +6162,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { Sched<[WriteShuffle]>; let SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins i32mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (v4i32 VR128:$src1), imm:$src2), @@ -6383,7 +6177,7 @@ defm PEXTRD : SS41I_extract32<0x16, "pextrd">; /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR64:$dst, @@ -6391,7 +6185,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { Sched<[WriteShuffle]>, REX_W; let SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins i64mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (v2i64 VR128:$src1), imm:$src2), @@ -6408,7 +6202,7 @@ defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR32orGR64:$dst, @@ -6416,7 +6210,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, itins.rr>, Sched<[WriteFBlend]>; let SchedRW = [WriteFBlendLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), - (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2), + (ins f32mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), @@ -6447,7 +6241,7 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3), + (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6456,7 +6250,7 @@ multiclass SS41I_insert8<bits<8> opc, string 
asm, bit Is2Addr = 1> { (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), + (ins VR128:$src1, i8mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6473,7 +6267,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + (ins VR128:$src1, GR32:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6482,7 +6276,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), + (ins VR128:$src1, i32mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6499,7 +6293,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), + (ins VR128:$src1, GR64:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6508,7 +6302,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, Sched<[WriteShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3), + (ins VR128:$src1, i64mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6530,7 +6324,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6539,7 +6333,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, Sched<[WriteFShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f32mem:$src2, i8imm:$src3), + (ins VR128:$src1, f32mem:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6593,7 +6387,7 @@ let ExeDomain = SSEPackedSingle in { // Intrinsic operation, reg. 
// Vector intrinsic operation, reg def PSr : SS4AIi8<opcps, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))], @@ -6601,7 +6395,7 @@ let ExeDomain = SSEPackedSingle in { // Vector intrinsic operation, mem def PSm : SS4AIi8<opcps, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, @@ -6612,7 +6406,7 @@ let ExeDomain = SSEPackedSingle in { let ExeDomain = SSEPackedDouble in { // Vector intrinsic operation, reg def PDr : SS4AIi8<opcpd, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))], @@ -6620,7 +6414,7 @@ let ExeDomain = SSEPackedDouble in { // Vector intrinsic operation, mem def PDm : SS4AIi8<opcpd, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, @@ -6637,7 +6431,7 @@ let ExeDomain = GenericDomain in { // Operation, reg. let hasSideEffects = 0 in def SSr : SS4AIi8<opcss, MRMSrcReg, - (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3), + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -6648,7 +6442,7 @@ let ExeDomain = GenericDomain in { // Intrinsic operation, reg. let isCodeGenOnly = 1 in def SSr_Int : SS4AIi8<opcss, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -6659,7 +6453,7 @@ let ExeDomain = GenericDomain in { // Intrinsic operation, mem. def SSm : SS4AIi8<opcss, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -6672,7 +6466,7 @@ let ExeDomain = GenericDomain in { // Operation, reg. let hasSideEffects = 0 in def SDr : SS4AIi8<opcsd, MRMSrcReg, - (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3), + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -6683,7 +6477,7 @@ let ExeDomain = GenericDomain in { // Intrinsic operation, reg. let isCodeGenOnly = 1 in def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -6694,7 +6488,7 @@ let ExeDomain = GenericDomain in { // Intrinsic operation, mem. 
def SDm : SS4AIi8<opcsd, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -6720,7 +6514,9 @@ let Predicates = [HasAVX] in { defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; +} +let Predicates = [UseAVX] in { def : Pat<(ffloor FR32:$src), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; def : Pat<(f64 (ffloor FR64:$src)), @@ -6741,7 +6537,9 @@ let Predicates = [HasAVX] in { (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; def : Pat<(f64 (ftrunc FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; +} +let Predicates = [HasAVX] in { def : Pat<(v4f32 (ffloor VR128:$src)), (VROUNDPSr VR128:$src, (i32 0x1))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), @@ -6945,7 +6743,7 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, + Intrinsic IntId128, PatFrag ld_frag, X86FoldableSchedWrite Sched> { def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -6956,7 +6754,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (IntId128 (bitconvert (memopv2i64 addr:$src))))]>, + (IntId128 (bitconvert (ld_frag addr:$src))))]>, Sched<[Sched.Folded]>; } @@ -6964,53 +6762,12 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, // model, although the naming is misleading. let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", - int_x86_sse41_phminposuw, + int_x86_sse41_phminposuw, loadv2i64, WriteVecIMul>, VEX; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", - int_x86_sse41_phminposuw, + int_x86_sse41_phminposuw, memopv2i64, WriteVecIMul>; -/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator -multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Is2Addr = 1, - OpndItins itins = DEFAULT_ITINS> { - let isCommutable = 1 in - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], - itins.rr>, Sched<[itins.Sched]>; - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, - (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))], - itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; -} - -/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator -multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId256, - X86FoldableSchedWrite Sched> { - let isCommutable = 1 in - def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, - Sched<[Sched]>; - def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i256mem:$src2), - !strconcat(OpcodeStr, 
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, - (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, - Sched<[Sched.Folded, ReadAfterLd]>; -} - - /// SS48I_binop_rm - Simple SSE41 binary operator. multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, @@ -7154,10 +6911,10 @@ let Predicates = [HasAVX, NoVLX] in { } let Predicates = [HasAVX2] in { defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - memopv4i64, i256mem, 0, SSE_PMULLD_ITINS>, + loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>, VEX_4V, VEX_L; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, - memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, VEX_4V, VEX_L; } @@ -7175,7 +6932,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), + (ins RC:$src1, RC:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7184,7 +6941,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, Sched<[itins.Sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7196,6 +6953,34 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, Sched<[itins.Sched.Folded, ReadAfterLd]>; } +/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + itins.rr>, Sched<[itins.Sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, @@ -7204,26 +6989,24 @@ let Predicates = [HasAVX] in { } let ExeDomain = SSEPackedSingle in { - defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, loadv8f32, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; + defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32, + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32, + VR256, 
loadv8f32, f256mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; } let ExeDomain = SSEPackedDouble in { - defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256,VR256, loadv4f64, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; + defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64, + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64, + VR256, loadv4f64, f256mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; } - defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, - VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V; + defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V; let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, @@ -7245,9 +7028,9 @@ let Predicates = [HasAVX2] in { VR256, loadv4i64, i256mem, 0, DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; } - defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, loadv4i64, i256mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; + defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16, + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { @@ -7257,16 +7040,16 @@ let Constraints = "$src1 = $dst" in { 1, SSE_MPSADBW_ITINS>; } let ExeDomain = SSEPackedSingle in - defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, - VR128, memopv4f32, f128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; + defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32, + VR128, memopv4f32, f128mem, + 1, SSE_INTALU_ITINS_FBLEND_P>; let ExeDomain = SSEPackedDouble in - defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, - VR128, memopv2f64, f128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; - defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, - VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_BLEND_P>; + defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64, + VR128, memopv2f64, f128mem, + 1, SSE_INTALU_ITINS_FBLEND_P>; + defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16, + VR128, memopv2i64, i128mem, + 1, SSE_INTALU_ITINS_BLEND_P>; let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, @@ -7357,35 +7140,19 @@ let Predicates = [HasAVX] in { def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), (v4f64 VR256:$src2))), (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - - def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2), - (imm:$mask))), - (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>; - def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2), - (imm:$mask))), - (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>; - - def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), - (imm:$mask))), - (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), - (imm:$mask))), - (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), - (imm:$mask))), - (VBLENDPDrri 
VR128:$src1, VR128:$src2, imm:$mask)>; } let Predicates = [HasAVX2] in { def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), (v32i8 VR256:$src2))), (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; - def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2), - (imm:$mask))), - (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; } // Patterns +// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or +// on targets where they have equal performance. These were changed to use +// blends because blends have better throughput on SandyBridge and Haswell, but +// movs[s/d] are 1-2 byte shorter instructions. let Predicates = [UseAVX] in { let AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a @@ -7402,8 +7169,10 @@ let Predicates = [UseAVX] in { // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; - def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), - (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; } def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, @@ -7417,14 +7186,19 @@ let Predicates = [UseAVX] in { (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), sub_xmm)>; - // Move low f64 and clear high bits. - def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; - + // These will incur an FP/int domain crossing penalty, but it may be the only + // way without AVX2. Do not add any complexity because we may be able to match + // more optimal patterns defined earlier in this file. + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; } +// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or +// on targets where they have equal performance. These were changed to use +// blends because blends have better throughput on SandyBridge and Haswell, but +// movs[s/d] are 1-2 byte shorter instructions. let Predicates = [UseSSE41] in { // With SSE41 we can use blends for these patterns. 
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), @@ -7501,17 +7275,6 @@ let Predicates = [UseSSE41] in { def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), (v2f64 VR128:$src2))), (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; - - def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2), - (imm:$mask))), - (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>; - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2), - (imm:$mask))), - (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>; - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2), - (imm:$mask))), - (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>; - } let SchedRW = [WriteLoad] in { @@ -7570,30 +7333,32 @@ let Constraints = "$src1 = $dst" in //===----------------------------------------------------------------------===// // Packed Compare Implicit Length Strings, Return Mask -multiclass pseudo_pcmpistrm<string asm> { +multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> { def REG : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, imm:$src3))]>; def MEM : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, - (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>; + (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; } let Defs = [EFLAGS], usesCustomInserter = 1 in { - defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; - defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>; + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>, + Requires<[UseSSE42]>; } multiclass pcmpistrm_SS42AI<string asm> { def rr : SS42AI<0x62, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), []>, Sched<[WritePCmpIStrM]>; let mayLoad = 1 in def rm :SS42AI<0x62, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>; } @@ -7605,30 +7370,32 @@ let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { } // Packed Compare Explicit Length Strings, Return Mask -multiclass pseudo_pcmpestrm<string asm> { +multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> { def REG : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; def MEM : PseudoI<(outs VR128:$dst), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, - (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>; + (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; } let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { - defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; - defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>; + defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>, + Requires<[HasAVX]>; + defm 
PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>, + Requires<[UseSSE42]>; } multiclass SS42AI_pcmpestrm<string asm> { def rr : SS42AI<0x60, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), []>, Sched<[WritePCmpEStrM]>; let mayLoad = 1 in def rm : SS42AI<0x60, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>; } @@ -7640,30 +7407,32 @@ let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { } // Packed Compare Implicit Length Strings, Return Index -multiclass pseudo_pcmpistri<string asm> { +multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> { def REG : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>; def MEM : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, - (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>; + (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; } let Defs = [EFLAGS], usesCustomInserter = 1 in { - defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>; - defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>; + defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>, + Requires<[UseSSE42]>; } multiclass SS42AI_pcmpistri<string asm> { def rr : SS42AI<0x63, MRMSrcReg, (outs), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), []>, Sched<[WritePCmpIStrI]>; let mayLoad = 1 in def rm : SS42AI<0x63, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>; } @@ -7675,31 +7444,33 @@ let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { } // Packed Compare Explicit Length Strings, Return Index -multiclass pseudo_pcmpestri<string asm> { +multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> { def REG : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), [(set GR32:$dst, EFLAGS, (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; def MEM : PseudoI<(outs GR32:$dst), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), [(set GR32:$dst, EFLAGS, - (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX, + (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; } let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { - defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>; - defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>; + defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>, + Requires<[UseSSE42]>; } multiclass SS42AI_pcmpestri<string asm> { def rr : SS42AI<0x61, MRMSrcReg, (outs), - (ins 
VR128:$src1, VR128:$src3, i8imm:$src5), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), []>, Sched<[WritePCmpEStrI]>; let mayLoad = 1 in def rm : SS42AI<0x61, MRMSrcMem, (outs), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>; } @@ -7784,13 +7555,13 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, (i8 imm:$src3)))]>, TA; def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, @@ -7818,8 +7589,8 @@ def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", // AES-NI Instructions //===----------------------------------------------------------------------===// -multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, bit Is2Addr = 1> { +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + PatFrag ld_frag, bit Is2Addr = 1> { def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(Is2Addr, @@ -7833,31 +7604,31 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, - (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, + (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, Sched<[WriteAESDecEncLd, ReadAfterLd]>; } // Perform One Round of an AES Encryption/Decryption Flow let Predicates = [HasAVX, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc, 0>, VEX_4V; + int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast, 0>, VEX_4V; + int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V; defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec, 0>, VEX_4V; + int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V; defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast, 0>, VEX_4V; + int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V; } let Constraints = "$src1 = $dst" in { defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", - int_x86_aesni_aesenc>; + int_x86_aesni_aesenc, memopv2i64>; defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", - int_x86_aesni_aesenclast>; + int_x86_aesni_aesenclast, memopv2i64>; defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", - int_x86_aesni_aesdec>; + int_x86_aesni_aesdec, memopv2i64>; defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", - int_x86_aesni_aesdeclast>; + int_x86_aesni_aesdeclast, memopv2i64>; } // Perform the AES InvMixColumn Transformation @@ -7888,26 +7659,26 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), // AES Round Key Generation Assist let Predicates = [HasAVX, HasAES] in { def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, 
u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, Sched<[WriteAESKeyGen]>, VEX; def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, Sched<[WriteAESKeyGenLd]>, VEX; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, Sched<[WriteAESKeyGen]>; def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, @@ -7918,15 +7689,16 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), //===----------------------------------------------------------------------===// // AVX carry-less Multiplication instructions +let isCommutable = 1 in def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, Sched<[WriteCLMul]>; def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, (loadv2i64 addr:$src2), imm:$src3))]>, @@ -7934,15 +7706,16 @@ def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), // Carry-less Multiplication instructions let Constraints = "$src1 = $dst" in { +let isCommutable = 1 in def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))], IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>; def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), imm:$src3))], @@ -7981,7 +7754,7 @@ let Predicates = [HasSSE4A] in { let Constraints = "$src = $dst" in { def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), - (ins VR128:$src, i8imm:$len, i8imm:$idx), + (ins VR128:$src, u8imm:$len, u8imm:$idx), "extrq\t{$idx, $len, $src|$src, $len, $idx}", [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, imm:$idx))]>, PD; @@ -7992,7 +7765,7 @@ def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), VR128:$mask))]>, PD; def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx), + (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", [(set VR128:$dst, (int_x86_sse4a_insertqi 
VR128:$src, VR128:$src2, imm:$len, imm:$idx))]>, XD; @@ -8071,9 +7844,9 @@ def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, WriteFShuffle256>, VEX_L; let Predicates = [HasAVX2] in -def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, - int_x86_avx2_vbroadcasti128, WriteLoad>, - VEX_L; +def VBROADCASTI128 : avx_broadcast_no_int<0x5A, "vbroadcasti128", VR256, + i128mem, v4i64, loadv2i64, + WriteLoad>, VEX_L; let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), @@ -8085,12 +7858,12 @@ def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), // let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR128:$src2, i8imm:$src3), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L; let mayLoad = 1 in def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f128mem:$src2, i8imm:$src3), + (ins VR256:$src1, f128mem:$src2, u8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } @@ -8115,49 +7888,6 @@ def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), (INSERT_get_vinsert128_imm VR256:$ins))>; } -// Combine two consecutive 16-byte loads with a common destination register into -// one 32-byte load to that register. -let Predicates = [HasAVX, HasFastMem32] in { - def : Pat<(insert_subvector - (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))), - (loadv4f32 (add addr:$src, (iPTR 16))), - (iPTR 4)), - (VMOVUPSYrm addr:$src)>; - - def : Pat<(insert_subvector - (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))), - (loadv2f64 (add addr:$src, (iPTR 16))), - (iPTR 2)), - (VMOVUPDYrm addr:$src)>; - - def : Pat<(insert_subvector - (v32i8 (insert_subvector - undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))), - (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))), - (iPTR 16)), - (VMOVDQUYrm addr:$src)>; - - def : Pat<(insert_subvector - (v16i16 (insert_subvector - undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))), - (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))), - (iPTR 8)), - (VMOVDQUYrm addr:$src)>; - - def : Pat<(insert_subvector - (v8i32 (insert_subvector - undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))), - (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))), - (iPTR 4)), - (VMOVDQUYrm addr:$src)>; - - def : Pat<(insert_subvector - (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))), - (loadv2i64 (add addr:$src, (iPTR 16))), - (iPTR 2)), - (VMOVDQUYrm addr:$src)>; -} - let Predicates = [HasAVX1Only] in { def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), @@ -8202,12 +7932,12 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), // let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), - (ins VR256:$src1, i8imm:$src2), + (ins VR256:$src1, u8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteFShuffle]>, VEX, VEX_L; let mayStore = 1 in def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), - (ins f128mem:$dst, VR256:$src1, i8imm:$src2), + (ins f128mem:$dst, VR256:$src1, u8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteStore]>, VEX, VEX_L; } @@ -8328,15 +8058,15 @@ multiclass 
avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, Sched<[WriteFShuffleLd, ReadAfterLd]>; def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), + (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), - (ins x86memop_f:$src1, i8imm:$src2), + (ins x86memop_f:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX, + (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; } @@ -8393,13 +8123,13 @@ def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), // let ExeDomain = SSEPackedSingle in { def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i8imm:$src3), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$src3))))]>, VEX_4V, VEX_L, Sched<[WriteFShuffle]>; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, i8imm:$src3), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2), (i8 imm:$src3)))]>, VEX_4V, VEX_L, @@ -8468,14 +8198,14 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), - (ins RC:$src1, i32i8imm:$src2), + (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, TAPD, VEX, Sched<[WriteCvtF2F]>; let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteCvtF2FLd, WriteRMW] in def mr : Ii8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, RC:$src1, i32i8imm:$src2), + (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, TAPD, VEX; } @@ -8491,6 +8221,18 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), (VCVTPH2PSrm addr:$src)>; + + def : Pat<(store (f64 (vector_extract (bc_v2f64 (v8i16 + (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), + addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (i64 (vector_extract (bc_v2i64 (v8i16 + (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), + addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)), + addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. 
@@ -8512,38 +8254,31 @@ let Predicates = [HasF16C] in { // AVX2 Instructions //===----------------------------------------------------------------------===// -/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate -multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, - Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop> { +/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate +multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { let isCommutable = 1 in def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), + (ins RC:$src1, RC:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, Sched<[WriteBlend]>, VEX_4V; def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (IntId RC:$src1, - (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; } -defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, - VR128, loadv2i64, i128mem>; -defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, - VR256, loadv4i64, i256mem>, VEX_L; - -def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), - imm:$mask)), - (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>; -def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), - imm:$mask)), - (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>; +defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32, + VR128, loadv2i64, i128mem>; +defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, + VR256, loadv4i64, i256mem>, VEX_L; //===----------------------------------------------------------------------===// // VPBROADCAST - Load from memory and broadcast to all elements of the @@ -8628,7 +8363,7 @@ let Predicates = [HasAVX2] in { def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), (VBROADCASTSDYrr VR128:$src)>; - // Provide aliases for broadcast from the same regitser class that + // Provide aliases for broadcast from the same register class that // automatically does the extract. 
def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), @@ -8733,6 +8468,8 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2i64 (X86VBroadcast i64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>; } //===----------------------------------------------------------------------===// @@ -8765,14 +8502,14 @@ defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, ValueType OpVT, X86FoldableSchedWrite Sched> { def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, i8imm:$src2), + (ins VR256:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, Sched<[Sched]>, VEX, VEX_L; def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), - (ins i256mem:$src1, i8imm:$src2), + (ins i256mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, @@ -8791,13 +8528,13 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks // def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i8imm:$src3), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, i8imm:$src3), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), (i8 imm:$src3)))]>, @@ -8828,12 +8565,12 @@ def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)), // let hasSideEffects = 0 in { def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR128:$src2, i8imm:$src3), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; let mayLoad = 1 in def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i128mem:$src2, i8imm:$src3), + (ins VR256:$src1, i128mem:$src2, u8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; } @@ -8881,14 +8618,12 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), // VEXTRACTI128 - Extract packed integer values // def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), - (ins VR256:$src1, i8imm:$src2), - "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, - (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, + (ins VR256:$src1, u8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteShuffle256]>, VEX, VEX_L; let hasSideEffects = 0, mayStore = 1 in def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), - (ins i128mem:$dst, VR256:$src1, i8imm:$src2), + (ins i128mem:$dst, VR256:$src1, u8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[WriteStore]>, VEX, VEX_L; diff --git 
a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index c706d43..caecf70 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -850,12 +850,12 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem, def ROT32L2R_imm8 : SDNodeXForm<imm, [{ // Convert a ROTL shamt to a ROTR shamt on 32-bit integer. - return getI8Imm(32 - N->getZExtValue()); + return getI8Imm(32 - N->getZExtValue(), SDLoc(N)); }]>; def ROT64L2R_imm8 : SDNodeXForm<imm, [{ // Convert a ROTL shamt to a ROTR shamt on 64-bit integer. - return getI8Imm(64 - N->getZExtValue()); + return getI8Imm(64 - N->getZExtValue(), SDLoc(N)); }]>; multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index 848eeeb..0350566 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -38,9 +38,6 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))], IIC_INT3>; } // SchedRW -def : Pat<(debugtrap), - (INT3)>; - // The long form of "int $3" turns into int3 as a size optimization. // FIXME: This doesn't work because InstAlias can't match immediate constants. //def : InstAlias<"int\t$3", (INT3)>; @@ -71,6 +68,10 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, Requires<[In64BitMode]>; } // SchedRW +def : Pat<(debugtrap), + (INT3)>, Requires<[NotPS4]>; +def : Pat<(debugtrap), + (INT (i8 0x41))>, Requires<[IsPS4]>; //===----------------------------------------------------------------------===// // Input/Output Instructions. @@ -437,7 +438,9 @@ def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), //===----------------------------------------------------------------------===// // Specialized register support let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; +let Defs = [EAX, EDX], Uses = [ECX] in def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; let Defs = [RAX, RDX], Uses = [ECX] in @@ -485,15 +488,28 @@ let Uses = [RDX, RAX] in { def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), "xsave\t$dst", []>, TB; def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>; + "xsave64\t$dst", []>, TB, Requires<[In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), "xrstor\t$dst", []>, TB; def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>; + "xrstor64\t$dst", []>, TB, Requires<[In64BitMode]>; def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt\t$dst", []>, TB; + "xsaveopt\t$dst", []>, PS; def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt{q|64}\t$dst", []>, TB, Requires<[In64BitMode]>; + "xsaveopt64\t$dst", []>, PS, Requires<[In64BitMode]>; + + def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors\t$dst", []>, TB; + def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors64\t$dst", []>, TB, Requires<[In64BitMode]>; + def XSAVEC : I<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), + "xsavec\t$dst", []>, TB; + def XSAVEC64 : RI<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), + "xsavec64\t$dst", []>, TB, Requires<[In64BitMode]>; + def XSAVES : I<0xC7, MRM5m, (outs opaque512mem:$dst), 
(ins), + "xsaves\t$dst", []>, TB; + def XSAVES64 : RI<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), + "xsaves64\t$dst", []>, TB, Requires<[In64BitMode]>; } } // SchedRW @@ -559,7 +575,13 @@ def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), //===----------------------------------------------------------------------===// // SMAP Instruction -let Predicates = [HasSMAP], Defs = [EFLAGS] in { +let Defs = [EFLAGS] in { def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; } + +//===----------------------------------------------------------------------===// +// SMX Instruction +let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { + def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td index 45e2ff0..8455b8d 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -20,21 +20,23 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; } -defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>; -defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>; -defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>; -defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>; -defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>; -defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>; -defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>; -defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>; -defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>; -defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>; -defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>; -defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>; -defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>; -defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>; -defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>; +let ExeDomain = SSEPackedInt in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", 
int_x86_xop_vphaddbd, loadv2i64>; +} // Scalar load 2 addr operand instructions multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, @@ -47,11 +49,6 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP; } -defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, - ssmem, sse_load_f32>; -defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, - sdmem, sse_load_f64>; - multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -62,9 +59,6 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; } -defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>; -defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>; - multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), @@ -75,8 +69,19 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L; } -defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>; -defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, memopv4f64>; +let ExeDomain = SSEPackedSingle in { + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; +} multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), @@ -87,28 +92,30 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>, + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2))))]>, XOP_4V, VEX_W; def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>, + (Int (bitconvert (loadv2i64 addr:$src1)), VR128:$src2))]>, XOP_4VOp3; } -defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; -defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; -defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; -defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; -defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; -defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; -defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; -defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; -defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; -defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; -defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; -defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +let ExeDomain = SSEPackedInt in { + defm VPSHLW : xop3op<0x95, "vpshlw", 
int_x86_xop_vpshlw>; + defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; + defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; + defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; + defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; + defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; + defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; + defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; + defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; + defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; + defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +} multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), @@ -119,16 +126,19 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { (ins i128mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, XOP; + (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP; } -defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; -defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; -defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; -defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +let ExeDomain = SSEPackedInt in { + defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; + defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +} // Instruction where second source can be memory, but third must be register multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { + let isCommutable = 1 in def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -140,48 +150,66 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), VR128:$src3))]>, XOP_4V, VEX_I8IMM; } -defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; -defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; -defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; -defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; -defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; -defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; -defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; -defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; -defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; -defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; -defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; -defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +let ExeDomain = SSEPackedInt in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; + 
defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +} // Instruction where second source can be memory, third must be imm8 -multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { + let isCommutable = 1 in def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>, + (ins VR128:$src1, VR128:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, i8immZExt3:$cc))]>, XOP_4V; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), - imm:$src3))]>, XOP_4V; + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), + i8immZExt3:$cc))]>, XOP_4V; + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V; + let mayLoad = 1 in + def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V; + } } -defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>; -defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>; -defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>; -defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>; -defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>; -defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>; -defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>; -defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>; +let ExeDomain = SSEPackedInt in { // SSE integer instructions + defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; + defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>; + defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; + defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; + defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; + defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; + defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; +} // Instruction where either second or third source can be memory multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { @@ -197,20 +225,22 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, 
VR128:$src2, - (bitconvert (memopv2i64 addr:$src3))))]>, + (bitconvert (loadv2i64 addr:$src3))))]>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4; def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)), + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), VR128:$src3))]>, XOP_4V, VEX_I8IMM; } -defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; -defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +let ExeDomain = SSEPackedInt in { + defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; + defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +} multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), @@ -225,19 +255,20 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, - (bitconvert (memopv4i64 addr:$src3))))]>, + (bitconvert (loadv4i64 addr:$src3))))]>, XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L; def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, - (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)), + (Int VR256:$src1, (bitconvert (loadv4i64 addr:$src2)), VR256:$src3))]>, XOP_4V, VEX_I8IMM, VEX_L; } -defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +let ExeDomain = SSEPackedInt in + defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { @@ -282,8 +313,11 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, VEX_L; } -defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, - int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>; -defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, - int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>; +let ExeDomain = SSEPackedDouble in + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, + int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; + +let ExeDomain = SSEPackedSingle in + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, + int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index b411d07..4af514a 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -1,4 +1,4 @@ -//===-- X86IntinsicsInfo.h - X86 Instrinsics ------------*- C++ -*-===// +//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -21,8 +21,9 @@ enum IntrinsicType { GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, INTR_TYPE_SCALAR_MASK_RM, - COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM, BLEND + INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, + INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, + EXPAND_FROM_MEM, BLEND }; struct IntrinsicData { @@ -171,77 +172,101 @@ static const 
IntrinsicData* getIntrinsicWithChain(unsigned IntNo) { * the alphabetical order. */ static const IntrinsicData IntrinsicsWithoutChain[] = { - X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0), - X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), - X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), - X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), - X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), - X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, X86ISD::UMAX, 0), - X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0), - X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), - X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), - X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), - X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), - X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), - X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), - X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), - X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), - X86_INTRINSIC_DATA(avx2_psign_w, INTR_TYPE_2OP, X86ISD::PSIGN, 0), - X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), - X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0), 
- X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), - X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), - X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0), - X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), - X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), - X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), - X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), 
+ X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_w, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_pd_256, 
INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), X86_INTRINSIC_DATA(avx512_mask_blend_b_128, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_b_256, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_b_512, BLEND, X86ISD::SELECT, 0), @@ -260,18 +285,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), - X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, + X86ISD::CMPM_RND), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, + X86ISD::CMPM_RND), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, @@ -297,6 +330,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV, + X86ISD::FDIV_RND), X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, @@ -321,6 +366,96 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL, + 
X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), 
+ X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_q_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_q_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_q_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), @@ -345,6 +480,33 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_128, INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_256, INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_512, INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 
0), X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), @@ -363,6 +525,52 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::RNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::RNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB, + X86ISD::FSUB_RND), + 
X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB, + X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), @@ -375,10 +583,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), @@ -391,35 +605,85 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), - X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD, + 
X86ISD::FMADD_RND), X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), - X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128, FMA_OP_MASK, X86ISD::FMSUB, 0), - X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256, FMA_OP_MASK, X86ISD::FMSUB, 0), - X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128, FMA_OP_MASK, X86ISD::FMSUB, 0), - X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_512, FMA_OP_MASK, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_512, FMA_OP_MASK, X86ISD::FMSUB, + X86ISD::FMSUB_RND), X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_512, FMA_OP_MASK, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), - X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), - X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), - X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), - X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), - X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), - X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), - X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB , 0), - X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_512, FMA_OP_MASK, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + 
X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), @@ -501,6 +765,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0), X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE), diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index 63ccded..556b518 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCStreamer.h" @@ -74,11 +75,11 @@ namespace llvm { X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {} void - X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &MF) { + X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) { + MF = &F; CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( - *TM.getSubtargetImpl()->getInstrInfo(), - *TM.getSubtargetImpl()->getRegisterInfo(), *TM.getSubtargetImpl(), - MF.getContext())); + 
*MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo(), MF->getContext())); } void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, @@ -87,7 +88,7 @@ namespace llvm { SmallString<256> Code; SmallVector<MCFixup, 4> Fixups; raw_svector_ostream VecOS(Code); - CodeEmitter->EncodeInstruction(Inst, VecOS, Fixups, STI); + CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI); VecOS.flush(); CurrentShadowSize += Code.size(); if (CurrentShadowSize >= RequiredShadowSize) @@ -100,20 +101,20 @@ namespace llvm { if (InShadow && CurrentShadowSize < RequiredShadowSize) { InShadow = false; EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize, - TM.getSubtarget<X86Subtarget>().is64Bit(), STI); + MF->getSubtarget<X86Subtarget>().is64Bit(), STI); } } void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { - OutStreamer.EmitInstruction(Inst, getSubtargetInfo()); + OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); SMShadowTracker.count(Inst, getSubtargetInfo()); } } // end llvm namespace X86MCInstLower::X86MCInstLower(const MachineFunction &mf, X86AsmPrinter &asmprinter) -: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), - MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {} + : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()), + AsmPrinter(asmprinter) {} MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>(); @@ -124,7 +125,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout(); + const DataLayout *DL = TM.getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); SmallString<128> Name; @@ -154,14 +155,17 @@ GetSymbolFromOperand(const MachineOperand &MO) const { const GlobalValue *GV = MO.getGlobal(); AsmPrinter.getNameWithPrefix(Name, GV); } else if (MO.isSymbol()) { - getMang()->getNameWithPrefix(Name, MO.getSymbolName()); + if (MO.getTargetFlags() == X86II::MO_NOPREFIX) + Name += MO.getSymbolName(); + else + getMang()->getNameWithPrefix(Name, MO.getSymbolName()); } else if (MO.isMBB()) { Name += MO.getMBB()->getSymbol()->getName(); } unsigned OrigLen = Name.size() - PrefixLen; Name += Suffix; - MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Name); StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen); @@ -208,7 +212,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } else { StubSym = MachineModuleInfoImpl:: - StubValueTy(Ctx.GetOrCreateSymbol(OrigName), false); + StubValueTy(Ctx.getOrCreateSymbol(OrigName), false); } break; } @@ -231,6 +235,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, case X86II::MO_DARWIN_NONLAZY: case X86II::MO_DLLIMPORT: case X86II::MO_DARWIN_STUB: + case X86II::MO_NOPREFIX: break; case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break; @@ -270,8 +275,8 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, // relocations the assembler will generate for differences between // local labels. This is only safe when the symbols are in the same // section so we are restricting it to jumptable references. 
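Much of this hunk is a mechanical move to the lower-case MC factories: Ctx.getOrCreateSymbol, Ctx.createTempSymbol, and MCOperand::createReg/createImm/createExpr replace their capitalized predecessors. A hedged sketch of a PLT call built with the renamed API, in the style of the call sites in this file (assumes the LLVM MC headers of this revision; the opcode and symbol name are placeholders only):

  #include "llvm/MC/MCContext.h"
  #include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCInst.h"
  #include "llvm/MC/MCInstBuilder.h"
  using namespace llvm;

  // Build "call __some_helper@PLT" with the lower-case factories.
  static MCInst buildPLTCall(MCContext &Ctx, unsigned CallOpcode) {
    MCSymbol *FnSym = Ctx.getOrCreateSymbol("__some_helper"); // was GetOrCreateSymbol
    const MCExpr *FnExpr =
        MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
    return MCInstBuilder(CallOpcode).addExpr(FnExpr); // e.g. X86::CALL64pcrel32
  }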
- MCSymbol *Label = Ctx.CreateTempSymbol(); - AsmPrinter.OutStreamer.EmitAssignment(Label, Expr); + MCSymbol *Label = Ctx.createTempSymbol(); + AsmPrinter.OutStreamer->EmitAssignment(Label, Expr); Expr = MCSymbolRefExpr::Create(Label, Ctx); } break; @@ -284,7 +289,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - return MCOperand::CreateExpr(Expr); + return MCOperand::createExpr(Expr); } @@ -408,10 +413,10 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { case MachineOperand::MO_Register: // Ignore all implicit register operands. if (MO.isImplicit()) continue; - MCOp = MCOperand::CreateReg(MO.getReg()); + MCOp = MCOperand::createReg(MO.getReg()); break; case MachineOperand::MO_Immediate: - MCOp = MCOperand::CreateImm(MO.getImm()); + MCOp = MCOperand::createImm(MO.getImm()); break; case MachineOperand::MO_MachineBasicBlock: case MachineOperand::MO_GlobalAddress: @@ -509,6 +514,7 @@ ReSimplify: // inputs modeled as normal uses instead of implicit uses. As such, truncate // off all but the first operand (the callee). FIXME: Change isel. case X86::TAILJMPr64: + case X86::TAILJMPr64_REX: case X86::CALL64r: case X86::CALL64pcrel32: { unsigned Opcode = OutMI.getOpcode(); @@ -682,7 +688,7 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, bool needsPadding = MI.getOpcode() == X86::TLS_addr64; - MCContext &context = OutStreamer.getContext(); + MCContext &context = OutStreamer->getContext(); if (needsPadding) EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); @@ -709,28 +715,28 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, MCInst LEA; if (is64Bits) { LEA.setOpcode(X86::LEA64r); - LEA.addOperand(MCOperand::CreateReg(X86::RDI)); // dest - LEA.addOperand(MCOperand::CreateReg(X86::RIP)); // base - LEA.addOperand(MCOperand::CreateImm(1)); // scale - LEA.addOperand(MCOperand::CreateReg(0)); // index - LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp - LEA.addOperand(MCOperand::CreateReg(0)); // seg + LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest + LEA.addOperand(MCOperand::createReg(X86::RIP)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(0)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) { LEA.setOpcode(X86::LEA32r); - LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest - LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base - LEA.addOperand(MCOperand::CreateImm(1)); // scale - LEA.addOperand(MCOperand::CreateReg(0)); // index - LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp - LEA.addOperand(MCOperand::CreateReg(0)); // seg + LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::createReg(X86::EBX)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(0)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg } else { LEA.setOpcode(X86::LEA32r); - LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest - LEA.addOperand(MCOperand::CreateReg(0)); // base - LEA.addOperand(MCOperand::CreateImm(1)); // scale - LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // index - LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp - LEA.addOperand(MCOperand::CreateReg(0)); // seg + 
LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::createReg(0)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(X86::EBX)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg } EmitAndCountInstruction(LEA); @@ -741,7 +747,7 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, } StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; - MCSymbol *tlsGetAddr = context.GetOrCreateSymbol(name); + MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name); const MCSymbolRefExpr *tlsRef = MCSymbolRefExpr::Create(tlsGetAddr, MCSymbolRefExpr::VK_PLT, @@ -803,51 +809,53 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSu } // while (NumBytes) } -static void LowerSTATEPOINT(MCStreamer &OS, StackMaps &SM, - const MachineInstr &MI, bool Is64Bit, - const TargetMachine& TM, - const MCSubtargetInfo& STI, - X86MCInstLower &MCInstLowering) { - assert(Is64Bit && "Statepoint currently only supports X86-64"); - - // Lower call target and choose correct opcode - const MachineOperand &call_target = StatepointOpers(&MI).getCallTarget(); - MCOperand call_target_mcop; - unsigned call_opcode; - switch (call_target.getType()) { - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: - call_target_mcop = MCInstLowering.LowerSymbolOperand( - call_target, - MCInstLowering.GetSymbolFromOperand(call_target)); - call_opcode = X86::CALL64pcrel32; - // Currently, we only support relative addressing with statepoints. - // Otherwise, we'll need a scratch register to hold the target - // address. You'll fail asserts during load & relocation if this - // symbol is to far away. (TODO: support non-relative addressing) - break; - case MachineOperand::MO_Immediate: - call_target_mcop = MCOperand::CreateImm(call_target.getImm()); - call_opcode = X86::CALL64pcrel32; - // Currently, we only support relative addressing with statepoints. - // Otherwise, we'll need a scratch register to hold the target - // immediate. You'll fail asserts during load & relocation if this - // address is to far away. (TODO: support non-relative addressing) - break; - case MachineOperand::MO_Register: - call_target_mcop = MCOperand::CreateReg(call_target.getReg()); - call_opcode = X86::CALL64r; - break; - default: - llvm_unreachable("Unsupported operand type in statepoint call target"); - break; - } +void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, + X86MCInstLower &MCIL) { + assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64"); + + StatepointOpers SOpers(&MI); + if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { + EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(), + getSubtargetInfo()); + } else { + // Lower call target and choose correct opcode + const MachineOperand &CallTarget = SOpers.getCallTarget(); + MCOperand CallTargetMCOp; + unsigned CallOpcode; + switch (CallTarget.getType()) { + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + CallTargetMCOp = MCIL.LowerSymbolOperand( + CallTarget, MCIL.GetSymbolFromOperand(CallTarget)); + CallOpcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // address. You'll fail asserts during load & relocation if this + // symbol is to far away. 
(TODO: support non-relative addressing) + break; + case MachineOperand::MO_Immediate: + CallTargetMCOp = MCOperand::createImm(CallTarget.getImm()); + CallOpcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // immediate. You'll fail asserts during load & relocation if this + // address is to far away. (TODO: support non-relative addressing) + break; + case MachineOperand::MO_Register: + CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); + CallOpcode = X86::CALL64r; + break; + default: + llvm_unreachable("Unsupported operand type in statepoint call target"); + break; + } - // Emit call - MCInst call_inst; - call_inst.setOpcode(call_opcode); - call_inst.addOperand(call_target_mcop); - OS.EmitInstruction(call_inst, STI); + // Emit call + MCInst CallInst; + CallInst.setOpcode(CallOpcode); + CallInst.addOperand(CallTargetMCOp); + OutStreamer->EmitInstruction(CallInst, getSubtargetInfo()); + } // Record our statepoint node in the same section used by STACKMAP // and PATCHPOINT @@ -858,7 +866,7 @@ static void LowerSTATEPOINT(MCStreamer &OS, StackMaps &SM, // Lower a stackmap of the form: // <id>, <shadowBytes>, ... void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { - SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); SM.recordStackMap(MI); unsigned NumShadowBytes = MI.getOperand(1).getImm(); SMShadowTracker.reset(NumShadowBytes); @@ -866,18 +874,40 @@ void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { // Lower a patchpoint of the form: // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ... -void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI) { +void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, + X86MCInstLower &MCIL) { assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64"); - SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); SM.recordPatchPoint(MI); PatchPointOpers opers(&MI); unsigned ScratchIdx = opers.getNextScratchIdx(); unsigned EncodedBytes = 0; - int64_t CallTarget = opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); - if (CallTarget) { + const MachineOperand &CalleeMO = + opers.getMetaOper(PatchPointOpers::TargetPos); + + // Check for null target. If target is non-null (i.e. is non-zero or is + // symbolic) then emit a call. + if (!(CalleeMO.isImm() && !CalleeMO.getImm())) { + MCOperand CalleeMCOp; + switch (CalleeMO.getType()) { + default: + /// FIXME: Add a verifier check for bad callee types. + llvm_unreachable("Unrecognized callee operand type."); + case MachineOperand::MO_Immediate: + if (CalleeMO.getImm()) + CalleeMCOp = MCOperand::createImm(CalleeMO.getImm()); + break; + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + CalleeMCOp = + MCIL.LowerSymbolOperand(CalleeMO, + MCIL.GetSymbolFromOperand(CalleeMO)); + break; + } + // Emit MOV to materialize the target address and the CALL to target. // This is encoded with 12-13 bytes, depending on which register is used. 
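As the comment above says, the materialized call is MOV64ri followed by CALL64r: 12 bytes normally, 13 when the scratch register is one of r8-r15 and the call picks up a REX prefix; whatever remains of the requested patchpoint size is filled with nops. The arithmetic as a standalone sketch (byte counts taken from the comment above, not re-derived here):

  #include <cassert>

  // Nop bytes a patchpoint still owes after the MOV64ri + CALL64r sequence.
  unsigned patchpointPadding(unsigned NumBytes, bool ScratchRegNeedsREX) {
    unsigned EncodedBytes = ScratchRegNeedsREX ? 13 : 12;
    assert(NumBytes >= EncodedBytes &&
           "Patchpoint can't request size less than the length of a call.");
    return NumBytes - EncodedBytes;
  }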
unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg(); @@ -885,16 +915,18 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI) { EncodedBytes = 13; else EncodedBytes = 12; - EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg) - .addImm(CallTarget)); + + EmitAndCountInstruction( + MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp)); EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } + // Emit padding. unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); assert(NumBytes >= EncodedBytes && "Patchpoint can't request size less than the length of a call."); - EmitNops(OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(), + EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(), getSubtargetInfo()); } @@ -988,8 +1020,7 @@ static std::string getShuffleComment(const MachineOperand &DstOp, void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); - const X86RegisterInfo *RI = static_cast<const X86RegisterInfo *>( - TM.getSubtargetImpl()->getRegisterInfo()); + const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: @@ -997,7 +1028,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { // Emit nothing here but a comment if we can. case X86::Int_MemBarrier: - OutStreamer.emitRawComment("MEMBARRIER"); + OutStreamer->emitRawComment("MEMBARRIER"); return; @@ -1005,15 +1036,21 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::EH_RETURN64: { // Lower these as normal, but add some comments. unsigned Reg = MI->getOperand(0).getReg(); - OutStreamer.AddComment(StringRef("eh_return, addr: %") + - X86ATTInstPrinter::getRegisterName(Reg)); + OutStreamer->AddComment(StringRef("eh_return, addr: %") + + X86ATTInstPrinter::getRegisterName(Reg)); break; } case X86::TAILJMPr: + case X86::TAILJMPm: case X86::TAILJMPd: + case X86::TAILJMPr64: + case X86::TAILJMPm64: case X86::TAILJMPd64: + case X86::TAILJMPr64_REX: + case X86::TAILJMPm64_REX: + case X86::TAILJMPd64_REX: // Lower these as normal, but add some comments. - OutStreamer.AddComment("TAILCALL"); + OutStreamer->AddComment("TAILCALL"); break; case X86::TLS_addr32: @@ -1037,7 +1074,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext))); // Emit the label. - OutStreamer.EmitLabel(PICBase); + OutStreamer->EmitLabel(PICBase); // popl $reg EmitAndCountInstruction(MCInstBuilder(X86::POP32r) @@ -1057,8 +1094,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { // MYGLOBAL + (. - PICBASE) // However, we can't generate a ".", so just emit a new label here and refer // to it. - MCSymbol *DotSym = OutContext.CreateTempSymbol(); - OutStreamer.EmitLabel(DotSym); + MCSymbol *DotSym = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(DotSym); // Now that we have emitted the label, lower the complex operand expression. 
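The temp label emitted just above exists because MC has no '.' (current address) operator, so the wanted constant MYGLOBAL + (. - PICBASE) must be spelled with an explicitly placed symbol. A hedged sketch of the expression that gets built next (same Create-style MCExpr factories used elsewhere in this file; DotSym and PICBase are the symbols above, OpSym is the operand symbol retrieved on the next line):

  // MYGLOBAL + (DotSym - PICBase), with DotSym standing in for '.'.
  const MCExpr *DotExpr     = MCSymbolRefExpr::Create(DotSym, OutContext);
  const MCExpr *PICBaseExpr = MCSymbolRefExpr::Create(PICBase, OutContext);
  const MCExpr *OpExpr      = MCSymbolRefExpr::Create(OpSym, OutContext);
  const MCExpr *PCRelOffset =
      MCBinaryExpr::CreateSub(DotExpr, PICBaseExpr, OutContext);
  const MCExpr *PICOffset =
      MCBinaryExpr::CreateAdd(OpExpr, PCRelOffset, OutContext);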
MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2)); @@ -1078,14 +1115,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case TargetOpcode::STATEPOINT: - return LowerSTATEPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), TM, - getSubtargetInfo(), MCInstLowering); + return LowerSTATEPOINT(*MI, MCInstLowering); case TargetOpcode::STACKMAP: return LowerSTACKMAP(*MI); case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(*MI); + return LowerPATCHPOINT(*MI, MCInstLowering); case X86::MORESTACK_RET: EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); @@ -1100,34 +1136,34 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; case X86::SEH_PushReg: - OutStreamer.EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); + OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); return; case X86::SEH_SaveReg: - OutStreamer.EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), - MI->getOperand(1).getImm()); + OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); return; case X86::SEH_SaveXMM: - OutStreamer.EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), - MI->getOperand(1).getImm()); + OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); return; case X86::SEH_StackAlloc: - OutStreamer.EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm()); return; case X86::SEH_SetFrame: - OutStreamer.EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), - MI->getOperand(1).getImm()); + OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); return; case X86::SEH_PushFrame: - OutStreamer.EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm()); return; case X86::SEH_EndPrologue: - OutStreamer.EmitWinCFIEndProlog(); + OutStreamer->EmitWinCFIEndProlog(); return; case X86::SEH_Epilogue: { @@ -1151,7 +1187,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::PSHUFBrm: case X86::VPSHUFBrm: case X86::VPSHUFBYrm: { - if (!OutStreamer.isVerboseAsm()) + if (!OutStreamer->isVerboseAsm()) break; assert(MI->getNumOperands() > 5 && "We should always have at least 5 operands!"); @@ -1163,7 +1199,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodePSHUFBMask(C, Mask); if (!Mask.empty()) - OutStreamer.AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); } break; } @@ -1171,7 +1207,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPERMILPDrm: case X86::VPERMILPSYrm: case X86::VPERMILPDYrm: { - if (!OutStreamer.isVerboseAsm()) + if (!OutStreamer->isVerboseAsm()) break; assert(MI->getNumOperands() > 5 && "We should always have at least 5 operands!"); @@ -1183,7 +1219,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodeVPERMILPMask(C, Mask); if (!Mask.empty()) - OutStreamer.AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); } break; } @@ -1208,7 +1244,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::MOVDQUrm: case X86::VMOVDQUrm: case X86::VMOVDQUYrm: - if (!OutStreamer.isVerboseAsm()) + if 
(!OutStreamer->isVerboseAsm()) break; if (MI->getNumOperands() > 4) if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { @@ -1231,7 +1267,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { CS << "?"; } CS << "]"; - OutStreamer.AddComment(CS.str()); + OutStreamer->AddComment(CS.str()); } else if (auto *CV = dyn_cast<ConstantVector>(C)) { CS << "<"; for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { @@ -1251,7 +1287,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << ">"; - OutStreamer.AddComment(CS.str()); + OutStreamer->AddComment(CS.str()); } } break; @@ -1269,9 +1305,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SMShadowTracker.count(TmpInst, getSubtargetInfo()); // Then flush the shadow so that we fill with nops before the call, not // after it. - SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); // Then emit the call - OutStreamer.EmitInstruction(TmpInst, getSubtargetInfo()); + OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo()); return; } diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 9fd03a7..d598b55 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -50,6 +50,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// ReturnAddrIndex - FrameIndex for return slot. int ReturnAddrIndex; + /// \brief FrameIndex for return slot. + int FrameAddrIndex; + /// TailCallReturnAddrDelta - The number of bytes by which return address /// stack slot is moved as the result of tail call optimization. int TailCallReturnAddrDelta; @@ -92,6 +95,7 @@ public: CalleeSavedFrameSize(0), BytesToPopOnReturn(0), ReturnAddrIndex(0), + FrameAddrIndex(0), TailCallReturnAddrDelta(0), SRetReturnReg(0), GlobalBaseReg(0), @@ -109,6 +113,7 @@ public: CalleeSavedFrameSize(0), BytesToPopOnReturn(0), ReturnAddrIndex(0), + FrameAddrIndex(0), TailCallReturnAddrDelta(0), SRetReturnReg(0), GlobalBaseReg(0), @@ -139,6 +144,9 @@ public: int getRAIndex() const { return ReturnAddrIndex; } void setRAIndex(int Index) { ReturnAddrIndex = Index; } + int getFAIndex() const { return FrameAddrIndex; } + void setFAIndex(int Index) { FrameAddrIndex = Index; } + int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;} diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp index adc05b2..143e70b 100644 --- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -51,7 +51,7 @@ namespace { struct PadShortFunc : public MachineFunctionPass { static char ID; PadShortFunc() : MachineFunctionPass(ID) - , Threshold(4), TM(nullptr), TII(nullptr) {} + , Threshold(4), STI(nullptr), TII(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -79,7 +79,7 @@ namespace { // VisitedBBs - Cache of previously visited BBs. DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs; - const TargetMachine *TM; + const X86Subtarget *STI; const TargetInstrInfo *TII; }; @@ -93,19 +93,16 @@ FunctionPass *llvm::createX86PadShortFunctions() { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// NOOP instructions before early exits. 
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { - const AttributeSet &FnAttrs = MF.getFunction()->getAttributes(); - if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::MinSize)) { + if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || + MF.getFunction()->hasFnAttribute(Attribute::MinSize)) { return false; } - TM = &MF.getTarget(); - if (!TM->getSubtarget<X86Subtarget>().padShortFunctions()) + STI = &MF.getSubtarget<X86Subtarget>(); + if (!STI->padShortFunctions()) return false; - TII = TM->getSubtargetImpl()->getInstrInfo(); + TII = STI->getInstrInfo(); // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); @@ -195,8 +192,7 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, return true; } - CyclesToEnd += TII->getInstrLatency( - TM->getSubtargetImpl()->getInstrItineraryData(), MI); + CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI); } VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd); diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index 0fa38f4..1f36163 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "X86RegisterInfo.h" +#include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" @@ -53,34 +54,35 @@ static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); -X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) - : X86GenRegisterInfo( - (STI.is64Bit() ? X86::RIP : X86::EIP), - X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false), - X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true), - (STI.is64Bit() ? X86::RIP : X86::EIP)), - Subtarget(STI) { +X86RegisterInfo::X86RegisterInfo(const Triple &TT) + : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), + (TT.isArch64Bit() ? X86::RIP : X86::EIP)) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. - Is64Bit = Subtarget.is64Bit(); - IsWin64 = Subtarget.isTargetWin64(); + Is64Bit = TT.isArch64Bit(); + IsWin64 = Is64Bit && TT.isOSWindows(); + // Use a callee-saved register as the base pointer. These registers must + // not conflict with any ABI requirements. For example, in 32-bit mode PIC + // requires GOT in the EBX register before function calls via PLT GOT pointer. if (Is64Bit) { SlotSize = 8; - StackPtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? - X86::RSP : X86::ESP; - FramePtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? - X86::RBP : X86::EBP; + // This matches the simplified 32-bit pointer code in the data layout + // computation. + // FIXME: Should use the data layout? + bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32; + StackPtr = Use64BitReg ? X86::RSP : X86::ESP; + FramePtr = Use64BitReg ? X86::RBP : X86::EBP; + BasePtr = Use64BitReg ? X86::RBX : X86::EBX; } else { SlotSize = 4; StackPtr = X86::ESP; FramePtr = X86::EBP; + BasePtr = X86::ESI; } - // Use a callee-saved register as the base pointer. These registers must - // not conflict with any ABI requirements. 
For example, in 32-bit mode PIC - // requires GOT in the EBX register before function calls via PLT GOT pointer. - BasePtr = Is64Bit ? X86::RBX : X86::ESI; } bool @@ -119,8 +121,9 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); } -const TargetRegisterClass* -X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ +const TargetRegisterClass * +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { // Don't allow super-classes of GR8_NOREX. This class is only used after // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied // to the full GR8 register class in 64-bit mode, so we cannot allow the @@ -160,6 +163,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ const TargetRegisterClass * X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. @@ -171,9 +175,9 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; case 2: // Available for tailcall (not callee-saved GPRs). - if (Subtarget.isTargetWin64()) + if (IsWin64) return &X86::GR64_TCW64RegClass; - else if (Subtarget.is64Bit()) + else if (Is64Bit) return &X86::GR64_TCRegClass; const Function *F = MF.getFunction(); @@ -209,7 +213,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case X86::GR64RegClassID: return 12 - FPDiff; case X86::VR128RegClassID: - return Subtarget.is64Bit() ? 10 : 4; + return Is64Bit ? 
10 : 4; case X86::VR64RegClassID: return 4; } @@ -217,8 +221,10 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); + bool CallsEHReturn = MF->getMMI().callsEHReturn(); assert(MF && "MachineFunction required"); switch (MF->getFunction()->getCallingConv()) { @@ -252,11 +258,16 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Is64Bit) return CSR_64_MostRegs_SaveList; break; + case CallingConv::X86_64_Win64: + return CSR_Win64_SaveList; + case CallingConv::X86_64_SysV: + if (CallsEHReturn) + return CSR_64EHRet_SaveList; + return CSR_64_SaveList; default: break; } - bool CallsEHReturn = MF->getMMI().callsEHReturn(); if (Is64Bit) { if (IsWin64) return CSR_Win64_SaveList; @@ -269,8 +280,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_32_SaveList; } -const uint32_t* -X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { +const uint32_t * +X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); @@ -307,6 +320,10 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { break; default: break; + case CallingConv::X86_64_Win64: + return CSR_Win64_RegMask; + case CallingConv::X86_64_SysV: + return CSR_64_RegMask; } // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check @@ -348,13 +365,15 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Set the base-pointer register and its aliases as reserved if needed. if (hasBasePointer(MF)) { CallingConv::ID CC = MF.getFunction()->getCallingConv(); - const uint32_t* RegMask = getCallPreservedMask(CC); + const uint32_t *RegMask = getCallPreservedMask(MF, CC); if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister())) report_fatal_error( "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true); + unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), MVT::i64, + false); + for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); } @@ -390,7 +409,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(*AI); } } - if (!Is64Bit || !Subtarget.hasAVX512()) { + if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) { for (unsigned n = 16; n != 32; ++n) { for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); @@ -445,10 +464,8 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const Function *F = MF.getFunction(); unsigned StackAlign = MF.getSubtarget().getFrameLowering()->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); + bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || + F->hasFnAttribute(Attribute::StackAlignment)); // If we've requested that we force align the stack do so now. 
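getReservedRegs above checks the call-preserved mask for the current calling convention and reports a fatal error if that mask clobbers the chosen base pointer. A regmask is just a bit vector with one bit per physical register, set when the register survives the call; a standalone sketch of the test (same semantics as MachineOperand::clobbersPhysReg, stripped of the surrounding class):

  #include <cstdint>

  // One bit per physreg; a set bit means "preserved across the call".
  inline bool clobbersPhysReg(const uint32_t *RegMask, unsigned PhysReg) {
    return !(RegMask[PhysReg / 32] & (1u << (PhysReg % 32)));
  }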
if (ForceStackAlign) @@ -475,7 +492,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned BasePtr; unsigned Opc = MI.getOpcode(); - bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm; + bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || + Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; if (hasBasePointer(MF)) BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); else if (needsStackRealignment(MF)) @@ -485,6 +503,24 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + // FRAME_ALLOC uses a single offset, with no register. It only works in the + // simple FP case, and doesn't work with stack realignment. On 32-bit, the + // offset is from the traditional base pointer location. On 64-bit, the + // offset is from the SP at the end of the prologue, not the FP location. This + // matches the behavior of llvm.frameaddress. + if (Opc == TargetOpcode::FRAME_ALLOC) { + MachineOperand &FI = MI.getOperand(FIOperandNum); + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + int Offset; + if (IsWinEH) + Offset = static_cast<const X86FrameLowering *>(TFI) + ->getFrameIndexOffsetFromSP(MF, FrameIndex); + else + Offset = TFI->getFrameIndexOffset(MF, FrameIndex); + FI.ChangeToImmediate(Offset); + return; + } + // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. @@ -536,8 +572,9 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { return TFI->hasFP(MF) ? FramePtr : StackPtr; } -unsigned X86RegisterInfo::getPtrSizedFrameRegister( - const MachineFunction &MF) const { +unsigned +X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); unsigned FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index 406b1fc..a714c2a 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -20,14 +20,9 @@ #include "X86GenRegisterInfo.inc" namespace llvm { - class Type; - class TargetInstrInfo; - class X86Subtarget; + class Triple; class X86RegisterInfo final : public X86GenRegisterInfo { -public: - const X86Subtarget &Subtarget; - private: /// Is64Bit - Is the target 64-bits. /// @@ -55,7 +50,7 @@ private: unsigned BasePtr; public: - X86RegisterInfo(const X86Subtarget &STI); + X86RegisterInfo(const Triple &TT); // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; @@ -76,8 +71,9 @@ public: getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const override; - const TargetRegisterClass* - getLargestLegalSuperClass(const TargetRegisterClass *RC) const override; + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. @@ -98,7 +94,8 @@ public: /// callee-save registers on this target. 
const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const; /// getReservedRegs - Returns a bitset indexed by physical register number diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index 30cd09a..2e735fa 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -325,7 +325,7 @@ def GR8 : RegisterClass<"X86", [i8], 8, R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { let AltOrders = [(sub GR8, AH, BH, CH, DH)]; let AltOrderSelect = [{ - return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit(); + return MF.getSubtarget<X86Subtarget>().is64Bit(); }]; } @@ -377,7 +377,7 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, AH, CH, DH, BL, BH)> { let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)]; let AltOrderSelect = [{ - return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit(); + return MF.getSubtarget<X86Subtarget>().is64Bit(); }]; } // GR16_NOREX - GR16 registers which do not require a REX prefix. @@ -469,18 +469,18 @@ def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers -def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} -def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} -def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} -def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} +def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} +def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;} +def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;} +def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;} def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} -def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;} -def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} -def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} -def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} +def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;} +def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;} +def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;} +def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;} def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td index 73a3230..677e824 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -1895,7 +1895,7 @@ def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>; // x,m / v,v,m. 
def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 4; + let Latency = 9; let NumMicroOps = 2; let ResourceCycles = [1, 1]; } @@ -2014,7 +2014,7 @@ def : InstRW<[WriteFMADDr], // 3p forms. "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?", // 3s forms. - "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)r", + "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r", // 4s/4s_int forms. "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", // 4p forms. @@ -2031,7 +2031,7 @@ def : InstRW<[WriteFMADDm], // 3p forms. "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?", // 3s forms. - "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)m", + "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m", // 4s/4s_int forms. "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", // 4p forms. diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index 821044f..5ca40bc 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -57,12 +57,13 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, bool isVolatile, MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); #ifndef NDEBUG // If the base register might conflict with our physical registers, bail out. - unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI, - X86::ECX, X86::EAX, X86::EDI}; + const unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI, + X86::ECX, X86::EAX, X86::EDI}; assert(!isBaseRegConflictPossible(DAG, ClobberSet)); #endif @@ -137,22 +138,22 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, default: // Byte aligned AVT = MVT::i8; ValReg = X86::AL; - Count = DAG.getIntPtrConstant(SizeVal); + Count = DAG.getIntPtrConstant(SizeVal, dl); break; } if (AVT.bitsGT(MVT::i8)) { unsigned UBytes = AVT.getSizeInBits() / 8; - Count = DAG.getIntPtrConstant(SizeVal / UBytes); + Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl); BytesLeft = SizeVal % UBytes; } - Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), InFlag); InFlag = Chain.getValue(1); } else { AVT = MVT::i8; - Count = DAG.getIntPtrConstant(SizeVal); + Count = DAG.getIntPtrConstant(SizeVal, dl); Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); InFlag = Chain.getValue(1); } @@ -173,7 +174,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, Count = Size; EVT CVT = Count.getValueType(); SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, - DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); + DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, + CVT)); Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, Left, InFlag); @@ -189,27 +191,26 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, Chain = DAG.getMemset(Chain, dl, DAG.getNode(ISD::ADD, dl, AddrVT, Dst, - DAG.getConstant(Offset, AddrVT)), + DAG.getConstant(Offset, dl, AddrVT)), Src, - DAG.getConstant(BytesLeft, SizeVT), - Align, isVolatile, DstPtrInfo.getWithOffset(Offset)); + DAG.getConstant(BytesLeft, dl, SizeVT), + Align, isVolatile, false, + DstPtrInfo.getWithOffset(Offset)); } // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. 
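The memset lowering above picks the widest store element the known alignment allows, programs the count register (ECX/RCX) with SizeVal / UBytes elements for the string store, and leaves SizeVal % UBytes bytes for a narrow follow-up memset at the end of the buffer. The size split as a standalone sketch:

  #include <cstdint>

  struct RepStosPlan {
    uint64_t Count;     // element count loaded into ECX/RCX for the string store
    uint64_t BytesLeft; // tail bytes handled by a follow-up narrow memset
  };

  // UnitBytes is 1, 2, 4 or 8, depending on the alignment-selected value type.
  RepStosPlan planRepStos(uint64_t SizeVal, unsigned UnitBytes) {
    return {SizeVal / UnitBytes, SizeVal % UnitBytes};
  }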
return Chain; } -SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, - SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, bool AlwaysInline, - MachinePointerInfo DstPtrInfo, - MachinePointerInfo SrcPtrInfo) const { +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); if (!ConstantSize) return SDValue(); uint64_t SizeVal = ConstantSize->getZExtValue(); @@ -229,8 +230,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, return SDValue(); // If the base register might conflict with our physical registers, bail out. - unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, - X86::ECX, X86::ESI, X86::EDI}; + const unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, + X86::ECX, X86::ESI, X86::EDI}; if (isBaseRegConflictPossible(DAG, ClobberSet)) return SDValue(); @@ -248,7 +249,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned UBytes = AVT.getSizeInBits() / 8; unsigned CountVal = SizeVal / UBytes; - SDValue Count = DAG.getIntPtrConstant(CountVal); + SDValue Count = DAG.getIntPtrConstant(CountVal, dl); unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; @@ -279,11 +280,13 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, EVT SizeVT = Size.getValueType(); Results.push_back(DAG.getMemcpy(Chain, dl, DAG.getNode(ISD::ADD, dl, DstVT, Dst, - DAG.getConstant(Offset, DstVT)), + DAG.getConstant(Offset, dl, + DstVT)), DAG.getNode(ISD::ADD, dl, SrcVT, Src, - DAG.getConstant(Offset, SrcVT)), - DAG.getConstant(BytesLeft, SizeVT), - Align, isVolatile, AlwaysInline, + DAG.getConstant(Offset, dl, + SrcVT)), + DAG.getConstant(BytesLeft, dl, SizeVT), + Align, isVolatile, AlwaysInline, false, DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset))); } diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index 9831698..1cdab14 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -257,10 +257,8 @@ void X86Subtarget::initializeEnvironment() { HasVLX = false; HasADX = false; HasSHA = false; - HasSGX = false; HasPRFCHW = false; HasRDSEED = false; - HasSMAP = false; IsBTMemSlow = false; IsSHLDSlow = false; IsUAMemFast = false; @@ -280,46 +278,7 @@ void X86Subtarget::initializeEnvironment() { stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; -} - -static std::string computeDataLayout(const Triple &TT) { - // X86 is little endian - std::string Ret = "e"; - - Ret += DataLayout::getManglingComponent(TT); - // X86 and x32 have 32 bit pointers. - if ((TT.isArch64Bit() && - (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) || - !TT.isArch64Bit()) - Ret += "-p:32:32"; - - // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. 
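The memcpy lowering above applies the same bulk-plus-tail split and additionally bails out when the address base register is one of the fixed registers the string move will clobber. A hedged sketch of the membership test at the core of that check (assumes the X86 register enum; the list mirrors the ClobberSet in the hunk, while the real isBaseRegConflictPossible walks the DAG to find the base registers first):

  // Give up on the custom string-move lowering if BaseReg is one of the
  // registers the expansion itself needs.
  static bool conflictsWithRepMovs(unsigned BaseReg) {
    const unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
                                   X86::ECX, X86::ESI, X86::EDI};
    for (unsigned Reg : ClobberSet)
      if (Reg == BaseReg)
        return true;
    return false;
  }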
- if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) - Ret += "-i64:64"; - else - Ret += "-f64:32:64"; - - // Some ABIs align long double to 128 bits, others to 32. - if (TT.isOSNaCl()) - ; // No f80 - else if (TT.isArch64Bit() || TT.isOSDarwin()) - Ret += "-f80:128"; - else - Ret += "-f80:32"; - - // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. - if (TT.isArch64Bit()) - Ret += "-n8:16:32:64"; - else - Ret += "-n8:16:32"; - - // The stack is aligned to 32 bits on some ABIs and 128 bits on others. - if (!TT.isArch64Bit() && TT.isOSWindows()) - Ret += "-S32"; - else - Ret += "-S128"; - - return Ret; + UseSoftFloat = false; } X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, @@ -334,16 +293,16 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TargetTriple(TT), - DL(computeDataLayout(TargetTriple)), StackAlignOverride(StackAlignOverride), In64BitMode(TargetTriple.getArch() == Triple::x86_64), In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), - TSInfo(DL), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - TLInfo(TM), FrameLowering(TargetFrameLowering::StackGrowsDown, - getStackAlignment(), is64Bit() ? -8 : -4) { + TSInfo(*TM.getDataLayout()), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), + FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(), + is64Bit() ? -8 : -4) { // Determine the PICStyle based on the target selected. if (TM.getRelocationModel() == Reloc::Static) { // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index e0263d6..455dd77 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -31,7 +31,7 @@ class GlobalValue; class StringRef; class TargetMachine; -/// PICStyles - The X86 backend supports a number of different styles of PIC. +/// The X86 backend supports a number of different styles of PIC. /// namespace PICStyles { enum Style { @@ -58,105 +58,95 @@ protected: Others, IntelAtom, IntelSLM }; - /// X86ProcFamily - X86 processor family: Intel Atom, and others + /// X86 processor family: Intel Atom, and others X86ProcFamilyEnum X86ProcFamily; - /// PICStyle - Which PIC style to use - /// + /// Which PIC style to use PICStyles::Style PICStyle; - /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or - /// none supported. + /// MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. X86SSEEnum X86SSELevel; - /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported. - /// + /// 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel; - /// HasCMov - True if this processor has conditional move instructions + /// True if this processor has conditional move instructions /// (generally pentium pro+). bool HasCMov; - /// HasX86_64 - True if the processor supports X86-64 instructions. - /// + /// True if the processor supports X86-64 instructions. bool HasX86_64; - /// HasPOPCNT - True if the processor supports POPCNT. + /// True if the processor supports POPCNT. bool HasPOPCNT; - /// HasSSE4A - True if the processor supports SSE4A instructions. 
+ /// True if the processor supports SSE4A instructions. bool HasSSE4A; - /// HasAES - Target has AES instructions + /// Target has AES instructions bool HasAES; - /// HasPCLMUL - Target has carry-less multiplication + /// Target has carry-less multiplication bool HasPCLMUL; - /// HasFMA - Target has 3-operand fused multiply-add + /// Target has 3-operand fused multiply-add bool HasFMA; - /// HasFMA4 - Target has 4-operand fused multiply-add + /// Target has 4-operand fused multiply-add bool HasFMA4; - /// HasXOP - Target has XOP instructions + /// Target has XOP instructions bool HasXOP; - /// HasTBM - Target has TBM instructions. + /// Target has TBM instructions. bool HasTBM; - /// HasMOVBE - True if the processor has the MOVBE instruction. + /// True if the processor has the MOVBE instruction. bool HasMOVBE; - /// HasRDRAND - True if the processor has the RDRAND instruction. + /// True if the processor has the RDRAND instruction. bool HasRDRAND; - /// HasF16C - Processor has 16-bit floating point conversion instructions. + /// Processor has 16-bit floating point conversion instructions. bool HasF16C; - /// HasFSGSBase - Processor has FS/GS base insturctions. + /// Processor has FS/GS base insturctions. bool HasFSGSBase; - /// HasLZCNT - Processor has LZCNT instruction. + /// Processor has LZCNT instruction. bool HasLZCNT; - /// HasBMI - Processor has BMI1 instructions. + /// Processor has BMI1 instructions. bool HasBMI; - /// HasBMI2 - Processor has BMI2 instructions. + /// Processor has BMI2 instructions. bool HasBMI2; - /// HasRTM - Processor has RTM instructions. + /// Processor has RTM instructions. bool HasRTM; - /// HasHLE - Processor has HLE. + /// Processor has HLE. bool HasHLE; - /// HasADX - Processor has ADX instructions. + /// Processor has ADX instructions. bool HasADX; - /// HasSHA - Processor has SHA instructions. + /// Processor has SHA instructions. bool HasSHA; - /// HasSGX - Processor has SGX instructions. - bool HasSGX; - - /// HasPRFCHW - Processor has PRFCHW instructions. + /// Processor has PRFCHW instructions. bool HasPRFCHW; - /// HasRDSEED - Processor has RDSEED instructions. + /// Processor has RDSEED instructions. bool HasRDSEED; - /// HasSMAP - Processor has SMAP instructions. - bool HasSMAP; - - /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; - /// IsSHLDSlow - True if SHLD instructions are slow. + /// True if SHLD instructions are slow. bool IsSHLDSlow; - /// IsUAMemFast - True if unaligned memory access is fast. + /// True if unaligned memory access is fast. bool IsUAMemFast; /// True if unaligned 32-byte memory accesses are slow. @@ -166,37 +156,38 @@ protected: /// This may require setting a configuration bit in the processor. bool HasSSEUnalignedMem; - /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction; + /// True if this processor has the CMPXCHG16B instruction; /// this is true for most x86-64 chips, but not the first AMD chips. bool HasCmpxchg16b; - /// UseLeaForSP - True if the LEA instruction should be used for adjusting + /// True if the LEA instruction should be used for adjusting /// the stack pointer. This is an optimization for Intel Atom processors. bool UseLeaForSP; - /// HasSlowDivide32 - True if 8-bit divisions are significantly faster than + /// True if 8-bit divisions are significantly faster than /// 32-bit divisions and should be used when possible. 
bool HasSlowDivide32; - /// HasSlowDivide64 - True if 16-bit divides are significantly faster than + /// True if 16-bit divides are significantly faster than /// 64-bit divisions and should be used when possible. bool HasSlowDivide64; - /// PadShortFunctions - True if the short functions should be padded to prevent + /// True if the short functions should be padded to prevent /// a stall when returning too early. bool PadShortFunctions; - /// CallRegIndirect - True if the Calls with memory reference should be converted + /// True if the Calls with memory reference should be converted /// to a register-based indirect call. bool CallRegIndirect; - /// LEAUsesAG - True if the LEA instruction inputs have to be ready at - /// address generation (AG) time. + + /// True if the LEA instruction inputs have to be ready at address generation + /// (AG) time. bool LEAUsesAG; - /// SlowLEA - True if the LEA instruction with certain arguments is slow + /// True if the LEA instruction with certain arguments is slow bool SlowLEA; - /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags + /// True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; /// Use the RSQRT* instructions to optimize square root calculations. @@ -227,7 +218,10 @@ protected: /// Processor has AVX-512 Vector Length eXtenstions bool HasVLX; - /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// Use software floating point for code generation. + bool UseSoftFloat; + + /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -235,26 +229,24 @@ protected: /// unsigned MaxInlineSizeThreshold; - /// TargetTriple - What processor and OS we're targeting. + /// What processor and OS we're targeting. Triple TargetTriple; /// Instruction itineraries for scheduling InstrItineraryData InstrItins; private: - // Calculates type size & alignment - const DataLayout DL; - /// StackAlignOverride - Override the stack alignment. + /// Override the stack alignment. unsigned StackAlignOverride; - /// In64BitMode - True if compiling for 64-bit, false for 16-bit or 32-bit. + /// True if compiling for 64-bit, false for 16-bit or 32-bit. bool In64BitMode; - /// In32BitMode - True if compiling for 32-bit, false for 16-bit or 64-bit. + /// True if compiling for 32-bit, false for 16-bit or 64-bit. bool In32BitMode; - /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit. + /// True if compiling for 16-bit, false for 32-bit or 64-bit. bool In16BitMode; X86SelectionDAGInfo TSInfo; @@ -276,7 +268,6 @@ public: return &TLInfo; } const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const DataLayout *getDataLayout() const override { return &DL; } const X86FrameLowering *getFrameLowering() const override { return &FrameLowering; } @@ -287,12 +278,12 @@ public: return &getInstrInfo()->getRegisterInfo(); } - /// getStackAlignment - Returns the minimum alignment known to hold of the + /// Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. unsigned getStackAlignment() const { return stackAlignment; } - /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. 
unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; } @@ -301,7 +292,7 @@ public: void ParseSubtargetFeatures(StringRef CPU, StringRef FS); private: - /// \brief Initialize the full set of dependencies so we can use an initializer + /// Initialize the full set of dependencies so we can use an initializer /// list for X86Subtarget. X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void initializeEnvironment(); @@ -370,10 +361,8 @@ public: bool hasHLE() const { return HasHLE; } bool hasADX() const { return HasADX; } bool hasSHA() const { return HasSHA; } - bool hasSGX() const { return HasSGX; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } - bool hasSMAP() const { return HasSMAP; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; } @@ -399,6 +388,7 @@ public: bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } + bool useSoftFloat() const { return UseSoftFloat; } const Triple &getTargetTriple() const { return TargetTriple; } @@ -406,6 +396,7 @@ public: bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } + bool isTargetPS4() const { return TargetTriple.isPS4(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } @@ -475,13 +466,11 @@ public: unsigned char ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM)const; - /// ClassifyBlockAddressReference - Classify a blockaddress reference for the - /// current subtarget according to how we should reference it in a non-pcrel - /// context. + /// Classify a blockaddress reference for the current subtarget according to + /// how we should reference it in a non-pcrel context. unsigned char ClassifyBlockAddressReference() const; - /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls - /// to immediate address. + /// Return true if the subtarget allows calls to immediate address. bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const; /// This function returns the name of a function which has an interface @@ -500,8 +489,7 @@ public: bool enableEarlyIfConversion() const override; - /// getInstrItins = Return the instruction itineraries based on the - /// subtarget selection. + /// Return the instruction itineraries based on the subtarget selection. 
const InstrItineraryData *getInstrItineraryData() const override { return &InstrItins; } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index 1fc6b20..3e5f1d8 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -14,9 +14,10 @@ #include "X86TargetMachine.h" #include "X86.h" #include "X86TargetObjectFile.h" +#include "X86TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" -#include "llvm/PassManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" @@ -36,10 +37,10 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return make_unique<TargetLoweringObjectFileMachO>(); } - if (TT.isOSLinux()) - return make_unique<X86LinuxTargetObjectFile>(); + if (TT.isOSLinux() || TT.isOSNaCl()) + return make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSBinFormatELF()) - return make_unique<TargetLoweringObjectFileELF>(); + return make_unique<X86ELFTargetObjectFile>(); if (TT.isKnownWindowsMSVCEnvironment()) return make_unique<X86WindowsTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) @@ -47,19 +48,56 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { llvm_unreachable("unknown subtarget type"); } +static std::string computeDataLayout(const Triple &TT) { + // X86 is little endian + std::string Ret = "e"; + + Ret += DataLayout::getManglingComponent(TT); + // X86 and x32 have 32 bit pointers. + if ((TT.isArch64Bit() && + (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) || + !TT.isArch64Bit()) + Ret += "-p:32:32"; + + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. + if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // Some ABIs align long double to 128 bits, others to 32. + if (TT.isOSNaCl()) + ; // No f80 + else if (TT.isArch64Bit() || TT.isOSDarwin()) + Ret += "-f80:128"; + else + Ret += "-f80:32"; + + // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. + if (TT.isArch64Bit()) + Ret += "-n8:16:32:64"; + else + Ret += "-n8:16:32"; + + // The stack is aligned to 32 bits on some ABIs and 128 bits on others. + if (!TT.isArch64Bit() && TT.isOSWindows()) + Ret += "-a:0:32-S32"; + else + Ret += "-S128"; + + return Ret; +} + /// X86TargetMachine ctor - Create an X86 target. /// X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + : LLVMTargetMachine(T, computeDataLayout(Triple(TT)), TT, CPU, FS, Options, + RM, CM, OL), TLOF(createTLOF(Triple(getTargetTriple()))), Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { - // default to hard float ABI - if (Options.FloatABIType == FloatABI::Default) - this->Options.FloatABIType = FloatABI::Hard; - // Windows stack unwinder gets confused when execution flow "falls through" // after a call to 'noreturn' function. // To prevent that, we emit a trap for 'unreachable' IR instructions. 
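[Editor's note] The new computeDataLayout helper above takes over the data layout that previously lived on the subtarget (note the DL member and getDataLayout() removed earlier in this diff). As a rough illustration of what it produces, the standalone sketch below mirrors its branch structure with plain flags; it is not LLVM code, and the "-m:e" mangling component for ELF triples is an assumption about what DataLayout::getManglingComponent returns.

// Minimal standalone sketch (not LLVM code): mirrors the branch structure of
// computeDataLayout above. The "-m:e" mangling component for ELF triples is
// an assumption about DataLayout::getManglingComponent's return value.
#include <iostream>
#include <string>

struct Flags {
  bool Is64Bit, IsX32, IsNaCl, IsWindows, IsDarwin;
};

static std::string sketchDataLayout(const Flags &TT, const std::string &Mangling) {
  std::string Ret = "e";                      // X86 is little endian
  Ret += Mangling;
  if ((TT.Is64Bit && (TT.IsX32 || TT.IsNaCl)) || !TT.Is64Bit)
    Ret += "-p:32:32";                        // 32-bit pointers
  if (TT.Is64Bit || TT.IsWindows || TT.IsNaCl)
    Ret += "-i64:64";
  else
    Ret += "-f64:32:64";
  if (TT.IsNaCl)
    ;                                         // no f80
  else if (TT.Is64Bit || TT.IsDarwin)
    Ret += "-f80:128";
  else
    Ret += "-f80:32";
  Ret += TT.Is64Bit ? "-n8:16:32:64" : "-n8:16:32";
  Ret += (!TT.Is64Bit && TT.IsWindows) ? "-a:0:32-S32" : "-S128";
  return Ret;
}

int main() {
  // x86_64-unknown-linux-gnu -> "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  std::cout << sketchDataLayout({true, false, false, false, false}, "-m:e") << "\n";
  // i686-unknown-linux-gnu   -> "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
  std::cout << sketchDataLayout({false, false, false, false, false}, "-m:e") << "\n";
}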
@@ -74,11 +112,8 @@ X86TargetMachine::~X86TargetMachine() {} const X86Subtarget * X86TargetMachine::getSubtargetImpl(const Function &F) const { - AttributeSet FnAttrs = F.getAttributes(); - Attribute CPUAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); - Attribute FSAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); + Attribute CPUAttr = F.getFnAttribute("target-cpu"); + Attribute FSAttr = F.getFnAttribute("target-features"); std::string CPU = !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString().str() @@ -92,14 +127,15 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // function before we can generate a subtarget. We also need to use // it as a key for the subtarget since that can be the only difference // between two functions. - Attribute SFAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float"); - bool SoftFloat = !SFAttr.hasAttribute(Attribute::None) - ? SFAttr.getValueAsString() == "true" - : Options.UseSoftFloat; - - auto &I = SubtargetMap[CPU + FS + (SoftFloat ? "use-soft-float=true" - : "use-soft-float=false")]; + bool SoftFloat = + F.hasFnAttribute("use-soft-float") && + F.getFnAttribute("use-soft-float").getValueAsString() == "true"; + // If the soft float attribute is set on the function turn on the soft float + // subtarget feature. + if (SoftFloat) + FS += FS.empty() ? "+soft-float" : ",+soft-float"; + + auto &I = SubtargetMap[CPU + FS]; if (!I) { // This needs to be done before we create a new subtarget since any // creation will depend on the TM and the code generation flags on the @@ -120,15 +156,12 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, cl::init(true)); //===----------------------------------------------------------------------===// -// X86 Analysis Pass Setup +// X86 TTI query. //===----------------------------------------------------------------------===// -void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our X86 pass. This - // allows the X86 pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createX86TargetTransformInfoPass(this)); +TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis( + [this](Function &F) { return TargetTransformInfo(X86TTIImpl(this, F)); }); } @@ -147,16 +180,14 @@ public: return getTM<X86TargetMachine>(); } - const X86Subtarget &getX86Subtarget() const { - return *getX86TargetMachine().getSubtargetImpl(); - } - void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; + bool addPreISel() override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; + void addPreSched2() override; }; } // namespace @@ -175,7 +206,8 @@ bool X86PassConfig::addInstSelector() { addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); // For ELF, cleanup any local-dynamic TLS accesses. - if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) + if (Triple(TM->getTargetTriple()).isOSBinFormatELF() && + getOptLevel() != CodeGenOpt::None) addPass(createCleanupLocalDynamicTLSPass()); addPass(createX86GlobalBaseRegPass()); @@ -188,6 +220,14 @@ bool X86PassConfig::addILPOpts() { return true; } +bool X86PassConfig::addPreISel() { + // Only add this pass for 32-bit x86. 
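[Editor's note] Looking back at the getSubtargetImpl hunk above: the per-function subtarget is now cached keyed on CPU plus feature string, with the "use-soft-float" attribute folded into the feature string instead of being a separate key component. A hedged sketch of that caching pattern, using plain standard-library types rather than the LLVM classes:

// Sketch only (simplified stand-in types, not the LLVM API): shows how a
// per-function subtarget can be cached keyed on CPU + feature string, with the
// soft-float request folded into the feature string so it is part of the key.
#include <map>
#include <memory>
#include <string>

struct Subtarget {            // stand-in for X86Subtarget
  std::string CPU, Features;
};

class TargetMachineSketch {
  std::map<std::string, std::unique_ptr<Subtarget>> SubtargetMap;

public:
  const Subtarget *getSubtargetFor(const std::string &CPU, std::string FS,
                                   bool SoftFloat) {
    if (SoftFloat)
      FS += FS.empty() ? "+soft-float" : ",+soft-float";

    auto &I = SubtargetMap[CPU + FS];
    if (!I)
      I.reset(new Subtarget{CPU, FS});   // created lazily, reused afterwards
    return I.get();
  }
};

(The pass-config part of this hunk continues below.)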
+ Triple TT(TM->getTargetTriple()); + if (TT.getArch() == Triple::x86) + addPass(createX86WinEHStatePass()); + return true; +} + void X86PassConfig::addPreRegAlloc() { addPass(createX86CallFrameOptimization()); } @@ -196,8 +236,10 @@ void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); } +void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } + void X86PassConfig::addPreEmitPass() { - if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) + if (getOptLevel() != CodeGenOpt::None) addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); if (UseVZeroUpper) diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h index 916278c..c9833ed 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.h +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -24,22 +24,18 @@ class StringRef; class X86TargetMachine final : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - X86Subtarget Subtarget; + X86Subtarget Subtarget; mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap; public: - X86TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); ~X86TargetMachine() override; - - const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; } const X86Subtarget *getSubtargetImpl(const Function &F) const override; - /// \brief Register X86 analysis passes with a pass manager. - void addAnalysisPasses(PassManagerBase &PM) override; + TargetIRAnalysis getTargetIRAnalysis() override; // Set up the pass pipeline. TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp index f8bcd61..6bf45c3 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/Dwarf.h" #include "llvm/Target/TargetLowering.h" @@ -46,18 +47,31 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( return TM.getSymbol(GV, Mang); } -void -X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); +const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, MCStreamer &Streamer) const { + // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry + // from a data section. In case there's an additional offset, then use + // foo@GOTPCREL+4+<offset>. 
+ unsigned FinalOff = Offset+MV.getConstant()+4; + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + const MCExpr *Off = MCConstantExpr::Create(FinalOff, getContext()); + return MCBinaryExpr::CreateAdd(Res, Off, getContext()); } -const MCExpr * -X86LinuxTargetObjectFile::getDebugThreadLocalSymbol( +const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol( const MCSymbol *Sym) const { return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); } +void +X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const { // We are looking for the difference of two symbols, need a subtraction @@ -81,14 +95,12 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( SubRHS->getPointerAddressSpace() != 0) return nullptr; - // Both ptrtoint instructions must wrap global variables: + // Both ptrtoint instructions must wrap global objects: // - Only global variables are eligible for image relative relocations. - // - The subtrahend refers to the special symbol __ImageBase, a global. - const GlobalVariable *GVLHS = - dyn_cast<GlobalVariable>(SubLHS->getPointerOperand()); - const GlobalVariable *GVRHS = - dyn_cast<GlobalVariable>(SubRHS->getPointerOperand()); - if (!GVLHS || !GVRHS) + // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable. + const auto *GOLHS = dyn_cast<GlobalObject>(SubLHS->getPointerOperand()); + const auto *GVRHS = dyn_cast<GlobalVariable>(SubRHS->getPointerOperand()); + if (!GOLHS || !GVRHS) return nullptr; // We expect __ImageBase to be a global variable without a section, externally @@ -101,10 +113,10 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( return nullptr; // An image-relative, thread-local, symbol makes no sense. - if (GVLHS->isThreadLocal()) + if (GOLHS->isThreadLocal()) return nullptr; - return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang), + return MCSymbolRefExpr::Create(TM.getSymbol(GOLHS, Mang), MCSymbolRefExpr::VK_COFF_IMGREL32, getContext()); } @@ -137,7 +149,7 @@ static std::string scalarConstantToHexString(const Constant *C) { return APIntToHexString(AI); } -const MCSection * +MCSection * X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, const Constant *C) const { if (Kind.isReadOnly()) { diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index 6a6988a..66366b2 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -30,17 +30,26 @@ namespace llvm { MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, MachineModuleInfo *MMI) const override; - }; - /// X86LinuxTargetObjectFile - This implementation is used for linux x86 - /// and x86-64. - class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + }; + /// \brief This implemenatation is used for X86 ELF targets that don't + /// have a further specialization. 
+ class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF { /// \brief Describe a TLS variable address within debug info. const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; }; + /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and + /// Native Client on x86 and x86-64. + class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + /// \brief This implementation is used for Windows targets on x86 and x86-64. class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF { const MCExpr * @@ -49,8 +58,8 @@ namespace llvm { /// \brief Given a mergeable constant with the specified size and relocation /// information, return a section that it should be placed in. - const MCSection *getSectionForConstant(SectionKind Kind, - const Constant *C) const override; + MCSection *getSectionForConstant(SectionKind Kind, + const Constant *C) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 67488f7..bbfeba8 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -14,9 +14,9 @@ /// //===----------------------------------------------------------------------===// -#include "X86.h" -#include "X86TargetMachine.h" +#include "X86TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" @@ -25,125 +25,22 @@ using namespace llvm; #define DEBUG_TYPE "x86tti" -// Declare the pass initialization routine locally as target-specific passes -// don't have a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeX86TTIPass(PassRegistry &); -} - -namespace { - -class X86TTI final : public ImmutablePass, public TargetTransformInfo { - const X86Subtarget *ST; - const X86TargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - X86TTI(const X86TargetMachine *TM) - : ImmutablePass(ID), ST(TM->getSubtargetImpl()), - TLI(TM->getSubtargetImpl()->getTargetLowering()) { - initializeX86TTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { - pushTTIStack(this); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo*)this; - return this; - } - - /// \name Scalar TTI Implementations - /// @{ - PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - - /// @} - - /// \name Vector TTI Implementations - /// @{ - - unsigned getNumberOfRegisters(bool Vector) const override; - unsigned getRegisterBitWidth(bool Vector) const override; - unsigned getMaxInterleaveFactor() const override; - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind, OperandValueProperties, - OperandValueProperties) const override; - unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, - int Index, Type *SubTp) const override; - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const override; - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const override; - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const override; - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const override; - - unsigned getAddressComputationCost(Type *PtrTy, - bool IsComplex) const override; - - unsigned getReductionCost(unsigned Opcode, Type *Ty, - bool IsPairwiseForm) const override; - - unsigned getIntImmCost(int64_t) const; - - unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; - - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - bool isLegalMaskedLoad (Type *DataType, int Consecutive) const override; - bool isLegalMaskedStore(Type *DataType, int Consecutive) const override; - - /// @} -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti", - "X86 Target Transform Info", true, true, false) -char X86TTI::ID = 0; - -ImmutablePass * -llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) { - return new X86TTI(TM); -} - - //===----------------------------------------------------------------------===// // // X86 cost model. // //===----------------------------------------------------------------------===// -X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { +TargetTransformInfo::PopcntSupportKind +X86TTIImpl::getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); // TODO: Currently the __builtin_popcount() implementation using SSE3 // instructions is inefficient. Once the problem is fixed, we should // call ST->hasSSE3() instead of ST->hasPOPCNT(). - return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software; + return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; } -unsigned X86TTI::getNumberOfRegisters(bool Vector) const { +unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasSSE1()) return 0; @@ -155,7 +52,7 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const { return 8; } -unsigned X86TTI::getRegisterBitWidth(bool Vector) const { +unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (Vector) { if (ST->hasAVX512()) return 512; if (ST->hasAVX()) return 256; @@ -169,7 +66,13 @@ unsigned X86TTI::getRegisterBitWidth(bool Vector) const { } -unsigned X86TTI::getMaxInterleaveFactor() const { +unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { + // If the loop will not be vectorized, don't interleave the loop. 
+ // Let regular unroll to unroll the loop, which saves the overflow + // check and memory check cost. + if (VF == 1) + return 1; + if (ST->isAtom()) return 1; @@ -181,10 +84,10 @@ unsigned X86TTI::getMaxInterleaveFactor() const { return 2; } -unsigned X86TTI::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, OperandValueKind Op1Info, - OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, - OperandValueProperties Opd2PropInfo) const { +unsigned X86TTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, + TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); @@ -250,15 +153,15 @@ unsigned X86TTI::getArithmeticInstrCost( { ISD::SHL, MVT::v4i64, 1 }, { ISD::SRL, MVT::v4i64, 1 }, - { ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence. - { ISD::SHL, MVT::v16i16, 16*10 }, // Scalarized. + { ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence. + { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - { ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized. - { ISD::SRL, MVT::v16i16, 8*10 }, // Scalarized. + { ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized. + { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. - { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized. - { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized. - { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized. + { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. + { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. // Vectorizing division is a bad idea. See the SSE2 table for more comments. { ISD::SDIV, MVT::v32i8, 32*20 }, @@ -439,17 +342,16 @@ unsigned X86TTI::getArithmeticInstrCost( return LT.first * 6; // Fallback to the default implementation. - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, - Op2Info); + return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } -unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) const { +unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only estimate the cost of reverse and alternate shuffles. - if (Kind != SK_Reverse && Kind != SK_Alternate) - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - if (Kind == SK_Reverse) { + if (Kind == TTI::SK_Reverse) { std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); unsigned Cost = 1; if (LT.second.getSizeInBits() > 128) @@ -459,7 +361,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, return Cost * LT.first; } - if (Kind == SK_Alternate) { + if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. 
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); @@ -552,13 +454,13 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx != -1) return LT.first * SSEAltShuffleTbl[Idx].Cost; - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } - return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { +unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -640,7 +542,7 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { // The function getSimpleVT only handles simple value types. if (!SrcTy.isSimple() || !DstTy.isSimple()) - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src); static const TypeConversionCostTblEntry<MVT::SimpleValueType> AVX2ConversionTbl[] = { @@ -759,11 +661,11 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { return AVXConversionTbl[Idx].Cost; } - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { +unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); @@ -829,11 +731,11 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return LT.first * SSE42CostTbl[Idx].Cost; } - return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { +unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { @@ -853,26 +755,27 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val, return 0; } - return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); + return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert, - bool Extract) const { +unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, + bool Extract) { assert (Ty->isVectorTy() && "Can only scalarize vectors"); unsigned Cost = 0; for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { if (Insert) - Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i); if (Extract) - Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i); + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i); } return Cost; } -unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const { +unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); @@ -890,10 +793,8 @@ unsigned 
X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, // Assume that all other non-power-of-two numbers are scalarized. if (!isPowerOf2_32(NumElem)) { - unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode, - VTy->getScalarType(), - Alignment, - AddressSpace); + unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), + Alignment, AddressSpace); unsigned SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, Opcode==Instruction::Store); @@ -917,7 +818,60 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, return Cost; } -unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { +unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); + if (!SrcVTy) + // To calculate scalar take the regular cost, without mask + return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); + + unsigned NumElem = SrcVTy->getVectorNumElements(); + VectorType *MaskTy = + VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem); + if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) || + (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) || + !isPowerOf2_32(NumElem)) { + // Scalarization + unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); + unsigned ScalarCompareCost = + getCmpSelInstrCost(Instruction::ICmp, + Type::getInt8Ty(getGlobalContext()), NULL); + unsigned BranchCost = getCFInstrCost(Instruction::Br); + unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); + + unsigned ValueSplitCost = + getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load, + Opcode == Instruction::Store); + unsigned MemopCost = + NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); + return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; + } + + // Legalize the type. + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy); + unsigned Cost = 0; + if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() && + LT.second.getVectorNumElements() == NumElem) + // Promotion requires expand/truncate for data and a shuffle for mask. + Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) + + getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0); + + else if (LT.second.getVectorNumElements() > NumElem) { + VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), + LT.second.getVectorNumElements()); + // Expanding requires fill mask with zeroes + Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); + } + if (!ST->hasAVX512()) + return Cost + LT.first*4; // Each maskmov costs 4 + + // AVX-512 masked load/store is cheapper + return Cost+LT.first; +} + +unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. 
The resulting @@ -927,11 +881,11 @@ unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { if (Ty->isVectorTy() && IsComplex) return NumVectorInstToHideOverhead; - return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex); + return BaseT::getAddressComputationCost(Ty, IsComplex); } -unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwise) const { +unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) { std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); @@ -1007,23 +961,23 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, } } - return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise); + return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); } /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned X86TTI::getIntImmCost(int64_t Val) const { +unsigned X86TTIImpl::getIntImmCost(int64_t Val) { if (Val == 0) - return TCC_Free; + return TTI::TCC_Free; if (isInt<32>(Val)) - return TCC_Basic; + return TTI::TCC_Basic; - return 2 * TCC_Basic; + return 2 * TTI::TCC_Basic; } -unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { +unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1035,10 +989,10 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { // Fixme: Create a cost model for types larger than i128 once the codegen // issues have been fixed. if (BitSize > 128) - return TCC_Free; + return TTI::TCC_Free; if (Imm == 0) - return TCC_Free; + return TTI::TCC_Free; // Sign-extend all constants to a multiple of 64-bit. APInt ImmVal = Imm; @@ -1057,26 +1011,27 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { return std::max(1U, Cost); } -unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) const { +unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); // There is no cost model for constants with a bit size of 0. Return TCC_Free // here, so that constant hoisting will ignore this constant. if (BitSize == 0) - return TCC_Free; + return TTI::TCC_Free; unsigned ImmIdx = ~0U; switch (Opcode) { - default: return TCC_Free; + default: + return TTI::TCC_Free; case Instruction::GetElementPtr: // Always hoist the base address of a GetElementPtr. This prevents the // creation of new constants for every base constant that gets constant // folded with the offset. if (Idx == 0) - return 2 * TCC_Basic; - return TCC_Free; + return 2 * TTI::TCC_Basic; + return TTI::TCC_Free; case Instruction::Store: ImmIdx = 0; break; @@ -1098,7 +1053,7 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, case Instruction::LShr: case Instruction::AShr: if (Idx == 1) - return TCC_Free; + return TTI::TCC_Free; break; case Instruction::Trunc: case Instruction::ZExt: @@ -1116,27 +1071,28 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, if (Idx == ImmIdx) { unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = X86TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? 
static_cast<unsigned>(TCC_Free) - : Cost; + unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TTI::TCC_Basic) + ? static_cast<unsigned>(TTI::TCC_Free) + : Cost; } - return X86TTI::getIntImmCost(Imm, Ty); + return X86TTIImpl::getIntImmCost(Imm, Ty); } -unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) const { +unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); // There is no cost model for constants with a bit size of 0. Return TCC_Free // here, so that constant hoisting will ignore this constant. if (BitSize == 0) - return TCC_Free; + return TTI::TCC_Free; switch (IID) { - default: return TCC_Free; + default: + return TTI::TCC_Free; case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: case Intrinsic::ssub_with_overflow: @@ -1144,22 +1100,22 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) - return TCC_Free; + return TTI::TCC_Free; break; case Intrinsic::experimental_stackmap: if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; + return TTI::TCC_Free; break; case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; + return TTI::TCC_Free; break; } - return X86TTI::getIntImmCost(Imm, Ty); + return X86TTIImpl::getIntImmCost(Imm, Ty); } -bool X86TTI::isLegalMaskedLoad(Type *DataTy, int Consecutive) const { +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) { int DataWidth = DataTy->getPrimitiveSizeInBits(); // Todo: AVX512 allows gather/scatter, works with strided and random as well @@ -1170,7 +1126,7 @@ bool X86TTI::isLegalMaskedLoad(Type *DataTy, int Consecutive) const { return false; } -bool X86TTI::isLegalMaskedStore(Type *DataType, int Consecutive) const { +bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) { return isLegalMaskedLoad(DataType, Consecutive); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h new file mode 100644 index 0000000..e570bb5 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -0,0 +1,112 @@ +//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// X86 target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H + +#include "X86.h" +#include "X86TargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { + typedef BasicTTIImplBase<X86TTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const X86Subtarget *ST; + const X86TargetLowering *TLI; + + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + + const X86Subtarget *getST() const { return ST; } + const X86TargetLowering *getTLI() const { return TLI; } + +public: + explicit X86TTIImpl(const X86TargetMachine *TM, Function &F) + : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + X86TTIImpl(const X86TTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + X86TTIImpl(X86TTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + X86TTIImpl &operator=(const X86TTIImpl &RHS) { + BaseT::operator=(static_cast<const BaseT &>(RHS)); + ST = RHS.ST; + TLI = RHS.TLI; + return *this; + } + X86TTIImpl &operator=(X86TTIImpl &&RHS) { + BaseT::operator=(std::move(static_cast<BaseT &>(RHS))); + ST = std::move(RHS.ST); + TLI = std::move(RHS.TLI); + return *this; + } + + /// \name Scalar TTI Implementations + /// @{ + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp); + unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + + unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex); + + unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + + unsigned getIntImmCost(int64_t); + + unsigned getIntImmCost(const APInt &Imm, Type *Ty); + + unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty); + unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + bool isLegalMaskedLoad(Type *DataType, int Consecutive); + bool isLegalMaskedStore(Type *DataType, int Consecutive); + + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp index 5c01f3e1..6925b27 100644 --- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -9,7 
+9,7 @@ // // This file defines the pass which inserts x86 AVX vzeroupper instructions // before calls to SSE encoded functions. This avoids transition latency -// penalty when tranfering control between AVX encoded instructions and old +// penalty when transferring control between AVX encoded instructions and old // SSE encoding mode. // //===----------------------------------------------------------------------===// @@ -171,7 +171,7 @@ void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { } /// processBasicBlock - Loop over all of the instructions in the basic block, -/// inserting vzero upper instructions before function calls. +/// inserting vzeroupper instructions before function calls. void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { // Start by assuming that the block PASS_THROUGH, which implies no unguarded @@ -202,7 +202,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { // If the call won't clobber any YMM register, skip it as well. It usually // happens on helper function calls (such as '_chkstk', '_ftol2') where // standard calling convention is not used (RegMask is not used to mark - // register clobbered and register usage (def/imp-def/use) is well-dfined + // register clobbered and register usage (def/imp-def/use) is well-defined // and explicitly specified. if (MI->isCall() && !callClobbersAnyYmmReg(MI)) continue; @@ -245,12 +245,12 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { } /// runOnMachineFunction - Loop over all of the basic blocks, inserting -/// vzero upper instructions before function calls. +/// vzeroupper instructions before function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { - const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>(); + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); if (!ST.hasAVX() || ST.hasAVX512()) return false; - TII = MF.getSubtarget().getInstrInfo(); + TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); EverMadeChange = false; @@ -281,8 +281,8 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { // Process all blocks. This will compute block exit states, record the first // unguarded call in each block, and add successors of dirty blocks to the // DirtySuccessors list. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - processBasicBlock(*I); + for (MachineBasicBlock &MBB : MF) + processBasicBlock(MBB); // If any YMM regs are live in to this function, add the entry block to the // DirtySuccessors list diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp new file mode 100644 index 0000000..4efaada --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp @@ -0,0 +1,374 @@ +//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// All functions using an MSVC EH personality use an explicitly updated state +// number stored in an exception registration stack object. The registration +// object is linked into a thread-local chain of registrations stored at fs:00. +// This pass adds the registration object and EH state updates. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "winehstate" + +namespace { +class WinEHStatePass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid. + + WinEHStatePass() : FunctionPass(ID) {} + + bool runOnFunction(Function &Fn) override; + + bool doInitialization(Module &M) override; + + bool doFinalization(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + const char *getPassName() const override { + return "Windows 32-bit x86 EH state insertion"; + } + +private: + void emitExceptionRegistrationRecord(Function *F); + + void linkExceptionRegistration(IRBuilder<> &Builder, Value *RegNode, + Value *Handler); + void unlinkExceptionRegistration(IRBuilder<> &Builder, Value *RegNode); + + Value *emitEHLSDA(IRBuilder<> &Builder, Function *F); + + Function *generateLSDAInEAXThunk(Function *ParentFunc); + + // Module-level type getters. + Type *getEHRegistrationType(); + Type *getSEH3RegistrationType(); + Type *getSEH4RegistrationType(); + Type *getCXXEH3RegistrationType(); + + // Per-module data. + Module *TheModule = nullptr; + StructType *EHRegistrationTy = nullptr; + StructType *CXXEH3RegistrationTy = nullptr; + StructType *SEH3RegistrationTy = nullptr; + StructType *SEH4RegistrationTy = nullptr; + + // Per-function state + EHPersonality Personality = EHPersonality::Unknown; + Function *PersonalityFn = nullptr; +}; +} + +FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); } + +char WinEHStatePass::ID = 0; + +bool WinEHStatePass::doInitialization(Module &M) { + TheModule = &M; + return false; +} + +bool WinEHStatePass::doFinalization(Module &M) { + assert(TheModule == &M); + TheModule = nullptr; + EHRegistrationTy = nullptr; + CXXEH3RegistrationTy = nullptr; + SEH3RegistrationTy = nullptr; + SEH4RegistrationTy = nullptr; + return false; +} + +void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const { + // This pass should only insert a stack allocation, memory accesses, and + // framerecovers. + AU.setPreservesCFG(); +} + +bool WinEHStatePass::runOnFunction(Function &F) { + // If this is an outlined handler, don't do anything. We'll do state insertion + // for it in the parent. + StringRef WinEHParentName = + F.getFnAttribute("wineh-parent").getValueAsString(); + if (WinEHParentName != F.getName() && !WinEHParentName.empty()) + return false; + + // Check the personality. Do nothing if this is not an MSVC personality. 
+ LandingPadInst *LP = nullptr; + for (BasicBlock &BB : F) { + LP = BB.getLandingPadInst(); + if (LP) + break; + } + if (!LP) + return false; + PersonalityFn = + dyn_cast<Function>(LP->getPersonalityFn()->stripPointerCasts()); + if (!PersonalityFn) + return false; + Personality = classifyEHPersonality(PersonalityFn); + if (!isMSVCEHPersonality(Personality)) + return false; + + emitExceptionRegistrationRecord(&F); + // FIXME: State insertion. + + // Reset per-function state. + PersonalityFn = nullptr; + Personality = EHPersonality::Unknown; + return true; +} + +/// Get the common EH registration subobject: +/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)( +/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *); +/// struct EHRegistrationNode { +/// EHRegistrationNode *Next; +/// PEXCEPTION_ROUTINE Handler; +/// }; +Type *WinEHStatePass::getEHRegistrationType() { + if (EHRegistrationTy) + return EHRegistrationTy; + LLVMContext &Context = TheModule->getContext(); + EHRegistrationTy = StructType::create(Context, "EHRegistrationNode"); + Type *FieldTys[] = { + EHRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next + Type::getInt8PtrTy(Context) // EXCEPTION_DISPOSITION (*Handler)(...) + }; + EHRegistrationTy->setBody(FieldTys, false); + return EHRegistrationTy; +} + +/// The __CxxFrameHandler3 registration node: +/// struct CXXExceptionRegistration { +/// void *SavedESP; +/// EHRegistrationNode SubRecord; +/// int32_t TryLevel; +/// }; +Type *WinEHStatePass::getCXXEH3RegistrationType() { + if (CXXEH3RegistrationTy) + return CXXEH3RegistrationTy; + LLVMContext &Context = TheModule->getContext(); + Type *FieldTys[] = { + Type::getInt8PtrTy(Context), // void *SavedESP + getEHRegistrationType(), // EHRegistrationNode SubRecord + Type::getInt32Ty(Context) // int32_t TryLevel + }; + CXXEH3RegistrationTy = + StructType::create(FieldTys, "CXXExceptionRegistration"); + return CXXEH3RegistrationTy; +} + +/// The _except_handler3 registration node: +/// struct EH3ExceptionRegistration { +/// EHRegistrationNode SubRecord; +/// void *ScopeTable; +/// int32_t TryLevel; +/// }; +Type *WinEHStatePass::getSEH3RegistrationType() { + if (SEH3RegistrationTy) + return SEH3RegistrationTy; + LLVMContext &Context = TheModule->getContext(); + Type *FieldTys[] = { + getEHRegistrationType(), // EHRegistrationNode SubRecord + Type::getInt8PtrTy(Context), // void *ScopeTable + Type::getInt32Ty(Context) // int32_t TryLevel + }; + SEH3RegistrationTy = StructType::create(FieldTys, "EH3ExceptionRegistration"); + return SEH3RegistrationTy; +} + +/// The _except_handler4 registration node: +/// struct EH4ExceptionRegistration { +/// void *SavedESP; +/// _EXCEPTION_POINTERS *ExceptionPointers; +/// EHRegistrationNode SubRecord; +/// int32_t EncodedScopeTable; +/// int32_t TryLevel; +/// }; +Type *WinEHStatePass::getSEH4RegistrationType() { + if (SEH4RegistrationTy) + return SEH4RegistrationTy; + LLVMContext &Context = TheModule->getContext(); + Type *FieldTys[] = { + Type::getInt8PtrTy(Context), // void *SavedESP + Type::getInt8PtrTy(Context), // void *ExceptionPointers + getEHRegistrationType(), // EHRegistrationNode SubRecord + Type::getInt32Ty(Context), // int32_t EncodedScopeTable + Type::getInt32Ty(Context) // int32_t TryLevel + }; + SEH4RegistrationTy = StructType::create(FieldTys, "EH4ExceptionRegistration"); + return SEH4RegistrationTy; +} + +// Emit an exception registration record. 
These are stack allocations with the +// common subobject of two pointers: the previous registration record (the old +// fs:00) and the personality function for the current frame. The data before +// and after that is personality function specific. +void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { + assert(Personality == EHPersonality::MSVC_CXX || + Personality == EHPersonality::MSVC_X86SEH); + + StringRef PersonalityName = PersonalityFn->getName(); + IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin()); + Type *Int8PtrType = Builder.getInt8PtrTy(); + Value *SubRecord = nullptr; + if (PersonalityName == "__CxxFrameHandler3") { + Type *RegNodeTy = getCXXEH3RegistrationType(); + Value *RegNode = Builder.CreateAlloca(RegNodeTy); + // FIXME: We can skip this in -GS- mode, when we figure that out. + // SavedESP = llvm.stacksave() + Value *SP = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {}); + Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); + // TryLevel = -1 + Builder.CreateStore(Builder.getInt32(-1), + Builder.CreateStructGEP(RegNodeTy, RegNode, 2)); + // Handler = __ehhandler$F + Function *Trampoline = generateLSDAInEAXThunk(F); + SubRecord = Builder.CreateStructGEP(RegNodeTy, RegNode, 1); + linkExceptionRegistration(Builder, SubRecord, Trampoline); + } else if (PersonalityName == "_except_handler3") { + Type *RegNodeTy = getSEH3RegistrationType(); + Value *RegNode = Builder.CreateAlloca(RegNodeTy); + // TryLevel = -1 + Builder.CreateStore(Builder.getInt32(-1), + Builder.CreateStructGEP(RegNodeTy, RegNode, 2)); + // ScopeTable = llvm.x86.seh.lsda(F) + Value *LSDA = emitEHLSDA(Builder, F); + Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 1)); + SubRecord = Builder.CreateStructGEP(RegNodeTy, RegNode, 0); + linkExceptionRegistration(Builder, SubRecord, PersonalityFn); + } else if (PersonalityName == "_except_handler4") { + Type *RegNodeTy = getSEH4RegistrationType(); + Value *RegNode = Builder.CreateAlloca(RegNodeTy); + // SavedESP = llvm.stacksave() + Value *SP = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {}); + Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); + // TryLevel = -2 + Builder.CreateStore(Builder.getInt32(-2), + Builder.CreateStructGEP(RegNodeTy, RegNode, 4)); + // FIXME: XOR the LSDA with __security_cookie. + // ScopeTable = llvm.x86.seh.lsda(F) + Value *FI8 = Builder.CreateBitCast(F, Int8PtrType); + Value *LSDA = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8); + Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 1)); + SubRecord = Builder.CreateStructGEP(RegNodeTy, RegNode, 2); + linkExceptionRegistration(Builder, SubRecord, PersonalityFn); + } else { + llvm_unreachable("unexpected personality function"); + } + + // FIXME: Insert an unlink before all returns. 
+ for (BasicBlock &BB : *F) { + TerminatorInst *T = BB.getTerminator(); + if (!isa<ReturnInst>(T)) + continue; + Builder.SetInsertPoint(T); + unlinkExceptionRegistration(Builder, SubRecord); + } +} + +Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { + Value *FI8 = Builder.CreateBitCast(F, Type::getInt8PtrTy(F->getContext())); + return Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8); +} + +/// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls +/// PersonalityFn, forwarding the parameters passed to PEXCEPTION_ROUTINE: +/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)( +/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *); +/// We essentially want this code: +/// movl $lsda, %eax +/// jmpl ___CxxFrameHandler3 +Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { + LLVMContext &Context = ParentFunc->getContext(); + Type *Int32Ty = Type::getInt32Ty(Context); + Type *Int8PtrType = Type::getInt8PtrTy(Context); + Type *ArgTys[5] = {Int8PtrType, Int8PtrType, Int8PtrType, Int8PtrType, + Int8PtrType}; + FunctionType *TrampolineTy = + FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 4), + /*isVarArg=*/false); + FunctionType *TargetFuncTy = + FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5), + /*isVarArg=*/false); + Function *Trampoline = Function::Create( + TrampolineTy, GlobalValue::InternalLinkage, + Twine("__ehhandler$") + ParentFunc->getName(), TheModule); + BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); + IRBuilder<> Builder(EntryBB); + Value *LSDA = emitEHLSDA(Builder, ParentFunc); + Value *CastPersonality = + Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); + auto AI = Trampoline->arg_begin(); + Value *Args[5] = {LSDA, AI++, AI++, AI++, AI++}; + CallInst *Call = Builder.CreateCall(CastPersonality, Args); + // Can't use musttail due to prototype mismatch, but we can use tail. + Call->setTailCall(true); + // Set inreg so we pass it in EAX. + Call->addAttribute(1, Attribute::InReg); + Builder.CreateRet(Call); + return Trampoline; +} + +void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder, + Value *RegNode, Value *Handler) { + Type *RegNodeTy = getEHRegistrationType(); + // Handler = Handler + Handler = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy()); + Builder.CreateStore(Handler, Builder.CreateStructGEP(RegNodeTy, RegNode, 1)); + // Next = [fs:00] + Constant *FSZero = + Constant::getNullValue(RegNodeTy->getPointerTo()->getPointerTo(257)); + Value *Next = Builder.CreateLoad(FSZero); + Builder.CreateStore(Next, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); + // [fs:00] = RegNode + Builder.CreateStore(RegNode, FSZero); +} + +void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder, + Value *RegNode) { + // Clone RegNode into the current BB for better address mode folding. + if (auto *GEP = dyn_cast<GetElementPtrInst>(RegNode)) { + GEP = cast<GetElementPtrInst>(GEP->clone()); + Builder.Insert(GEP); + RegNode = GEP; + } + Type *RegNodeTy = getEHRegistrationType(); + // [fs:00] = RegNode->Next + Value *Next = + Builder.CreateLoad(Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); + Constant *FSZero = + Constant::getNullValue(RegNodeTy->getPointerTo()->getPointerTo(257)); + Builder.CreateStore(Next, FSZero); +} |
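[Editor's note] To summarize what emitExceptionRegistrationRecord, linkExceptionRegistration and unlinkExceptionRegistration above produce for the __CxxFrameHandler3 case, the hand-written sketch below shows a rough source-level equivalent of the emitted IR. It is an illustration only: the thread_local FS_00 pointer and ehhandler_F stand in for [fs:00] (address space 257 in the real pass) and the generated __ehhandler$F thunk.

// Rough source-level equivalent of the IR emitted for a function F using
// __CxxFrameHandler3 (illustration only; FS_00 and ehhandler_F are stand-ins).
#include <cstdint>

struct EHRegistrationNode {
  EHRegistrationNode *Next; // previous registration, the old [fs:00]
  void *Handler;            // personality thunk for this frame
};

struct CXXExceptionRegistration {
  void *SavedESP;           // filled from llvm.stacksave()
  EHRegistrationNode SubRecord;
  int32_t TryLevel;         // EH state number, initialized to -1
};

thread_local EHRegistrationNode *FS_00 = nullptr; // stand-in for [fs:00]
static int ehhandler_F() { return 0; }            // stand-in for __ehhandler$F

void F() {
  CXXExceptionRegistration RegNode;
  RegNode.SavedESP = nullptr;                     // llvm.stacksave() in the real IR
  RegNode.TryLevel = -1;
  RegNode.SubRecord.Handler = (void *)&ehhandler_F;
  // linkExceptionRegistration: push this frame onto the fs:00 chain.
  RegNode.SubRecord.Next = FS_00;
  FS_00 = &RegNode.SubRecord;

  // ... function body; TryLevel would be updated as the EH state changes ...

  // unlinkExceptionRegistration: pop the record again before returning.
  FS_00 = RegNode.SubRecord.Next;
}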