Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86ISelLowering.cpp | 6572
1 file changed, 4229 insertions, 2343 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index de6c9a6..957b46c40 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40,6 +40,7 @@
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -52,7 +53,9 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include <algorithm>
 #include <bitset>
@@ -70,6 +73,30 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
              "rather than promotion."),
     cl::Hidden);
 
+static cl::opt<int> ExperimentalPrefLoopAlignment(
+    "x86-experimental-pref-loop-alignment", cl::init(4),
+    cl::desc("Sets the preferable loop alignment for experiments "
+             "(the last x86-experimental-pref-loop-alignment bits"
+             " of the loop header PC will be 0)."),
+    cl::Hidden);
+
+static cl::opt<bool> MulConstantOptimization(
+    "mul-constant-optimization", cl::init(true),
+    cl::desc("Replace 'mul x, Const' with more effective instructions like "
+             "SHIFT, LEA, etc."),
+    cl::Hidden);
+
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
+                             const char *Msg) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  DAG.getContext()->diagnose(
+      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
+}
+
 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                      const X86Subtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
@@ -290,16 +317,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UREM, VT, Expand);
   }
 
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
-    if (VT == MVT::i64 && !Subtarget.is64Bit())
-      continue;
-    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
- setOperationAction(ISD::ADDC, VT, Custom); - setOperationAction(ISD::ADDE, VT, Custom); - setOperationAction(ISD::SUBC, VT, Custom); - setOperationAction(ISD::SUBE, VT, Custom); - } - setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, @@ -401,8 +418,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SETCCE, VT, Custom); } + + // Custom action for SELECT MMX and expand action for SELECT_CC MMX + setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); + setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); + setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to @@ -427,7 +448,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } - // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) + + // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; @@ -648,7 +670,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); @@ -775,29 +796,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v16i8, Legal); - setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - setOperationAction(ISD::SETCC, MVT::v16i8, Custom); - setOperationAction(ISD::SETCC, MVT::v8i16, Custom); - setOperationAction(ISD::SETCC, MVT::v4i32, Custom); - - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - setOperationAction(ISD::CTPOP, MVT::v16i8, Custom); - setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); - setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); - setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); - - setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); - setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); - setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); - setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + } - // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -872,22 +882,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); - for (auto VT : { MVT::v8i16, MVT::v16i8 }) { - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - } - - // In the customized shift lowering, the legal cases in AVX2 will be - // recognized. - for (auto VT : { MVT::v4i32, MVT::v2i64 }) { - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); + // In the customized shift lowering, the legal v4i32/v2i64 cases + // in AVX2 will be recognized. + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { + setOperationAction(ISD::ABS, MVT::v16i8, Legal); + setOperationAction(ISD::ABS, MVT::v8i16, Legal); + setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); @@ -922,6 +929,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. 
+ for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); + } + for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); @@ -929,19 +941,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); - - setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); + for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { + setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); + } // i8 vectors are custom because the source register and source // source memory operand types are not the same width. @@ -1005,36 +1012,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); - for (auto VT : { MVT::v32i8, MVT::v16i16 }) { + // In the customized shift lowering, the legal v8i32/v4i64 cases + // in AVX2 will be recognized. 
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } - setOperationAction(ISD::SETCC, MVT::v32i8, Custom); - setOperationAction(ISD::SETCC, MVT::v16i16, Custom); - setOperationAction(ISD::SETCC, MVT::v8i32, Custom); - setOperationAction(ISD::SETCC, MVT::v4i64, Custom); - setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); + for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { + setOperationAction(ISD::SIGN_EXTEND, VT, Custom); + setOperationAction(ISD::ZERO_EXTEND, VT, Custom); + setOperationAction(ISD::ANY_EXTEND, VT, Custom); + } + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { + setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); @@ -1065,6 +1067,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { + setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); @@ -1081,27 +1084,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X - setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); - - setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); - } - - // In the customized shift lowering, the legal cases in AVX2 will be - // recognized. 
- for (auto VT : { MVT::v8i32, MVT::v4i64 }) { - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); + for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { + setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); + } } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, @@ -1126,7 +1116,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } @@ -1149,7 +1139,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); - addRegisterClass(MVT::i1, &X86::VK1RegClass); + addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); @@ -1164,16 +1154,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - setOperationAction(ISD::SETCC, MVT::i1, Custom); - setOperationAction(ISD::SETCCE, MVT::i1, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::XOR, MVT::i1, Legal); - setOperationAction(ISD::OR, MVT::i1, Legal); - setOperationAction(ISD::AND, MVT::i1, Legal); - setOperationAction(ISD::SUB, MVT::i1, Custom); - setOperationAction(ISD::ADD, MVT::i1, Custom); - setOperationAction(ISD::MUL, MVT::i1, Custom); for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16, MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32, @@ -1242,27 +1222,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MSTORE, VT, Custom); } } - setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); - setOperationAction(ISD::VSELECT, MVT::v8i1, Expand); - setOperationAction(ISD::VSELECT, MVT::v16i1, Expand); - if (Subtarget.hasDQI()) { - setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); - 
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + if (Subtarget.hasDQI()) { + for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) { + setOperationAction(ISD::SINT_TO_FP, VT, Legal); + setOperationAction(ISD::UINT_TO_FP, VT, Legal); + setOperationAction(ISD::FP_TO_SINT, VT, Legal); + setOperationAction(ISD::FP_TO_UINT, VT, Legal); + } if (Subtarget.hasVLX()) { // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); @@ -1296,8 +1265,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal); } - setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); @@ -1310,11 +1277,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::FFLOOR, VT, Legal); - setOperationAction(ISD::FCEIL, VT, Legal); - setOperationAction(ISD::FTRUNC, VT, Legal); - setOperationAction(ISD::FRINT, VT, Legal); - setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); } setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom); @@ -1330,48 +1297,54 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); - setOperationAction(ISD::SETCC, MVT::v16i1, Custom); - setOperationAction(ISD::SETCC, MVT::v8i1, Custom); - - setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i64, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); - setOperationAction(ISD::SELECT, MVT::v16i1, Custom); - setOperationAction(ISD::SELECT, MVT::v8i1, Custom); - - setOperationAction(ISD::SMAX, MVT::v16i32, Legal); - setOperationAction(ISD::SMAX, MVT::v8i64, Legal); - setOperationAction(ISD::UMAX, MVT::v16i32, Legal); - setOperationAction(ISD::UMAX, MVT::v8i64, Legal); - setOperationAction(ISD::SMIN, MVT::v16i32, Legal); - setOperationAction(ISD::SMIN, MVT::v8i64, Legal); - setOperationAction(ISD::UMIN, MVT::v16i32, Legal); - setOperationAction(ISD::UMIN, MVT::v8i64, Legal); - - setOperationAction(ISD::ADD, MVT::v8i1, Expand); - setOperationAction(ISD::ADD, MVT::v16i1, Expand); - setOperationAction(ISD::SUB, MVT::v8i1, Expand); - setOperationAction(ISD::SUB, MVT::v16i1, Expand); - 
setOperationAction(ISD::MUL, MVT::v8i1, Expand); - setOperationAction(ISD::MUL, MVT::v16i1, Expand); setOperationAction(ISD::MUL, MVT::v16i32, Legal); + // NonVLX sub-targets extend 128/256 vectors to use the 512 version. + setOperationAction(ISD::ABS, MVT::v4i64, Legal); + setOperationAction(ISD::ABS, MVT::v2i64, Legal); + + for (auto VT : { MVT::v8i1, MVT::v16i1 }) { + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); + + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Expand); + } + for (auto VT : { MVT::v16i32, MVT::v8i64 }) { - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + setOperationAction(ISD::ABS, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::CTPOP, VT, Custom); + setOperationAction(ISD::CTTZ, VT, Custom); + } + + // NonVLX sub-targets extend 128/256 vectors to use the 512 version. + for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64, + MVT::v8i64}) { + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); } // Need to promote to 64-bit even though we have 32-bit masked instructions @@ -1382,33 +1355,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64); if (Subtarget.hasCDI()) { - setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); - setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); - - setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); - setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); - setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); - setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); - - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); - - if (Subtarget.hasVLX()) { - setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); - setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); - setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); - setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); - } else { - setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); - setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); - setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); - setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + // NonVLX sub-targets extend 128/256 vectors to use the 512 version. 
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, + MVT::v4i64, MVT::v8i64}) { + setOperationAction(ISD::CTLZ, VT, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } - - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); } // Subtarget.hasCDI() if (Subtarget.hasDQI()) { @@ -1418,6 +1370,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i64, Legal); } + if (Subtarget.hasVPOPCNTDQ()) { + // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512 + // version of popcntd/q. + for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64, + MVT::v4i32, MVT::v2i64}) + setOperationAction(ISD::CTPOP, VT, Legal); + } + // Custom lower several nodes. for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { @@ -1428,7 +1388,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Custom under AVX1. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, - MVT::v8f32, MVT::v4f64 }) + MVT::v8f32, MVT::v4f64, MVT::v1i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1, MVT::v32i1, MVT::v64i1 }) @@ -1438,10 +1398,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Legal); @@ -1460,12 +1420,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - setOperationAction(ISD::ADD, MVT::v32i1, Expand); - setOperationAction(ISD::ADD, MVT::v64i1, Expand); - setOperationAction(ISD::SUB, MVT::v32i1, Expand); - setOperationAction(ISD::SUB, MVT::v64i1, Expand); - setOperationAction(ISD::MUL, MVT::v32i1, Expand); - setOperationAction(ISD::MUL, MVT::v64i1, Expand); + setOperationAction(ISD::ADD, MVT::v32i1, Custom); + setOperationAction(ISD::ADD, MVT::v64i1, Custom); + setOperationAction(ISD::SUB, MVT::v32i1, Custom); + setOperationAction(ISD::SUB, MVT::v64i1, Custom); + setOperationAction(ISD::MUL, MVT::v32i1, Custom); + setOperationAction(ISD::MUL, MVT::v64i1, Custom); setOperationAction(ISD::SETCC, MVT::v32i1, Custom); setOperationAction(ISD::SETCC, MVT::v64i1, Custom); @@ -1479,8 +1439,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + 
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom); @@ -1502,8 +1462,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); @@ -1515,15 +1473,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v64i1, Expand); setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); - setOperationAction(ISD::SMAX, MVT::v64i8, Legal); - setOperationAction(ISD::SMAX, MVT::v32i16, Legal); - setOperationAction(ISD::UMAX, MVT::v64i8, Legal); - setOperationAction(ISD::UMAX, MVT::v32i16, Legal); - setOperationAction(ISD::SMIN, MVT::v64i8, Legal); - setOperationAction(ISD::SMIN, MVT::v32i16, Legal); - setOperationAction(ISD::UMIN, MVT::v64i8, Legal); - setOperationAction(ISD::UMIN, MVT::v32i16, Legal); - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); @@ -1545,7 +1494,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); @@ -1553,6 +1503,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); setOperationPromotedToType(ISD::AND, VT, MVT::v8i64); setOperationPromotedToType(ISD::OR, VT, MVT::v8i64); @@ -1574,9 +1528,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v2i1, &X86::VK2RegClass); for (auto VT : { MVT::v2i1, MVT::v4i1 }) { - setOperationAction(ISD::ADD, VT, Expand); - setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); @@ -1626,6 +1580,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); + + // Support carry in as value rather than glue. 
+ setOperationAction(ISD::ADDCARRY, VT, Custom); + setOperationAction(ISD::SUBCARRY, VT, Custom); + setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { @@ -1671,6 +1630,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); @@ -1696,6 +1656,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -1712,7 +1674,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - setPrefLoopAlignment(4); // 2^4 bytes. + + // TODO: These control memcmp expansion in CGP and could be raised higher, but + // that needs to benchmarked and balanced with the potential use of vector + // load/store types (PR33329, PR33914). + MaxLoadsPerMemcmp = 2; + MaxLoadsPerMemcmpOptSize = 2; + + // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). + setPrefLoopAlignment(ExperimentalPrefLoopAlignment); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. @@ -1742,7 +1712,7 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { if (!VT.isVector()) - return Subtarget.hasAVX512() ? MVT::i1: MVT::i8; + return MVT::i8; if (VT.isSimple()) { MVT VVT = VT.getSimpleVT(); @@ -1933,6 +1903,34 @@ bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } +void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, + ArgListTy &Args) const { + + // Only relabel X86-32 for C / Stdcall CCs. 
+ if (Subtarget.is64Bit()) + return; + if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) + return; + unsigned ParamRegs = 0; + if (auto *M = MF->getFunction()->getParent()) + ParamRegs = M->getNumberRegisterParameters(); + + // Mark the first N int arguments as having reg + for (unsigned Idx = 0; Idx < Args.size(); Idx++) { + Type *T = Args[Idx].Ty; + if (T->isPointerTy() || T->isIntegerTy()) + if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { + unsigned numRegs = 1; + if (MF->getDataLayout().getTypeAllocSize(T) > 4) + numRegs = 2; + if (ParamRegs < numRegs) + return; + ParamRegs -= numRegs; + Args[Idx].IsInReg = true; + } + } +} + const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, @@ -2001,21 +1999,37 @@ unsigned X86TargetLowering::getAddressSpace() const { return 256; } -Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { - // glibc has a special slot for the stack guard in tcbhead_t, use it instead - // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h) - if (!Subtarget.isTargetGlibc()) - return TargetLowering::getIRStackGuard(IRB); - - // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: - // %gs:0x14 on i386 - unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; - unsigned AddressSpace = getAddressSpace(); +static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { + return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || + (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); +} + +static Constant* SegmentOffset(IRBuilder<> &IRB, + unsigned Offset, unsigned AddressSpace) { return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } +Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + // glibc, bionic, and Fuchsia have a special slot for the stack guard in + // tcbhead_t; use it instead of the usual global variable (see + // sysdeps/{i386,x86_64}/nptl/tls.h) + if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { + if (Subtarget.isTargetFuchsia()) { + // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. + return SegmentOffset(IRB, 0x10, getAddressSpace()); + } else { + // %fs:0x28, unless we're using a Kernel code model, in which case + // it's %gs:0x28. gs:0x14 on i386. + unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; + return SegmentOffset(IRB, Offset, getAddressSpace()); + } + } + + return TargetLowering::getIRStackGuard(IRB); +} + void X86TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget.getTargetTriple().isOSMSVCRT()) { @@ -2027,13 +2041,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const { auto *SecurityCheckCookie = cast<Function>( M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(M.getContext()), - Type::getInt8PtrTy(M.getContext()), nullptr)); + Type::getInt8PtrTy(M.getContext()))); SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); return; } - // glibc has a special slot for the stack guard. - if (Subtarget.isTargetGlibc()) + // glibc, bionic, and Fuchsia have a special slot for the stack guard. 
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) return; TargetLowering::insertSSPDeclarations(M); } @@ -2056,21 +2070,23 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (Subtarget.getTargetTriple().isOSContiki()) return getDefaultSafeStackPointerLocation(IRB, false); - if (!Subtarget.isTargetAndroid()) - return TargetLowering::getSafeStackPointerLocation(IRB); - // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - unsigned AddressSpace, Offset; + if (Subtarget.isTargetAndroid()) { + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + // %gs:0x24 on i386 + unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; + return SegmentOffset(IRB, Offset, getAddressSpace()); + } - // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: - // %gs:0x24 on i386 - Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; - AddressSpace = getAddressSpace(); - return ConstantExpr::getIntToPtr( - ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), - Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); + // Fuchsia is similar. + if (Subtarget.isTargetFuchsia()) { + // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. + return SegmentOffset(IRB, 0x18, getAddressSpace()); + } + + return TargetLowering::getSafeStackPointerLocation(IRB); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, @@ -2160,6 +2176,12 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + // In some cases we need to disable registers from the default CSR list. + // For example, when they are used for argument passing. + bool ShouldDisableCalleeSavedRegister = + CallConv == CallingConv::X86_RegCall || + MF.getFunction()->hasFnAttribute("no_caller_saved_registers"); + if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); @@ -2179,6 +2201,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); + + // Add the register to the CalleeSaveDisableRegs list. + if (ShouldDisableCalleeSavedRegister) + MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); + SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); @@ -2203,15 +2230,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (ValVT == MVT::f64 && + (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. 
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } - // Likewise we can't return F64 values with SSE1 only. gcc does so, but - // llvm-gcc has never done it right and no one has noticed, so this - // should be OK for now. - if (ValVT == MVT::f64 && - (Subtarget.is64Bit() && !Subtarget.hasSSE2())) - report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. @@ -2253,6 +2282,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(2 == RegsToPass.size() && "Expecting two registers after Pass64BitArgInRegs"); + + // Add the second register to the CalleeSaveDisableRegs list. + if (ShouldDisableCalleeSavedRegister) + MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } @@ -2309,6 +2342,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); + + // Add the returned register to the CalleeSaveDisableRegs list. + if (ShouldDisableCalleeSavedRegister) + MF.getRegInfo().disableCalleeSavedRegister(RetValReg); } const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); @@ -2444,7 +2481,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, // Convert the i32 type into v32i1 type Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); - // Concantenate the two values together + // Concatenate the two values together return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); } @@ -2456,6 +2493,9 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, SelectionDAG &DAG) { SDValue ValReturned = ValArg; + if (ValVT == MVT::v1i1) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); + if (ValVT == MVT::v64i1) { // In 32 bit machine, this case is handled by getv64i1Argument assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); @@ -2478,7 +2518,6 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); } - return DAG.getBitcast(ValVT, ValReturned); } @@ -2488,8 +2527,10 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, SDValue X86TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, + uint32_t *RegMask) const { + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget.is64Bit(); @@ -2503,10 +2544,19 @@ SDValue X86TargetLowering::LowerCallResult( CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); + // In some calling conventions we need to remove the used registers + // from the register mask. 
+ if (RegMask) { + for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); + } + // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -2624,7 +2674,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: @@ -2643,13 +2693,13 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } -bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; - CallSite CS(CI); + ImmutableCallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; @@ -2669,6 +2719,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; + MVT PtrVT = getPointerTy(DAG.getDataLayout()); // If value is passed by pointer we have address passed instead of the value // itself. No need to extend if the mask value and location share the same @@ -2686,13 +2737,16 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // taken by a return address. int Offset = 0; if (CallConv == CallingConv::X86_INTR) { - const X86Subtarget& Subtarget = - static_cast<const X86Subtarget&>(DAG.getSubtarget()); // X86 interrupts may take one or two arguments. // On the stack there will be no return address as in regular call. // Offset of last argument need to be set to -4/-8 bytes. // Where offset of the first argument out of two, should be set to 0 bytes. Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + if (Subtarget.is64Bit() && Ins.size() == 2) { + // The stack pointer needs to be realigned for 64 bit handlers with error + // code, so the argument offset changes by 8 bytes. + Offset += 8; + } } // FIXME: For now, all byval parameter objects are marked mutable. This can be @@ -2707,30 +2761,74 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } - return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - } else { - int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8, - VA.getLocMemOffset(), isImmutable); - - // Set SExt or ZExt flag. - if (VA.getLocInfo() == CCValAssign::ZExt) { - MFI.setObjectZExt(FI, true); - } else if (VA.getLocInfo() == CCValAssign::SExt) { - MFI.setObjectSExt(FI, true); + return DAG.getFrameIndex(FI, PtrVT); + } + + // This is an argument in memory. We might be able to perform copy elision. 
+ if (Flags.isCopyElisionCandidate()) { + EVT ArgVT = Ins[i].ArgVT; + SDValue PartAddr; + if (Ins[i].PartOffset == 0) { + // If this is a one-part value or the first part of a multi-part value, + // create a stack object for the entire argument value type and return a + // load from our portion of it. This assumes that if the first part of an + // argument is in memory, the rest will also be in memory. + int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), + /*Immutable=*/false); + PartAddr = DAG.getFrameIndex(FI, PtrVT); + return DAG.getLoad( + ValVT, dl, Chain, PartAddr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + } else { + // This is not the first piece of an argument in memory. See if there is + // already a fixed stack object including this offset. If so, assume it + // was created by the PartOffset == 0 branch above and create a load from + // the appropriate offset into it. + int64_t PartBegin = VA.getLocMemOffset(); + int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; + int FI = MFI.getObjectIndexBegin(); + for (; MFI.isFixedObjectIndex(FI); ++FI) { + int64_t ObjBegin = MFI.getObjectOffset(FI); + int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); + if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) + break; + } + if (MFI.isFixedObjectIndex(FI)) { + SDValue Addr = + DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), + DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); + return DAG.getLoad( + ValVT, dl, Chain, Addr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, + Ins[i].PartOffset)); + } } + } - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } + int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, + VA.getLocMemOffset(), isImmutable); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue Val = DAG.getLoad( - ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - return ExtendedInMem ? - DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; + // Set SExt or ZExt flag. + if (VA.getLocInfo() == CCValAssign::ZExt) { + MFI.setObjectZExt(FI, true); + } else if (VA.getLocInfo() == CCValAssign::SExt) { + MFI.setObjectSExt(FI, true); } + + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI.setObjectOffset(FI, Offset); + } + + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + return ExtendedInMem + ? (VA.getValVT().isVector() + ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) + : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) + : Val; } // FIXME: Get this from tablegen. @@ -2781,12 +2879,14 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } +#ifndef NDEBUG static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) { return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), [](const CCValAssign &A, const CCValAssign &B) -> bool { return A.getValNo() < B.getValNo(); }); } +#endif SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2836,8 +2936,8 @@ SDValue X86TargetLowering::LowerFormalArguments( // The next loop assumes that the locations are in the same order of the // input arguments. 
- if (!isSortedByValueNo(ArgLocs)) - llvm_unreachable("Argument Location list must be sorted before lowering"); + assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; @@ -2853,7 +2953,7 @@ SDValue X86TargetLowering::LowerFormalArguments( "Currently the only custom case is when we split v64i1 to 2 regs"); // v64i1 values, in regcall calling convention, that are - // compiled to 32 bit arch, are splited up into two registers. + // compiled to 32 bit arch, are split up into two registers. ArgValue = getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); } else { @@ -2878,7 +2978,7 @@ SDValue X86TargetLowering::LowerFormalArguments( RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; - else if (RegVT == MVT::i1) + else if (RegVT == MVT::v1i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; @@ -3107,8 +3207,9 @@ SDValue X86TargetLowering::LowerFormalArguments( MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { - // X86 interrupts must pop the error code if present - FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); + // X86 interrupts must pop the error code (and the alignment padding) if + // present. + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. @@ -3146,6 +3247,13 @@ SDValue X86TargetLowering::LowerFormalArguments( } } + if (CallConv == CallingConv::X86_RegCall || + Fn->hasFnAttribute("no_caller_saved_registers")) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end())) + MF.getRegInfo().disableCalleeSavedRegister(Pair.first); + } + return Chain; } @@ -3232,6 +3340,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + const CallInst *CI = + CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr; + const Function *Fn = CI ? CI->getCalledFunction() : nullptr; + bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || + (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); @@ -3333,8 +3446,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!IsSibcall) - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, + NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. @@ -3348,8 +3461,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The next loop assumes that the locations are in the same order of the // input arguments. - if (!isSortedByValueNo(ArgLocs)) - llvm_unreachable("Argument Location list must be sorted before lowering"); + assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); // Walk the register/memloc assignments, inserting copies/loads. 
In the case // of tail call optimization arguments are handle later. @@ -3517,7 +3630,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (VA.isRegLoc()) { if (VA.needsCustom()) { assert((CallConv == CallingConv::X86_RegCall) && - "Expecting custome case only in regcall calling convention"); + "Expecting custom case only in regcall calling convention"); // This means that we are in special case where one argument was // passed through two register locations - Skip the next location ++I; @@ -3644,7 +3757,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. - const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); + // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we + // set X86_INTR calling convention because it has the same CSR mask + // (same preserved registers). + const uint32_t *Mask = RegInfo->getCallPreservedMask( + MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv); assert(Mask && "Missing call preserved mask for calling convention"); // If this is an invoke in a 32-bit function using a funclet-based @@ -3662,7 +3779,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Mask = RegInfo->getNoPreservedMask(); } - Ops.push_back(DAG.getRegisterMask(Mask)); + // Define a new register mask from the existing mask. + uint32_t *RegMask = nullptr; + + // In some calling conventions we need to remove the used physical registers + // from the reg mask. + if (CallConv == CallingConv::X86_RegCall || HasNCSR) { + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + + // Allocate a new Reg Mask and copy Mask. + RegMask = MF.allocateRegisterMask(TRI->getNumRegs()); + unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; + memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize); + + // Make sure all sub registers of the argument registers are reset + // in the RegMask. + for (auto const &RegPair : RegsToPass) + for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); + + // Create the RegMask Operand according to our updated mask. + Ops.push_back(DAG.getRegisterMask(RegMask)); + } else { + // Create the RegMask Operand according to the static mask. + Ops.push_back(DAG.getRegisterMask(Mask)); + } if (InFlag.getNode()) Ops.push_back(InFlag); @@ -3715,8 +3857,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Handle result values, copying them out of physregs into vregs that we // return. - return LowerCallResult(Chain, InFlag, CallConv, isVarArg, - Ins, dl, DAG, InVals); + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, + InVals, RegMask); } //===----------------------------------------------------------------------===// @@ -3847,6 +3989,13 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, if (Offset != MFI.getObjectOffset(FI)) return false; + // If this is not byval, check that the argument stack object is immutable. + // inalloca and argument copy elision can create mutable argument stack + // objects. Byval objects can be mutated, but a byval call intends to pass the + // mutated memory. 
+ if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) + return false; + if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) { // If the argument location is wider than the argument type, check that any // extension flags match. @@ -4088,6 +4237,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: + case X86ISD::EXTRQI: + case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -4132,6 +4283,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { return true; // 'Faux' Target Shuffles. case ISD::AND: + case X86ISD::ANDNP: return true; } } @@ -4448,6 +4600,11 @@ bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } +bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( + const Instruction &AndI) const { + return true; +} + bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { if (!Subtarget.hasBMI()) return false; @@ -4460,6 +4617,26 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { return true; } +MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { + MVT VT = MVT::getIntegerVT(NumBits); + if (isTypeLegal(VT)) + return VT; + + // PMOVMSKB can handle this. + if (NumBits == 128 && isTypeLegal(MVT::v16i8)) + return MVT::v16i8; + + // VPMOVMSKB can handle this. + if (NumBits == 256 && isTypeLegal(MVT::v32i8)) + return MVT::v32i8; + + // TODO: Allow 64-bit type for 32-bit target. + // TODO: 512-bit types should be allowed, but make sure that those + // cases are handled in combineVectorSizedSetCCEquality(). + + return MVT::INVALID_SIMPLE_VALUE_TYPE; +} + /// Val is the undef sentinel value or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return ((Val == SM_SentinelUndef) || (Val == CmpVal)); @@ -4555,28 +4732,30 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, SmallVectorImpl<int> &WidenedMask) { WidenedMask.assign(Mask.size() / 2, 0); for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + int M0 = Mask[i]; + int M1 = Mask[i + 1]; + // If both elements are undef, its trivial. - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { WidenedMask[i / 2] = SM_SentinelUndef; continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && - Mask[i + 1] % 2 == 1) { - WidenedMask[i / 2] = Mask[i + 1] / 2; + if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { + WidenedMask[i / 2] = M1 / 2; continue; } - if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { - WidenedMask[i / 2] = Mask[i] / 2; + if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { + WidenedMask[i / 2] = M0 / 2; continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. 
- if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { - if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && - (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { + if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && + (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { WidenedMask[i / 2] = SM_SentinelZero; continue; } @@ -4585,9 +4764,8 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, // Finally check if the two mask values are adjacent and aligned with // a pair. - if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && - Mask[i] + 1 == Mask[i + 1]) { - WidenedMask[i / 2] = Mask[i] / 2; + if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { + WidenedMask[i / 2] = M0 / 2; continue; } @@ -4608,7 +4786,7 @@ static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, SmallVectorImpl<int> &ScaledMask) { assert(0 < Scale && "Unexpected scaling factor"); int NumElts = Mask.size(); - ScaledMask.assign(NumElts * Scale, -1); + ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; @@ -4634,14 +4812,10 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { return false; // The index should be aligned on a vecWidth-bit boundary. - uint64_t Index = - cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - + uint64_t Index = N->getConstantOperandVal(1); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getScalarSizeInBits(); - bool Result = (Index * ElSize) % vecWidth == 0; - - return Result; + return (Index * ElSize) % vecWidth == 0; } /// Return true if the specified INSERT_SUBVECTOR @@ -4651,15 +4825,12 @@ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) return false; - // The index should be aligned on a vecWidth-bit boundary. - uint64_t Index = - cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + // The index should be aligned on a vecWidth-bit boundary. 
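canWidenShuffleElements above folds a mask over N elements into a mask over N/2 double-width elements whenever each adjacent pair lines up. A small standalone sketch of that pairing logic in plain C++; the sentinel values are the same ones the lowering uses:

#include <cstddef>
#include <vector>

constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero  = -2;

// Try to express a shuffle mask with elements of twice the width. Each pair
// of adjacent entries must be fully undef, fully zeroable, or name an
// adjacent and aligned pair of source elements.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  Widened.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];

    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      Widened[i / 2] = SM_SentinelUndef;          // both halves undef
    } else if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      Widened[i / 2] = M1 / 2;                    // undef low half, aligned high half
    } else if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      Widened[i / 2] = M0 / 2;                    // aligned low half, undef high half
    } else if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      bool Z0 = (M0 == SM_SentinelZero || M0 == SM_SentinelUndef);
      bool Z1 = (M1 == SM_SentinelZero || M1 == SM_SentinelUndef);
      if (!(Z0 && Z1))
        return false;                             // zero must cover the whole pair
      Widened[i / 2] = SM_SentinelZero;
    } else if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) {
      Widened[i / 2] = M0 / 2;                    // adjacent, aligned pair
    } else {
      return false;                               // this pair cannot be widened
    }
  }
  return true;
}

// Example: {0, 1, 6, 7} widens to {0, 3}; {0, 2, 4, 6} does not widen.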
+ uint64_t Index = N->getConstantOperandVal(2); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getScalarSizeInBits(); - bool Result = (Index * ElSize) % vecWidth == 0; - - return Result; + return (Index * ElSize) % vecWidth == 0; } bool X86::isVINSERT128Index(SDNode *N) { @@ -4683,13 +4854,9 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && "Illegal extract subvector for VEXTRACT"); - uint64_t Index = - cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - + uint64_t Index = N->getConstantOperandVal(1); MVT VecVT = N->getOperand(0).getSimpleValueType(); - MVT ElVT = VecVT.getVectorElementType(); - - unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); + unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits(); return Index / NumElemsPerChunk; } @@ -4698,13 +4865,9 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && "Illegal insert subvector for VINSERT"); - uint64_t Index = - cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); - + uint64_t Index = N->getConstantOperandVal(2); MVT VecVT = N->getSimpleValueType(0); - MVT ElVT = VecVT.getVectorElementType(); - - unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); + unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits(); return Index / NumElemsPerChunk; } @@ -4737,9 +4900,9 @@ bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); } -// Build a vector of constants +// Build a vector of constants. // Use an UNDEF node if MaskElt == -1. -// Spilt 64-bit constants in the 32-bit mode. +// Split 64-bit constants in the 32-bit mode. static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask = false) { @@ -4770,9 +4933,10 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, return ConstsNode; } -static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs, +static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { - assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays"); + assert(Bits.size() == Undefs.getBitWidth() && + "Unequal constant and undef arrays"); SmallVector<SDValue, 32> Ops; bool Split = false; @@ -4844,10 +5008,6 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); - // Extract from UNDEF is UNDEF. - if (Vec.isUndef()) - return DAG.getUNDEF(ResultVT); - // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); @@ -4858,8 +5018,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, // If the input is a buildvector just emit a smaller one. 
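The VEXTRACT/VINSERT helpers above reduce subvector legality and immediate selection to simple arithmetic: the element index must start on a 128- or 256-bit chunk boundary, and the instruction immediate counts whole chunks rather than elements. A sketch of just that arithmetic, with illustrative parameter names:

#include <cassert>

// True when the element index starts on a chunk (128/256-bit) boundary.
static bool isAlignedSubvectorIndex(unsigned Index, unsigned EltSizeInBits,
                                    unsigned ChunkWidthInBits) {
  return (Index * EltSizeInBits) % ChunkWidthInBits == 0;
}

// Convert an aligned element index into the chunk-counting immediate.
static unsigned subvectorImmediate(unsigned Index, unsigned EltSizeInBits,
                                   unsigned ChunkWidthInBits) {
  assert(isAlignedSubvectorIndex(Index, EltSizeInBits, ChunkWidthInBits));
  unsigned EltsPerChunk = ChunkWidthInBits / EltSizeInBits;
  return Index / EltsPerChunk;
}

// Example: element index 8 of a v16i32, taken as a 128-bit subvector,
// yields immediate 8 / (128 / 32) = 2, i.e. the third 128-bit chunk.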
if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getBuildVector( + ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -4918,50 +5078,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); - - // For insertion into the zero index (low half) of a 256-bit vector, it is - // more efficient to generate a blend with immediate instead of an insert*128. - // We are still creating an INSERT_SUBVECTOR below with an undef node to - // extend the subvector to the size of the result vector. Make sure that - // we are not recursing on that node by checking for undef here. - if (IdxVal == 0 && Result.getValueType().is256BitVector() && - !Result.isUndef()) { - EVT ResultVT = Result.getValueType(); - SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(ResultVT); - SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, - Vec, ZeroIndex); - - // The blend instruction, and therefore its mask, depend on the data type. - MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); - if (ScalarType.isFloatingPoint()) { - // Choose either vblendps (float) or vblendpd (double). - unsigned ScalarSize = ScalarType.getSizeInBits(); - assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); - unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; - SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); - return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); - } - - const X86Subtarget &Subtarget = - static_cast<const X86Subtarget &>(DAG.getSubtarget()); - - // AVX2 is needed for 256-bit integer blend support. - // Integers must be cast to 32-bit because there is only vpblendd; - // vpblendw can't be used for this because it has a handicapped mask. - - // If we don't have AVX2, then cast to float. Using a wrong domain blend - // is still more efficient than using the wrong domain vinsertf128 that - // will be created by InsertSubVector(). - MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; - - SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); - Result = DAG.getBitcast(CastVT, Result); - Vec256 = DAG.getBitcast(CastVT, Vec256); - Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); - return DAG.getBitcast(ResultVT, Vec256); - } - return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } @@ -4971,6 +5087,20 @@ static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } +// Return true if the instruction zeroes the unused upper part of the +// destination and accepts mask. +static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { + switch (Opcode) { + default: + return false; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return true; + } +} + /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -5003,6 +5133,22 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // 3. 
Subvector should be inserted in the middle (for example v2i1 // to v16i1, index 2) + // If this node widens - by concatenating zeroes - the type of the result + // of a node with instruction that zeroes all upper (irrelevant) bits of the + // output register, mark this node as legal to enable replacing them with + // the v8i1 version of the previous instruction during instruction selection. + // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg, + // while zeroing all the upper remaining 60 bits of the register. if the + // result of such instruction is inserted into an allZeroVector, then we can + // safely remove insert_vector (in instruction selection) as the cmp instr + // already zeroed the rest of the register. + if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 && + (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) || + (SubVec.getOpcode() == ISD::AND && + (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) || + isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode()))))) + return Op; + // extend to natively supported kshift MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; MVT WideOpVT = OpVT; @@ -5023,7 +5169,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (Vec.isUndef()) { if (IdxVal != 0) { SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits); + WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, + ShiftBits); } return ExtractSubVec(WideSubVec); } @@ -5032,9 +5179,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec; return ExtractSubVec(Vec); } @@ -5043,8 +5190,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. 
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), @@ -5056,12 +5203,12 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { // Zero upper bits of the Vec - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, + WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); return ExtractSubVec(Vec); } @@ -5094,26 +5241,38 @@ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT, } /// Returns a vector of specified type with all bits set. -/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with -/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. /// Then bitcast to their original type, ensuring they get CSE'd. -static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget, - SelectionDAG &DAG, const SDLoc &dl) { +static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); APInt Ones = APInt::getAllOnesValue(32); unsigned NumElts = VT.getSizeInBits() / 32; - SDValue Vec; - if (!Subtarget.hasInt256() && NumElts == 8) { - Vec = DAG.getConstant(Ones, dl, MVT::v4i32); - Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); - } else { - Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); - } + SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } +static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, + SelectionDAG &DAG) { + EVT InVT = In.getValueType(); + assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode"); + + if (VT.is128BitVector() && InVT.is128BitVector()) + return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT) + : DAG.getZeroExtendVectorInReg(In, DL, VT); + + // For 256-bit vectors, we only need the lower (128-bit) input half. + // For 512-bit vectors, we only need the lower input half or quarter. + if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) { + int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); + In = extractSubVector(In, 0, DAG, DL, + std::max(128, (int)VT.getSizeInBits() / Scale)); + } + + return DAG.getNode(Opc, DL, VT, In); +} + /// Generate unpacklo/unpackhi shuffle mask. static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo, bool Unary) { @@ -5199,9 +5358,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) { // Extract raw constant bits from constant pools. 
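insert1BitVector above zeroes bit ranges of a mask register with KSHIFTR/KSHIFTL pairs rather than an AND, since that maps directly onto k-register instructions. The same trick on a plain 16-bit integer standing in for a v16i1 k-register; this is an illustrative model only, not the lowering itself:

#include <cstdint>

// Bit i of the integer models element i of the v16i1 mask; KSHIFTL/KSHIFTR
// then behave like ordinary logical shifts.
using KReg16 = uint16_t;

// Zero the low N bits the way the lowering does it: shift them out to the
// right, then shift back left.
static KReg16 zeroLowBits(KReg16 V, unsigned N) {
  return (KReg16)((KReg16)(V >> N) << N);
}

// Zero the high N bits: shift them out to the left, then shift back right.
static KReg16 zeroHighBits(KReg16 V, unsigned N) {
  return (KReg16)((KReg16)(V << N) >> N);
}

// Inserting an N-element i1 subvector at index 0: clear the low N bits of
// the destination, zero-extend the subvector, and OR the two together.
static KReg16 insertSubvectorAtZero(KReg16 Vec, KReg16 Sub, unsigned N) {
  return (KReg16)(zeroLowBits(Vec, N) | zeroHighBits(Sub, 16u - N));
}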
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, - SmallBitVector &UndefElts, - SmallVectorImpl<APInt> &EltBits) { - assert(UndefElts.empty() && "Expected an empty UndefElts vector"); + APInt &UndefElts, + SmallVectorImpl<APInt> &EltBits, + bool AllowWholeUndefs = true, + bool AllowPartialUndefs = true) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); @@ -5211,174 +5371,173 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; - // Extract all the undef/constant element data and pack into single bitsets. - APInt UndefBits(SizeInBits, 0); - APInt MaskBits(SizeInBits, 0); + // Bitcast a source array of element bits to the target size. + auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) { + unsigned NumSrcElts = UndefSrcElts.getBitWidth(); + unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); + assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && + "Constant bit sizes don't match"); + + // Don't split if we don't allow undef bits. + bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; + if (UndefSrcElts.getBoolValue() && !AllowUndefs) + return false; + + // If we're already the right size, don't bother bitcasting. + if (NumSrcElts == NumElts) { + UndefElts = UndefSrcElts; + EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); + return true; + } + + // Extract all the undef/constant element data and pack into single bitsets. + APInt UndefBits(SizeInBits, 0); + APInt MaskBits(SizeInBits, 0); - // Split the undef/constant single bitset data into the target elements. - auto SplitBitData = [&]() { - UndefElts = SmallBitVector(NumElts, false); + for (unsigned i = 0; i != NumSrcElts; ++i) { + unsigned BitOffset = i * SrcEltSizeInBits; + if (UndefSrcElts[i]) + UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); + MaskBits.insertBits(SrcEltBits[i], BitOffset); + } + + // Split the undef/constant single bitset data into the target elements. + UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { - APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits); - UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits); + unsigned BitOffset = i * EltSizeInBits; + APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); - // Only treat an element as UNDEF if all bits are UNDEF, otherwise - // treat it as zero. + // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnesValue()) { - UndefElts[i] = true; + if (!AllowWholeUndefs) + return false; + UndefElts.setBit(i); continue; } - APInt Bits = MaskBits.lshr(i * EltSizeInBits); - Bits = Bits.zextOrTrunc(EltSizeInBits); + // If only some bits are UNDEF then treat them as zero (or bail if not + // supported). + if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) + return false; + + APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); EltBits[i] = Bits.getZExtValue(); } return true; }; - auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask, - APInt &Undefs) { + // Collect constant bits and insert into mask/undef bit masks. 
+ auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, + unsigned UndefBitIndex) { if (!Cst) return false; - unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits(); if (isa<UndefValue>(Cst)) { - Mask = APInt::getNullValue(SizeInBits); - Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits); + Undefs.setBit(UndefBitIndex); return true; } if (auto *CInt = dyn_cast<ConstantInt>(Cst)) { - Mask = CInt->getValue().zextOrTrunc(SizeInBits); - Undefs = APInt::getNullValue(SizeInBits); + Mask = CInt->getValue(); return true; } if (auto *CFP = dyn_cast<ConstantFP>(Cst)) { - Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); - Undefs = APInt::getNullValue(SizeInBits); + Mask = CFP->getValueAPF().bitcastToAPInt(); return true; } return false; }; + // Extract constant bits from build vector. + if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + + APInt UndefSrcElts(NumSrcElts, 0); + SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + const SDValue &Src = Op.getOperand(i); + if (Src.isUndef()) { + UndefSrcElts.setBit(i); + continue; + } + auto *Cst = cast<ConstantSDNode>(Src); + SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); + } + return CastBitData(UndefSrcElts, SrcEltBits); + } + // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits())) return false; - unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); - for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) { - APInt Bits, Undefs; - if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs)) + unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); + unsigned NumSrcElts = CstTy->getVectorNumElements(); + + APInt UndefSrcElts(NumSrcElts, 0); + SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); + for (unsigned i = 0; i != NumSrcElts; ++i) + if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], + UndefSrcElts, i)) return false; - MaskBits |= Bits.shl(i * CstEltSizeInBits); - UndefBits |= Undefs.shl(i * CstEltSizeInBits); - } - return SplitBitData(); + return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from a broadcasted constant pool scalar. 
if (Op.getOpcode() == X86ISD::VBROADCAST && - EltSizeInBits <= Op.getScalarValueSizeInBits()) { + EltSizeInBits <= VT.getScalarSizeInBits()) { if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { - APInt Bits, Undefs; - if (ExtractConstantBits(Broadcast, Bits, Undefs)) { - unsigned NumBroadcastBits = Op.getScalarValueSizeInBits(); - unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits; - for (unsigned i = 0; i != NumBroadcastElts; ++i) { - MaskBits |= Bits.shl(i * NumBroadcastBits); - UndefBits |= Undefs.shl(i * NumBroadcastBits); - } - return SplitBitData(); + unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + + APInt UndefSrcElts(NumSrcElts, 0); + SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); + if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { + if (UndefSrcElts[0]) + UndefSrcElts.setBits(0, NumSrcElts); + SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); + return CastBitData(UndefSrcElts, SrcEltBits); } } } + // Extract a rematerialized scalar constant insertion. + if (Op.getOpcode() == X86ISD::VZEXT_MOVL && + Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) { + unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + + APInt UndefSrcElts(NumSrcElts, 0); + SmallVector<APInt, 64> SrcEltBits; + auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0)); + SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits)); + SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); + return CastBitData(UndefSrcElts, SrcEltBits); + } + return false; } -// TODO: Merge more of this with getTargetConstantBitsFromNode. static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl<uint64_t> &RawMask) { - MaskNode = peekThroughBitcasts(MaskNode); - - MVT VT = MaskNode.getSimpleValueType(); - assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); - unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits; - - // Split an APInt element into MaskEltSizeInBits sized pieces and - // insert into the shuffle mask. - auto SplitElementToMask = [&](APInt Element) { - // Note that this is x86 and so always little endian: the low byte is - // the first byte of the mask. 
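The rewritten getTargetConstantBitsFromNode above funnels every constant source (build vectors, constant-pool loads, broadcasts, moved scalars) through one bit-recasting step: pack the source element bits and undef bits into wide buffers, then re-split them at the requested element width, treating a target element as undef only when all of its bits are undef and partially-undef bits as zero. A standalone sketch of just that step, capped at 64 bits total so plain integers can stand in for the APInt buffers:

#include <cstdint>
#include <vector>

struct RecastResult {
  std::vector<uint64_t> Elts; // DstBits-wide values (low bits used)
  std::vector<bool> Undef;    // true if the whole destination element is undef
};

static bool recastConstantBits(const std::vector<uint64_t> &SrcElts,
                               const std::vector<bool> &SrcUndef,
                               unsigned SrcBits, unsigned DstBits,
                               RecastResult &Out) {
  unsigned TotalBits = SrcBits * (unsigned)SrcElts.size();
  if (SrcUndef.size() != SrcElts.size() || TotalBits == 0 ||
      TotalBits > 64 || TotalBits % DstBits != 0)
    return false;

  // Pack value bits and undef bits into single little-endian buffers.
  uint64_t ValueBits = 0, UndefBits = 0;
  uint64_t SrcMask = (SrcBits == 64) ? ~0ULL : ((1ULL << SrcBits) - 1);
  for (size_t i = 0; i != SrcElts.size(); ++i) {
    if (SrcUndef[i])
      UndefBits |= SrcMask << (i * SrcBits);
    else
      ValueBits |= (SrcElts[i] & SrcMask) << (i * SrcBits);
  }

  // Re-split into destination elements. An element is undef only if every
  // one of its bits came from an undef source; partially-undef bits read as 0.
  unsigned NumDst = TotalBits / DstBits;
  uint64_t DstMask = (DstBits == 64) ? ~0ULL : ((1ULL << DstBits) - 1);
  Out.Elts.assign(NumDst, 0);
  Out.Undef.assign(NumDst, false);
  for (unsigned i = 0; i != NumDst; ++i) {
    uint64_t U = (UndefBits >> (i * DstBits)) & DstMask;
    if (U == DstMask) {
      Out.Undef[i] = true;
      continue;
    }
    Out.Elts[i] = (ValueBits >> (i * DstBits)) & DstMask;
  }
  return true;
}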
- int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits; - for (int i = 0; i < Split; ++i) { - APInt RawElt = Element.getLoBits(MaskEltSizeInBits); - Element = Element.lshr(MaskEltSizeInBits); - RawMask.push_back(RawElt.getZExtValue()); - } - }; - - if (MaskNode.getOpcode() == X86ISD::VBROADCAST) { - // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 - // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0 - if (VT.getScalarSizeInBits() != MaskEltSizeInBits) - return false; - if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) { - const APInt &MaskElement = CN->getAPIntValue(); - for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits); - RawMask.push_back(RawElt.getZExtValue()); - } - } - return false; - } - - if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL && - MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) { - SDValue MaskOp = MaskNode.getOperand(0).getOperand(0); - if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) { - if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) { - RawMask.push_back(CN->getZExtValue()); - RawMask.append(NumMaskElts - 1, 0); - return true; - } - - if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) { - unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits; - SplitElementToMask(CN->getAPIntValue()); - RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0); - return true; - } - } - return false; - } - - if (MaskNode.getOpcode() != ISD::BUILD_VECTOR) - return false; - - // We can always decode if the buildvector is all zero constants, - // but can't use isBuildVectorAllZeros as it might contain UNDEFs. - if (all_of(MaskNode->ops(), X86::isZeroNode)) { - RawMask.append(NumMaskElts, 0); - return true; - } - - // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 - if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0) + APInt UndefElts; + SmallVector<APInt, 64> EltBits; + + // Extract the raw target constant bits. + // FIXME: We currently don't support UNDEF bits or mask entries. + if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, + EltBits, /* AllowWholeUndefs */ false, + /* AllowPartialUndefs */ false)) return false; - for (SDValue Op : MaskNode->ops()) { - if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode())) - SplitElementToMask(CN->getAPIntValue()); - else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode())) - SplitElementToMask(CFN->getValueAPF().bitcastToAPInt()); - else - return false; - } + // Insert the extracted elements into the mask. 
+ for (APInt Elt : EltBits) + RawMask.push_back(Elt.getZExtValue()); return true; } @@ -5405,6 +5564,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::BLENDI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); @@ -5416,6 +5576,24 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::EXTRQI: + if (isa<ConstantSDNode>(N->getOperand(1)) && + isa<ConstantSDNode>(N->getOperand(2))) { + int BitLen = N->getConstantOperandVal(1); + int BitIdx = N->getConstantOperandVal(2); + DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask); + IsUnary = true; + } + break; + case X86ISD::INSERTQI: + if (isa<ConstantSDNode>(N->getOperand(2)) && + isa<ConstantSDNode>(N->getOperand(3))) { + int BitLen = N->getConstantOperandVal(2); + int BitIdx = N->getConstantOperandVal(3); + DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + } + break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -5473,8 +5651,18 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, IsUnary = true; break; case X86ISD::VBROADCAST: { - // We only decode broadcasts of same-sized vectors at the moment. - if (N->getOperand(0).getValueType() == VT) { + SDValue N0 = N->getOperand(0); + // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, + // add the pre-extracted value to the Ops vector. + if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N0.getOperand(0).getValueType() == VT && + N0.getConstantOperandVal(1) == 0) + Ops.push_back(N0.getOperand(0)); + + // We only decode broadcasts of same-sized vectors, unless the broadcast + // came from an extract from the original width. If we found one, we + // pushed it the Ops vector above. + if (N0.getValueType() == VT || !Ops.empty()) { DecodeVectorBroadcast(VT, Mask); IsUnary = true; break; @@ -5669,6 +5857,19 @@ static bool setTargetShuffleZeroElements(SDValue N, V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); + assert((VT.getSizeInBits() % Mask.size()) == 0 && + "Illegal split of shuffle value type"); + unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); + + // Extract known constant input data. + APInt UndefSrcElts[2]; + SmallVector<APInt, 32> SrcEltBits[2]; + bool IsSrcConstant[2] = { + getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], + SrcEltBits[0], true, false), + getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], + SrcEltBits[1], true, false)}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; @@ -5677,6 +5878,7 @@ static bool setTargetShuffleZeroElements(SDValue N, continue; // Determine shuffle input and normalize the mask. + unsigned SrcIdx = M / Size; SDValue V = M < Size ? V1 : V2; M %= Size; @@ -5686,39 +5888,27 @@ static bool setTargetShuffleZeroElements(SDValue N, continue; } - // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. - if (V.getOpcode() != ISD::BUILD_VECTOR) - continue; - - // If the BUILD_VECTOR has fewer elements then the (larger) source - // element must be UNDEF/ZERO. 
- // TODO: Is it worth testing the individual bits of a constant? - if ((Size % V.getNumOperands()) == 0) { - int Scale = Size / V->getNumOperands(); - SDValue Op = V.getOperand(M / Scale); - if (Op.isUndef()) + // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. + // TODO: We currently only set UNDEF for integer types - floats use the same + // registers as vectors and many of the scalar folded loads rely on the + // SCALAR_TO_VECTOR pattern. + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + (Size % V.getValueType().getVectorNumElements()) == 0) { + int Scale = Size / V.getValueType().getVectorNumElements(); + int Idx = M / Scale; + if (Idx != 0 && !VT.isFloatingPoint()) Mask[i] = SM_SentinelUndef; - else if (X86::isZeroNode(Op)) + else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) Mask[i] = SM_SentinelZero; continue; } - // If the BUILD_VECTOR has more elements then all the (smaller) source - // elements must be all UNDEF or all ZERO. - if ((V.getNumOperands() % Size) == 0) { - int Scale = V->getNumOperands() / Size; - bool AllUndef = true; - bool AllZero = true; - for (int j = 0; j < Scale; ++j) { - SDValue Op = V.getOperand((M * Scale) + j); - AllUndef &= Op.isUndef(); - AllZero &= X86::isZeroNode(Op); - } - if (AllUndef) + // Attempt to extract from the source's constant bits. + if (IsSrcConstant[SrcIdx]) { + if (UndefSrcElts[SrcIdx][M]) Mask[i] = SM_SentinelUndef; - else if (AllZero) + else if (SrcEltBits[SrcIdx][M] == 0) Mask[i] = SM_SentinelZero; - continue; } } @@ -5731,7 +5921,8 @@ static bool setTargetShuffleZeroElements(SDValue N, // The decoded shuffle mask may contain a different number of elements to the // destination value type. static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, - SmallVectorImpl<SDValue> &Ops) { + SmallVectorImpl<SDValue> &Ops, + SelectionDAG &DAG) { Mask.clear(); Ops.clear(); @@ -5744,11 +5935,16 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, unsigned Opcode = N.getOpcode(); switch (Opcode) { - case ISD::AND: { + case ISD::AND: + case X86ISD::ANDNP: { // Attempt to decode as a per-byte mask. - SmallBitVector UndefElts; + APInt UndefElts; SmallVector<APInt, 32> EltBits; - if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits)) + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + bool IsAndN = (X86ISD::ANDNP == Opcode); + uint64_t ZeroMask = IsAndN ? 255 : 0; + if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { if (UndefElts[i]) { @@ -5758,9 +5954,93 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, uint64_t ByteBits = EltBits[i].getZExtValue(); if (ByteBits != 0 && ByteBits != 255) return false; - Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i); + Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); + } + Ops.push_back(IsAndN ? N1 : N0); + return true; + } + case ISD::SCALAR_TO_VECTOR: { + // Match against a scalar_to_vector of an extract from a vector, + // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. 
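The ISD::AND / X86ISD::ANDNP case of getFauxShuffleMask above treats an all-0x00/0xff byte constant as a byte shuffle that either keeps or zeroes each lane. A minimal standalone decode of that idea in plain C++; the input is the constant's byte values, with -1 marking an undef constant byte:

#include <optional>
#include <vector>

constexpr int SM_SentinelUndef = -1;
constexpr int SM_SentinelZero  = -2;

// A 0x00 byte selects zero and a 0xff byte passes the input byte through;
// anything else is not a shuffle. For ANDNP the constant is inverted before
// the AND, so the roles of 0x00 and 0xff swap.
static std::optional<std::vector<int>>
decodeByteAndMask(const std::vector<int> &MaskBytes, bool IsAndNot) {
  const int ZeroByte = IsAndNot ? 0xff : 0x00;
  std::vector<int> ShuffleMask;
  for (size_t i = 0; i != MaskBytes.size(); ++i) {
    int B = MaskBytes[i];
    if (B < 0) {                          // undef constant byte
      ShuffleMask.push_back(SM_SentinelUndef);
      continue;
    }
    if (B != 0x00 && B != 0xff)           // partial byte masks are not shuffles
      return std::nullopt;
    ShuffleMask.push_back(B == ZeroByte ? SM_SentinelZero : (int)i);
  }
  return ShuffleMask;
}

// Example: decodeByteAndMask({0xff, 0x00, 0xff, -1}, false)
// yields {0, SM_SentinelZero, 2, SM_SentinelUndef}.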
+ SDValue N0 = N.getOperand(0); + SDValue SrcExtract; + + if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + N0.getOperand(0).getValueType() == VT) { + SrcExtract = N0; + } else if (N0.getOpcode() == ISD::AssertZext && + N0.getOperand(0).getOpcode() == X86ISD::PEXTRW && + cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) { + SrcExtract = N0.getOperand(0); + assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16); + } else if (N0.getOpcode() == ISD::AssertZext && + N0.getOperand(0).getOpcode() == X86ISD::PEXTRB && + cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) { + SrcExtract = N0.getOperand(0); + assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8); + } + + if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1))) + return false; + + SDValue SrcVec = SrcExtract.getOperand(0); + EVT SrcVT = SrcVec.getValueType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; + + unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); + if (NumSrcElts <= SrcIdx) + return false; + + Ops.push_back(SrcVec); + Mask.push_back(SrcIdx); + Mask.append(NumZeros, SM_SentinelZero); + Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); + return true; + } + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + SDValue InVec = N.getOperand(0); + SDValue InScl = N.getOperand(1); + uint64_t InIdx = N.getConstantOperandVal(2); + assert(InIdx < NumElts && "Illegal insertion index"); + + // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. + if (X86::isZeroNode(InScl)) { + Ops.push_back(InVec); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); + return true; } + + // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern. + // TODO: Expand this to support INSERT_VECTOR_ELT/etc. + unsigned ExOp = + (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); + if (InScl.getOpcode() != ISD::AssertZext || + InScl.getOperand(0).getOpcode() != ExOp) + return false; + + SDValue ExVec = InScl.getOperand(0).getOperand(0); + uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1); + assert(ExIdx < NumElts && "Illegal extraction index"); + Ops.push_back(InVec); + Ops.push_back(ExVec); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i == InIdx ? NumElts + ExIdx : i); + return true; + } + case X86ISD::PACKSS: { + // If we know input saturation won't happen we can treat this + // as a truncation shuffle. + if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt || + DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt) + return false; + Ops.push_back(N.getOperand(0)); + Ops.push_back(N.getOperand(1)); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i * 2); return true; } case X86ISD::VSHLI: @@ -5795,6 +6075,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, } return true; } + case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VZEXT: { // TODO - add support for VPMOVZX with smaller input vector types. SDValue Src = N.getOperand(0); @@ -5810,36 +6091,39 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return false; } +/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. 
+static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask) { + int MaskWidth = Mask.size(); + SmallVector<SDValue, 16> UsedInputs; + for (int i = 0, e = Inputs.size(); i < e; ++i) { + int lo = UsedInputs.size() * MaskWidth; + int hi = lo + MaskWidth; + if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { + UsedInputs.push_back(Inputs[i]); + continue; + } + for (int &M : Mask) + if (lo <= M) + M -= MaskWidth; + } + Inputs = UsedInputs; +} + /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the /// remaining input indices in case we now have a unary shuffle and adjust the -/// Op0/Op1 inputs accordingly. +/// inputs accordingly. /// Returns true if the target shuffle mask was decoded. -static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1, - SmallVectorImpl<int> &Mask) { - SmallVector<SDValue, 2> Ops; - if (!setTargetShuffleZeroElements(Op, Mask, Ops)) - if (!getFauxShuffleMask(Op, Mask, Ops)) +static bool resolveTargetShuffleInputs(SDValue Op, + SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask, + SelectionDAG &DAG) { + if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) + if (!getFauxShuffleMask(Op, Mask, Inputs, DAG)) return false; - int NumElts = Mask.size(); - bool Op0InUse = any_of(Mask, [NumElts](int Idx) { - return 0 <= Idx && Idx < NumElts; - }); - bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; }); - - Op0 = Op0InUse ? Ops[0] : SDValue(); - Op1 = Op1InUse ? Ops[1] : SDValue(); - - // We're only using Op1 - commute the mask and inputs. - if (!Op0InUse && Op1InUse) { - for (int &M : Mask) - if (NumElts <= M) - M -= NumElts; - Op0 = Op1; - Op1 = SDValue(); - } - + resolveTargetShuffleInputsAndMask(Inputs, Mask); return true; } @@ -5914,11 +6198,10 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, - unsigned NumNonZero, unsigned NumZero, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { - if (NumNonZero > 8) + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (NumNonZero > 8 && !Subtarget.hasSSE41()) return SDValue(); SDLoc dl(Op); @@ -5928,18 +6211,26 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget.hasSSE41()) { for (unsigned i = 0; i < 16; ++i) { - bool isNonZero = (NonZeros & (1 << i)) != 0; - if (isNonZero) { + bool IsNonZero = (NonZeros & (1 << i)) != 0; + if (IsNonZero) { + // If the build vector contains zeros or our first insertion is not the + // first index then insert into zero vector to break any register + // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. 
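resolveTargetShuffleInputsAndMask above prunes shuffle inputs that the mask never references and slides the remaining indices down so the surviving inputs stay densely numbered. The same bookkeeping on plain containers, as an illustrative sketch in which strings stand in for the SDValue operands:

#include <algorithm>
#include <string>
#include <vector>

// Drop unused shuffle inputs and rebase the mask so that surviving input k
// owns the index range [k*MaskWidth, (k+1)*MaskWidth).
static void dropUnusedShuffleInputs(std::vector<std::string> &Inputs,
                                    std::vector<int> &Mask) {
  const int MaskWidth = (int)Mask.size();
  std::vector<std::string> Used;
  for (const std::string &In : Inputs) {
    int Lo = (int)Used.size() * MaskWidth;
    int Hi = Lo + MaskWidth;
    bool Referenced = std::any_of(Mask.begin(), Mask.end(),
                                  [&](int M) { return Lo <= M && M < Hi; });
    if (Referenced) {
      Used.push_back(In);        // keep: it owns the next index range
      continue;
    }
    // Unused input: every index at or past this range slides down one width.
    for (int &M : Mask)
      if (M >= Lo)
        M -= MaskWidth;
  }
  Inputs = Used;
}

// Example: a two-input shuffle whose mask only references the second input
// ends up with a single input and indices rebased into [0, MaskWidth).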
if (First) { - if (NumZero) - V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); - else - V = DAG.getUNDEF(MVT::v16i8); First = false; + if (NumZero || 0 != i) + V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); + else { + assert(0 == i && "Expected insertion into zero-index"); + V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v16i8, V); + continue; + } } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, - MVT::v16i8, V, Op.getOperand(i), - DAG.getIntPtrConstant(i, dl)); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } @@ -5958,24 +6249,35 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } if ((i & 1) != 0) { + // FIXME: Investigate extending to i32 instead of just i16. + // FIXME: Investigate combining the first 4 bytes as a i32 instead. SDValue ThisElt, LastElt; - bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; if (LastIsNonZero) { - LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, - MVT::i16, Op.getOperand(i-1)); + LastElt = + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); - ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, - ThisElt, DAG.getConstant(8, dl, MVT::i8)); + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, + DAG.getConstant(8, dl, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; - if (ThisElt.getNode()) - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, - DAG.getIntPtrConstant(i/2, dl)); + if (ThisElt) { + if (1 == i) { + V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) + : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v8i16, V); + } else { + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, + DAG.getIntPtrConstant(i / 2, dl)); + } + } } } @@ -5986,27 +6288,34 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { - if (NumNonZero > 4) + const X86Subtarget &Subtarget) { + if (NumNonZero > 4 && !Subtarget.hasSSE41()) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { - bool isNonZero = (NonZeros & (1 << i)) != 0; - if (isNonZero) { + bool IsNonZero = (NonZeros & (1 << i)) != 0; + if (IsNonZero) { + // If the build vector contains zeros or our first insertion is not the + // first index then insert into zero vector to break any register + // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. 
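On pre-SSE4.1 targets, the v16i8 build-vector path above packs each pair of adjacent bytes into one 16-bit lane before inserting with PINSRW: the even-indexed byte lands in the low half and the odd-indexed byte in the high half, matching the little-endian v16i8/v8i16 layout. The packing itself, modeled on plain integers as a sketch:

#include <array>
#include <cstdint>

// Combine 16 byte values into the 8 i16 lanes PINSRW would build.
static std::array<uint16_t, 8> packBytePairs(const std::array<uint8_t, 16> &B) {
  std::array<uint16_t, 8> Lanes{};
  for (unsigned i = 1; i < 16; i += 2)
    Lanes[i / 2] = (uint16_t)((uint16_t)(B[i] << 8) | B[i - 1]); // hi byte | lo byte
  return Lanes;
}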
if (First) { - if (NumZero) - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); - else - V = DAG.getUNDEF(MVT::v8i16); First = false; + if (NumZero || 0 != i) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else { + assert(0 == i && "Expected insertion into zero-index"); + V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v8i16, V); + continue; + } } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, - MVT::v8i16, V, Op.getOperand(i), - DAG.getIntPtrConstant(i, dl)); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } @@ -6015,8 +6324,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { + const X86Subtarget &Subtarget) { // Find all zeroable elements. std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { @@ -6064,7 +6372,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. - EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue(); + EltMaskIdx = Elt.getConstantOperandVal(1); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; @@ -6095,8 +6403,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; - CanFold = SrcVector == V1 && - cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; + CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i); } if (!CanFold) @@ -6212,7 +6519,8 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, /// /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, - SDLoc &DL, SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget, bool isAfterLegalize) { unsigned NumElems = Elts.size(); @@ -6288,16 +6596,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); - - if (LDBase->hasAnyUseOfValue(1)) { - SDValue NewChain = - DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), - SDValue(NewLd.getNode(), 1)); - DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); - DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), - SDValue(NewLd.getNode(), 1)); - } - + DAG.makeEquivalentMemoryOrdering(LDBase, NewLd); return NewLd; }; @@ -6317,6 +6616,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); + // Don't create 256-bit non-temporal aligned loads without AVX2 as these + // will lower to regular temporal loads and use the cache. 
+ if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 && + VT.is256BitVector() && !Subtarget.hasInt256()) + return SDValue(); + if (IsConsecutiveLoad) return CreateLoad(VT, LDBase); @@ -6356,19 +6661,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); - - // Make sure the newly-created LOAD is in the same position as LDBase in - // terms of dependency. We create a TokenFactor for LDBase and ResNode, - // and update uses of LDBase's output chain to use the TokenFactor. - if (LDBase->hasAnyUseOfValue(1)) { - SDValue NewChain = - DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), - SDValue(ResNode.getNode(), 1)); - DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); - DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), - SDValue(ResNode.getNode(), 1)); - } - + DAG.makeEquivalentMemoryOrdering(LDBase, ResNode); return DAG.getBitcast(VT, ResNode); } } @@ -6376,22 +6669,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, return SDValue(); } -static Constant *getConstantVector(MVT VT, APInt SplatValue, +static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector<Constant *, 32> ConstantVec; for (unsigned i = 0; i < NumElm; i++) { - APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize); + APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { - assert((ScalarSize == 32 || ScalarSize == 64) && - "Unsupported floating point scalar size"); - if (ScalarSize == 32) - Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat()); - else - Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble()); + if (ScalarSize == 32) { + Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); + } else { + assert(ScalarSize == 64 && "Unsupported floating point scalar size"); + Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); + } } else Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); ConstantVec.push_back(Const); @@ -6409,18 +6702,16 @@ static bool isUseOfShuffle(SDNode *N) { return false; } -/// Attempt to use the vbroadcast instruction to generate a splat value for the -/// following cases: -/// 1. A splat BUILD_VECTOR which uses: -/// a. A single scalar load, or a constant. -/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). -/// 2. A splat shuffle which uses a scalar_to_vector node which comes from -/// a scalar load, or a constant. +/// Attempt to use the vbroadcast instruction to generate a splat value +/// from a splat BUILD_VECTOR which uses: +/// a. A single scalar load, or a constant. +/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). /// /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. -static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. 
@@ -6479,11 +6770,13 @@ static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget // AVX have support for 32 and 64 bit broadcast for floats only. // No 64bit integer in 32bit subtarget. MVT CVT = MVT::getFloatingPointVT(SplatBitSize); - Constant *C = SplatBitSize == 32 - ? ConstantFP::get(Type::getFloatTy(*Ctx), - SplatValue.bitsToFloat()) - : ConstantFP::get(Type::getDoubleTy(*Ctx), - SplatValue.bitsToDouble()); + // Lower the splat via APFloat directly, to avoid any conversion. + Constant *C = + SplatBitSize == 32 + ? ConstantFP::get(*Ctx, + APFloat(APFloat::IEEEsingle(), SplatValue)) + : ConstantFP::get(*Ctx, + APFloat(APFloat::IEEEdouble(), SplatValue)); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; @@ -6664,6 +6957,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); + // Quit if non-constant index. if (!isa<ConstantSDNode>(ExtIdx)) return SDValue(); @@ -6694,11 +6988,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); - for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { - unsigned Idx = InsertIndices[i]; + + for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); - } return NV; } @@ -6711,7 +7004,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (!In.isUndef()) - Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; } SDLoc dl(Op); MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); @@ -6754,7 +7047,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { if (!isa<ConstantSDNode>(In)) NonConstIdx.push_back(idx); else { - Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; HasConstElts = true; } if (SplatIdx < 0) @@ -6765,9 +7058,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) - return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), - DAG.getConstant(1, dl, VT), - DAG.getConstant(0, dl, VT)); + return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; @@ -7347,7 +7640,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, (VT == MVT::v8i32 && Subtarget.hasInt256())) return Op; - return getOnesVector(VT, Subtarget, DAG, DL); + return getOnesVector(VT, DAG, DL); } return SDValue(); @@ -7373,7 +7666,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; - if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG)) + if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) return BitOp; @@ -7418,7 +7711,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, 
SelectionDAG &DAG) const { // a constant pool load than it is to do a movd + shuffle. if (ExtVT == MVT::i64 && !Subtarget.is64Bit() && (!IsAllConstants || Idx == 0)) { - if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { + if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); MVT VecVT = MVT::v4i32; @@ -7523,7 +7816,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // See if we can use a vector load to get all of the elements. if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) { SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); - if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false)) + if (SDValue LD = + EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; } @@ -7561,17 +7855,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, - DAG, Subtarget, *this)) + DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, - DAG, Subtarget, *this)) + DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) - if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) + if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. @@ -7647,24 +7941,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } // Next, we iteratively mix elements, e.g. for v4f32: - // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> - // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> - // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> - unsigned EltStride = NumElems >> 1; - while (EltStride != 0) { - for (unsigned i = 0; i < EltStride; ++i) { - // If Ops[i+EltStride] is undef and this is the first round of mixing, - // then it is safe to just drop this shuffle: V[i] is already in the - // right place, the one element (since it's the first round) being - // inserted as undef can be dropped. This isn't safe for successive - // rounds because they will permute elements within both vectors. - if (Ops[i+EltStride].isUndef() && - EltStride == NumElems/2) - continue; - - Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]); - } - EltStride >>= 1; + // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0> + // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2> + // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> + for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { + // Generate scaled UNPCKL shuffle mask. + SmallVector<int, 16> Mask; + for(unsigned i = 0; i != Scale; ++i) + Mask.push_back(i); + for (unsigned i = 0; i != Scale; ++i) + Mask.push_back(NumElems+i); + Mask.append(NumElems - Mask.size(), SM_SentinelUndef); + + for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) + Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } return Ops[0]; } @@ -7699,6 +7989,60 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } +// Return true if all the operands of the given CONCAT_VECTORS node are zeros +// except for the first one. 
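The rewritten merge loop in LowerBUILD_VECTOR above no longer hard-codes unpcklps/unpcklpd; it builds one scaled unpack-low mask per round and lets the generic shuffle lowering pick the instruction. Generating that mask, as a standalone sketch that assumes NumElems is a power of two and Scale < NumElems:

#include <vector>

constexpr int SM_SentinelUndef = -1;

// Scale elements from the first operand, then Scale from the second; the
// remaining lanes are don't-care and get filled in by later rounds.
static std::vector<int> scaledUnpackLoMask(unsigned NumElems, unsigned Scale) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back((int)i);                  // low Scale elts of op0
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back((int)(NumElems + i));     // low Scale elts of op1
  Mask.resize(NumElems, SM_SentinelUndef);   // upper lanes are undef
  return Mask;
}

// For v4f32: Scale=1 gives {0, 4, u, u} (an unpcklps), Scale=2 gives
// {0, 1, 4, 5} (an unpcklpd on the intermediate results).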
(CONCAT_VECTORS Op, 0, 0,...,0) +static bool isExpandWithZeros(const SDValue &Op) { + assert(Op.getOpcode() == ISD::CONCAT_VECTORS && + "Expand with zeros only possible in CONCAT_VECTORS nodes!"); + + for (unsigned i = 1; i < Op.getNumOperands(); i++) + if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) + return false; + + return true; +} + +// Returns true if the given node is a type promotion (by concatenating i1 +// zeros) of the result of a node that already zeros all upper bits of +// k-register. +static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { + unsigned Opc = Op.getOpcode(); + + assert(Opc == ISD::CONCAT_VECTORS && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Unexpected node to check for type promotion!"); + + // As long as we are concatenating zeros to the upper part of a previous node + // result, climb up the tree until a node with different opcode is + // encountered + while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { + if (Opc == ISD::INSERT_SUBVECTOR) { + if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && + Op.getConstantOperandVal(2) == 0) + Op = Op.getOperand(1); + else + return SDValue(); + } else { // Opc == ISD::CONCAT_VECTORS + if (isExpandWithZeros(Op)) + Op = Op.getOperand(0); + else + return SDValue(); + } + Opc = Op.getOpcode(); + } + + // Check if the first inserted node zeroes the upper bits, or an 'and' result + // of a node that zeros the upper bits (its masked version). + if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || + (Op.getOpcode() == ISD::AND && + (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || + isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { + return Op; + } + + return SDValue(); +} + static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG & DAG) { @@ -7709,6 +8053,17 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); + // If this node promotes - by concatenating zeroes - the type of the result + // of a node with instruction that zeroes all upper (irrelevant) bits of the + // output register, mark it as legal and catch the pattern in instruction + // selection to avoid emitting extra insturctions (for zeroing upper bits). + if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) { + SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64); + SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted, + ZeroC); + } + SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { // Specialize the cases when all, or all but one, of the operands are undef. 
@@ -7767,7 +8122,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); if (V1.isUndef()) - V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); if (IsZeroV1) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); @@ -7849,7 +8204,7 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef<int> Mask, SmallVectorImpl<int> &RepeatedMask) { - int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); + auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { @@ -7956,7 +8311,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } -} + } return true; } @@ -7986,6 +8341,41 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, return true; } +// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle +// mask. +static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask, + const APInt &Zeroable) { + int NumElts = Mask.size(); + assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); + + SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef); + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); + TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); + } + return TargetMask; +} + +// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd +// instructions. +static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { + if (VT != MVT::v8i32 && VT != MVT::v8f32) + return false; + + SmallVector<int, 8> Unpcklwd; + createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, + /* Unary = */ false); + SmallVector<int, 8> Unpckhwd; + createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, + /* Unary = */ false); + bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || + isTargetShuffleEquivalent(Mask, Unpckhwd)); + return IsUnpackwdMask; +} + /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -8009,7 +8399,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { return Imm; } -static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } @@ -8022,9 +8412,9 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); +static APInt computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + APInt Zeroable(Mask.size(), 0); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); @@ -8039,7 +8429,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, int M = Mask[i]; // Handle the easy cases. 
if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; + Zeroable.setBit(i); continue; } @@ -8057,17 +8447,19 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef() || X86::isZeroNode(Op)) - Zeroable[i] = true; + Zeroable.setBit(i); else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { APInt Val = Cst->getAPIntValue(); - Val = Val.lshr((M % Scale) * ScalarSizeInBits); + Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); - Zeroable[i] = (Val == 0); + if (Val == 0) + Zeroable.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); - Val = Val.lshr((M % Scale) * ScalarSizeInBits); + Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); - Zeroable[i] = (Val == 0); + if (Val == 0) + Zeroable.setBit(i); } continue; } @@ -8081,7 +8473,8 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue Op = V.getOperand((M * Scale) + j); AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); } - Zeroable[i] = AllZeroable; + if (AllZeroable) + Zeroable.setBit(i); continue; } } @@ -8096,19 +8489,20 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. -static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, - ArrayRef<int> Mask,const EVT &VectorType, +static bool isNonZeroElementsInOrder(const APInt &Zeroable, + ArrayRef<int> Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. - for (int i = 0, e = Zeroable.size(); i < e; i++) { + for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. - if (Mask[i] == -1) + assert(Mask[i] >= -1 && "Out of bound mask element!"); + if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element - if (NextElement == -1) { + if (NextElement < 0) { NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } @@ -8124,7 +8518,7 @@ static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -8179,19 +8573,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); -// Function convertBitVectorToUnsigned - The function gets SmallBitVector -// as argument and convert him to unsigned. 
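computeZeroableShuffleElements now builds an APInt, which later code can complement directly (for example the VEXPAND mask below). A standalone sketch of the basic per-lane rule it applies, using a plain uint64_t in place of APInt; the helper name is illustrative:

#include <cstdint>
#include <cstdio>
#include <vector>

// Bit i is set if shuffle lane i is provably zero: the mask element is undef
// (< 0), or it reads from an operand that is an all-zeros vector.
static uint64_t zeroableLanes(const std::vector<int> &Mask,
                              bool V1IsZero, bool V2IsZero) {
  const int Size = (int)Mask.size();
  uint64_t Zeroable = 0;
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0 || (M < Size && V1IsZero) || (M >= Size && V2IsZero))
      Zeroable |= 1ull << i;
  }
  return Zeroable;
}

int main() {
  // v4i32 shuffle of (X, zeroinitializer) with mask <0, 4, -1, 2>:
  // lane 1 reads the zero vector and lane 2 is undef, so zeroable = 0b0110.
  std::printf("0x%llx\n",
              (unsigned long long)zeroableLanes({0, 4, -1, 2}, false, true));
  return 0;
}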
-// The output of the function is not(zeroable) -static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) { - unsigned convertBit = 0; - for (int i = 0, e = Zeroable.size(); i < e; i++) - convertBit |= !(Zeroable[i]) << i; - return convertBit; -} - // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, - const SmallBitVector &Zeroable, + const APInt &Zeroable, ArrayRef<int> Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8199,7 +8583,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); - unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable); + unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); @@ -8210,9 +8594,94 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; - return DAG.getNode(ISD::VSELECT, DL, VT, VMask, - DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), - ZeroVector); + return DAG.getSelect(DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); +} + +static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, + unsigned &UnpackOpcode, bool IsUnary, + ArrayRef<int> TargetMask, SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + int NumElts = VT.getVectorNumElements(); + + bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; + for (int i = 0; i != NumElts; i += 2) { + int M1 = TargetMask[i + 0]; + int M2 = TargetMask[i + 1]; + Undef1 &= (SM_SentinelUndef == M1); + Undef2 &= (SM_SentinelUndef == M2); + Zero1 &= isUndefOrZero(M1); + Zero2 &= isUndefOrZero(M2); + } + assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && + "Zeroable shuffle detected"); + + // Attempt to match the target mask against the unpack lo/hi mask patterns. + SmallVector<int, 64> Unpckl, Unpckh; + createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); + if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { + UnpackOpcode = X86ISD::UNPCKL; + V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); + V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); + return true; + } + + createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); + if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { + UnpackOpcode = X86ISD::UNPCKH; + V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); + V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); + return true; + } + + // If an unary shuffle, attempt to match as an unpack lo/hi with zero. + if (IsUnary && (Zero1 || Zero2)) { + // Don't bother if we can blend instead. + if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && + isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) + return false; + + bool MatchLo = true, MatchHi = true; + for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { + int M = TargetMask[i]; + + // Ignore if the input is known to be zero or the index is undef. + if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || + (M == SM_SentinelUndef)) + continue; + + MatchLo &= (M == Unpckl[i]); + MatchHi &= (M == Unpckh[i]); + } + + if (MatchLo || MatchHi) { + UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + V2 = Zero2 ? 
getZeroVector(VT, Subtarget, DAG, DL) : V1; + V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; + return true; + } + } + + // If a binary shuffle, commute and try again. + if (!IsUnary) { + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { + UnpackOpcode = X86ISD::UNPCKL; + std::swap(V1, V2); + return true; + } + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { + UnpackOpcode = X86ISD::UNPCKH; + std::swap(V1, V2); + return true; + } + } + + return false; } // X86 has dedicated unpack instructions that can handle specific blend @@ -8248,13 +8717,12 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "Floating point types are not supported"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = - DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { @@ -8286,10 +8754,8 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); - int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - EltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector<SDValue, 16> MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) @@ -8307,51 +8773,81 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getNode(ISD::OR, DL, VT, V1, V2); } -/// \brief Try to emit a blend instruction for a shuffle. -/// -/// This doesn't do any checks for the availability of instructions for blending -/// these values. It relies on the availability of the X86ISD::BLENDI pattern to -/// be matched in the backend with the type given. What it does check for is -/// that the shuffle mask is a blend, or convertible into a blend with zero. 
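matchVectorShuffleWithUNPCK above compares the target mask against the canonical UNPCKL/UNPCKH patterns produced by createUnpackShuffleMask. A standalone sketch of those patterns for a single 128-bit lane (wider types repeat the pattern per lane, which is omitted here); the helper is illustrative, not the in-tree one:

#include <cstdio>
#include <vector>

// Interleave masks for one 128-bit lane holding NumElts elements.
//   Lo: { 0, N, 1, N+1, ... }      Hi: { N/2, N+N/2, N/2+1, ... }
// For a unary unpack both sources are the same vector, so the indices that
// would point into the second source stay within the first one.
static std::vector<int> unpackMask(int NumElts, bool Lo, bool Unary) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);                         // from V1
    Mask.push_back(Base + i + (Unary ? 0 : NumElts)); // from V2 (or V1 again)
  }
  return Mask;
}

int main() {
  // v4i32 binary unpacks: UNPCKL = 0 4 1 5, UNPCKH = 2 6 3 7.
  for (bool Lo : {true, false}) {
    for (int M : unpackMask(4, Lo, /*Unary=*/false))
      std::printf("%d ", M);
    std::printf("\n");
  }
  return 0;
}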
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Original, - const SmallBitVector &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - SmallVector<int, 8> Mask(Original.begin(), Original.end()); - bool ForceV1Zero = false, ForceV2Zero = false; +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget &Subtarget, + SelectionDAG &DAG); + +static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, + MutableArrayRef<int> TargetMask, + bool &ForceV1Zero, bool &ForceV2Zero, + uint64_t &BlendMask) { + bool V1IsZeroOrUndef = + V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZeroOrUndef = + V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); + + BlendMask = 0; + ForceV1Zero = false, ForceV2Zero = false; + assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. - unsigned BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - if (M < 0) + for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { + int M = TargetMask[i]; + if (M == SM_SentinelUndef) continue; if (M == i) continue; if (M == i + Size) { - BlendMask |= 1u << i; + BlendMask |= 1ull << i; continue; } - if (Zeroable[i]) { - if (V1IsZero) { + if (M == SM_SentinelZero) { + if (V1IsZeroOrUndef) { ForceV1Zero = true; - Mask[i] = i; + TargetMask[i] = i; continue; } - if (V2IsZero) { + if (V2IsZeroOrUndef) { ForceV2Zero = true; - BlendMask |= 1u << i; - Mask[i] = i + Size; + BlendMask |= 1ull << i; + TargetMask[i] = i + Size; continue; } } - return SDValue(); // Shuffled input! + return false; } + return true; +} + +uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { + uint64_t ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1ull << i)) + ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); + return ScaledMask; +} + +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is a blend, or convertible into a blend with zero. +static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Original, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable); + + uint64_t BlendMask = 0; + bool ForceV1Zero = false, ForceV2Zero = false; + if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, + BlendMask)) + return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. 
   if (ForceV1Zero)
@@ -8359,15 +8855,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
   if (ForceV2Zero)
     V2 = getZeroVector(VT, Subtarget, DAG, DL);
 
-  auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
-    unsigned ScaledMask = 0;
-    for (int i = 0; i != Size; ++i)
-      if (BlendMask & (1u << i))
-        for (int j = 0; j != Scale; ++j)
-          ScaledMask |= 1u << (i * Scale + j);
-    return ScaledMask;
-  };
-
   switch (VT.SimpleTy) {
   case MVT::v2f64:
   case MVT::v4f32:
@@ -8387,7 +8874,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     if (Subtarget.hasAVX2()) {
       // Scale the blend by the number of 32-bit dwords per element.
       int Scale =  VT.getScalarSizeInBits() / 32;
-      BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
       V1 = DAG.getBitcast(BlendVT, V1);
       V2 = DAG.getBitcast(BlendVT, V2);
@@ -8400,7 +8887,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     // For integer shuffles we need to expand the mask and cast the inputs to
     // v8i16s prior to blending.
     int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
     V1 = DAG.getBitcast(MVT::v8i16, V1);
     V2 = DAG.getBitcast(MVT::v8i16, V2);
     return DAG.getBitcast(VT,
@@ -8417,7 +8904,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
       BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
-          BlendMask |= 1u << i;
+          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, DL, MVT::i8));
     }
@@ -8428,6 +8915,13 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
            "256-bit byte-blends require AVX2 support!");
 
+    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
+      MVT IntegerType =
+          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+    }
+
     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
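The lambda removed above now lives on as the file-scope scaleVectorShuffleBlendMask, using a 64-bit mask so AVX-512 element counts fit. A standalone sketch of both steps (deriving per-element blend bits from a shuffle mask, then widening them when the blend is emitted at a finer granularity); the helper names here are illustrative, not the in-tree ones:

#include <cstdint>
#include <cstdio>
#include <vector>

// Bit i selects V2 for lane i (as in BLENDI/VPBLENDD immediates).
// Returns false if the mask is not a pure blend (some lane changes position).
static bool blendMaskFromShuffle(const std::vector<int> &Mask,
                                 uint64_t &BlendMask) {
  const int Size = (int)Mask.size();
  BlendMask = 0;
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0 || M == i)
      continue;               // undef, or the element stays in V1
    if (M == i + Size) {
      BlendMask |= 1ull << i; // element comes from the same lane of V2
      continue;
    }
    return false;             // shuffled input, not a blend
  }
  return true;
}

// Replicate each blend bit Scale times, e.g. scaling the v4i64 blend 0b1010
// by 2 gives the v8i32 blend 0b11001100.
static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  uint64_t BM;
  // The v4i64 shuffle <0, 5, 2, 7> of (V1, V2) is a blend selecting V2 in
  // lanes 1 and 3.
  if (blendMaskFromShuffle({0, 5, 2, 7}, BM))
    std::printf("blend %#llx scaled %#llx\n", (unsigned long long)BM,
                (unsigned long long)scaleBlendMask(BM, 4, 2));
  return 0;
}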
if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) @@ -8462,10 +8956,21 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, - DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); + VT, + DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), + V1, V2)); + } + case MVT::v16f32: + case MVT::v8f64: + case MVT::v8i64: + case MVT::v16i32: + case MVT::v32i16: + case MVT::v64i8: { + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); + return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } - default: llvm_unreachable("Not a supported integer vector type!"); } @@ -8503,7 +9008,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } -/// \brief Generic routine to decompose a shuffle and blend into indepndent +/// \brief Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined @@ -8757,7 +9262,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef<int> Mask, int MaskOffset, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; @@ -8819,7 +9324,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -8852,132 +9357,145 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getBitcast(VT, V); } -/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. -static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, - SelectionDAG &DAG) { +// EXTRQ: Extract Len elements from lower half of source, starting at Idx. +// Remainder of lower half result is zero and upper half is all undef. +static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); - assert(!Zeroable.all() && "Fully zeroable shuffle mask"); + assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) - return SDValue(); + return false; - // EXTRQ: Extract Len elements from lower half of source, starting at Idx. - // Remainder of lower half result is zero and upper half is all undef. - auto LowerAsEXTRQ = [&]() { - // Determine the extraction length from the part of the - // lower half that isn't zeroable. 
- int Len = HalfSize; - for (; Len > 0; --Len) - if (!Zeroable[Len - 1]) - break; - assert(Len > 0 && "Zeroable shuffle mask"); + // Determine the extraction length from the part of the + // lower half that isn't zeroable. + int Len = HalfSize; + for (; Len > 0; --Len) + if (!Zeroable[Len - 1]) + break; + assert(Len > 0 && "Zeroable shuffle mask"); - // Attempt to match first Len sequential elements from the lower half. - SDValue Src; - int Idx = -1; - for (int i = 0; i != Len; ++i) { - int M = Mask[i]; - if (M < 0) - continue; - SDValue &V = (M < Size ? V1 : V2); - M = M % Size; + // Attempt to match first Len sequential elements from the lower half. + SDValue Src; + int Idx = -1; + for (int i = 0; i != Len; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + SDValue &V = (M < Size ? V1 : V2); + M = M % Size; - // The extracted elements must start at a valid index and all mask - // elements must be in the lower half. - if (i > M || M >= HalfSize) - return SDValue(); + // The extracted elements must start at a valid index and all mask + // elements must be in the lower half. + if (i > M || M >= HalfSize) + return false; - if (Idx < 0 || (Src == V && Idx == (M - i))) { - Src = V; - Idx = M - i; - continue; - } - return SDValue(); + if (Idx < 0 || (Src == V && Idx == (M - i))) { + Src = V; + Idx = M - i; + continue; } + return false; + } - if (Idx < 0) - return SDValue(); + if (!Src || Idx < 0) + return false; - assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); - int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; - int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; - return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); - }; + assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); + BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + V1 = Src; + return true; +} + +// INSERTQ: Extract lowest Len elements from lower half of second source and +// insert over first source, starting at Idx. +// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } +static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx) { + int Size = Mask.size(); + int HalfSize = Size / 2; + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); - if (SDValue ExtrQ = LowerAsEXTRQ()) - return ExtrQ; + // Upper half must be undefined. + if (!isUndefInRange(Mask, HalfSize, HalfSize)) + return false; + + for (int Idx = 0; Idx != HalfSize; ++Idx) { + SDValue Base; - // INSERTQ: Extract lowest Len elements from lower half of second source and - // insert over first source, starting at Idx. - // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } - auto LowerAsInsertQ = [&]() { - for (int Idx = 0; Idx != HalfSize; ++Idx) { - SDValue Base; + // Attempt to match first source from mask before insertion point. + if (isUndefInRange(Mask, 0, Idx)) { + /* EMPTY */ + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + Base = V1; + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + Base = V2; + } else { + continue; + } - // Attempt to match first source from mask before insertion point. - if (isUndefInRange(Mask, 0, Idx)) { + // Extend the extraction length looking to match both the insertion of + // the second source and the remaining elements of the first. 
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { + SDValue Insert; + int Len = Hi - Idx; + + // Match insertion. + if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { + Insert = V1; + } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { + Insert = V2; + } else { + continue; + } + + // Match the remaining elements of the lower half. + if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ - } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + } else if ((!Base || (Base == V1)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; - } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + } else if ((!Base || (Base == V2)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, + Size + Hi)) { Base = V2; } else { continue; } - // Extend the extraction length looking to match both the insertion of - // the second source and the remaining elements of the first. - for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { - SDValue Insert; - int Len = Hi - Idx; - - // Match insertion. - if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { - Insert = V1; - } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { - Insert = V2; - } else { - continue; - } - - // Match the remaining elements of the lower half. - if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { - /* EMPTY */ - } else if ((!Base || (Base == V1)) && - isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { - Base = V1; - } else if ((!Base || (Base == V2)) && - isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, - Size + Hi)) { - Base = V2; - } else { - continue; - } - - // We may not have a base (first source) - this can safely be undefined. - if (!Base) - Base = DAG.getUNDEF(VT); - - int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; - int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; - return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); - } + BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + V1 = Base; + V2 = Insert; + return true; } + } - return SDValue(); - }; + return false; +} - if (SDValue InsertQ = LowerAsInsertQ()) - return InsertQ; +/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. +static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + SelectionDAG &DAG) { + uint64_t BitLen, BitIdx; + if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) + return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + + if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) + return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), + V2 ? V2 : DAG.getUNDEF(VT), + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); return SDValue(); } @@ -8987,7 +9505,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and -/// begin and can start from an offseted element index in the input; to +/// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. 
All extended elements must be from /// the same lane. @@ -9027,21 +9545,14 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget.hasSSE41()) { - // Not worth offseting 128-bit vectors if scale == 2, a pattern using + // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - - // For 256-bit vectors, we only need the lower (128-bit) input half. - // For 512-bit vectors, we only need the lower input half or quarter. - if (VT.getSizeInBits() > 128) - InputV = extractSubVector(InputV, 0, DAG, DL, - std::max(128, (int)VT.getSizeInBits() / Scale)); - - InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV); + InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -9158,7 +9669,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; @@ -9314,7 +9825,7 @@ static bool isShuffleFoldableLoad(SDValue V) { /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); @@ -9469,7 +9980,6 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. -/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, @@ -9582,17 +10092,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); - - // Make sure the newly-created LOAD is in the same position as Ld in - // terms of dependency. We create a TokenFactor for Ld and V, - // and update uses of Ld's output chain to use the TokenFactor. - if (Ld->hasAnyUseOfValue(1)) { - SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - SDValue(Ld, 1), SDValue(V.getNode(), 1)); - DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); - DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), - SDValue(V.getNode(), 1)); - } + DAG.makeEquivalentMemoryOrdering(Ld, V); } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. 
return SDValue(); @@ -9612,7 +10112,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, if (((BroadcastIdx * EltSize) % 128) != 0) return SDValue(); - MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize); + // The shuffle input might have been a bitcast we looked through; look at + // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll + // later bitcast it to BroadcastVT. + MVT SrcVT = V.getSimpleValueType(); + assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && + "Unexpected vector element size"); + assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) && + "Unexpected vector size"); + + MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize); V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V, DAG.getIntPtrConstant(BroadcastIdx, DL)); } @@ -9642,6 +10151,12 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); } + // We only support broadcasting from 128-bit vectors to minimize the + // number of patterns we need to deal with in isel. So extract down to + // 128-bits. + if (SrcVT.getSizeInBits() > 128) + V = extract128BitVector(V, 0, DAG, DL); + return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } @@ -9653,7 +10168,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // elements are zeroable. static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); @@ -9742,7 +10257,7 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); @@ -9877,7 +10392,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -9959,7 +10474,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10178,7 +10693,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10261,7 +10776,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. 
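The broadcast path above gives up unless the source element starts a 128-bit lane, because it extracts that lane and broadcasts its element 0, and the VBROADCAST patterns are only matched from 128-bit sources. A quick standalone check of which indices qualify (illustrative only):

#include <cstdio>

// An element can feed the 128-bit-source broadcast path only if its bit
// offset is a multiple of 128, i.e. it is element 0 of some 128-bit lane.
static bool broadcastableIndex(unsigned EltSizeInBits, unsigned Idx) {
  return (Idx * EltSizeInBits) % 128 == 0;
}

int main() {
  // For v8f32 (32-bit elements) only indices 0 and 4 start a lane.
  for (unsigned i = 0; i != 8; ++i)
    std::printf("idx %u: %s\n", i, broadcastableIndex(32, i) ? "ok" : "no");
  return 0;
}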
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10353,7 +10868,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build - // up the inputs, bypassing domain shift penalties that we would encur if we + // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); @@ -10384,18 +10899,16 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); - assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); + assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef<int> LoMask = Mask.slice(0, 4); MutableArrayRef<int> HiMask = Mask.slice(4, 4); SmallVector<int, 4> LoInputs; - std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), - [](int M) { return M >= 0; }); + copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector<int, 4> HiInputs; - std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), - [](int M) { return M >= 0; }); + copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = @@ -10530,9 +11043,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); - V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, - MVT::v8i16, V, - getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); + V = DAG.getNode( + FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, + MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M >= 0 && M == FixIdx) @@ -10574,7 +11088,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); - else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) + if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from @@ -10830,7 +11344,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( /// blend if only one input is used. 
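The PSHUFLW/PSHUFHW fix-ups above funnel through getV4X86ShuffleImm, which packs a 4-lane mask into the familiar 2-bits-per-lane immediate. A standalone sketch of that encoding (the helper here is illustrative; undef handling in the real routine may differ):

#include <cassert>
#include <cstdio>

// Pack a 4-element shuffle mask into an 8-bit PSHUFD/SHUFPS-style immediate:
// lane i occupies bits [2*i+1 : 2*i]. Undef lanes (-1) may encode as anything;
// 0 is used here.
static unsigned shuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    assert(Mask[i] < 4 && "only in-lane indices fit the immediate");
    Imm |= (Mask[i] < 0 ? 0u : (unsigned)Mask[i]) << (2 * i);
  }
  return Imm;
}

int main() {
  const int Identity[4] = {0, 1, 2, 3};
  const int Reverse[4] = {3, 2, 1, 0};
  std::printf("identity=0x%02X reverse=0x%02X\n", shuffleImm8(Identity),
              shuffleImm8(Reverse)); // identity=0xE4, reverse=0x1B
  return 0;
}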
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse, + const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SDValue V1Mask[16]; SDValue V2Mask[16]; @@ -10891,7 +11405,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11075,7 +11589,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11132,14 +11646,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector<int, 4> LoInputs; - std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), - [](int M) { return M >= 0 && M < 8; }); + copy_if(Mask, std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector<int, 4> HiInputs; - std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), - [](int M) { return M >= 8; }); + copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); @@ -11193,7 +11706,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && - "Conflicting entrties in the original shuffle!"); + "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, @@ -11365,7 +11878,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { @@ -11621,7 +12134,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, /// \brief Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector<int, 4> WidenedMask; @@ -11647,18 +12160,22 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { - // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding. + // With AVX2, use VPERMQ/VPERMPD to allow memory folding. 
if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); - MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, - DAG.getIntPtrConstant(0, DL)); - SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - OnlyUsesV1 ? V1 : V2, - DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, + // this will likely become vinsertf128 which can't fold a 256-bit memop. + if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0, DL)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + OnlyUsesV1 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } } } @@ -12091,7 +12608,7 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, unsigned &ShuffleImm, ArrayRef<int> Mask) { int NumElts = VT.getVectorNumElements(); - assert(VT.getScalarType() == MVT::f64 && + assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); @@ -12127,6 +12644,9 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { + assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& + "Unexpected data type for VSHUFPD"); + unsigned Immediate = 0; if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); @@ -12153,7 +12673,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12250,7 +12770,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12338,7 +12858,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12414,6 +12934,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V1, V2, DAG, Subtarget)) return V; + // For non-AVX512 if the Mask is of 16bit elements in lane then try to split + // since after split we get a more efficient code using vpunpcklwd and + // vpunpckhwd instrs than vblend. 
+ if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) + if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG)) + return V; + // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) @@ -12429,7 +12957,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12445,6 +12973,15 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; + // For non-AVX512 if the Mask is of 16bit elements in lane then try to split + // since after split we get a more efficient code than vblend by using + // vpunpcklwd and vpunpckhwd instrs. + if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && + !Subtarget.hasAVX512()) + if (SDValue V = + lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; @@ -12533,7 +13070,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12619,7 +13156,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12692,7 +13229,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// together based on the available instructions. static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we @@ -12844,7 +13381,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12891,12 +13428,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. 
-static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, +static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12925,6 +13466,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return Unpck; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } @@ -12938,7 +13483,7 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12994,12 +13539,16 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13062,12 +13611,15 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V1, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13109,12 +13661,16 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } } + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13159,6 +13715,10 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } @@ -13170,7 +13730,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// together based on the available instructions. 
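The 512-bit blends added above are emitted as masked selects: the blend bits become a scalar constant whose width is the element count rounded up to at least 8, the smallest k-register granularity, and getVectorMaskingNode turns it into a select between the two inputs. A standalone sketch of the width choice and of what such a masked blend computes (arrays stand in for vector registers; the names are illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Width in bits of the scalar blend-mask constant: at least 8, since the
// narrowest mask-register operations work on 8 bits.
static unsigned maskConstantBits(unsigned NumElts) {
  return std::max(NumElts, 8u);
}

// Apply a blend the way a masked vector move does: bit i set selects B[i],
// bit i clear keeps A[i].
static void applyBlend(const double *A, const double *B, double *Out,
                       unsigned NumElts, uint64_t Mask) {
  for (unsigned i = 0; i != NumElts; ++i)
    Out[i] = (Mask & (1ull << i)) ? B[i] : A[i];
}

int main() {
  std::printf("v8f64 mask width: %u bits\n", maskConstantBits(8));   // 8
  std::printf("v16f32 mask width: %u bits\n", maskConstantBits(16)); // 16
  std::printf("v4i64 mask width: %u bits\n", maskConstantBits(4));   // 8

  double A[4] = {0, 1, 2, 3}, B[4] = {10, 11, 12, 13}, Out[4];
  applyBlend(A, B, Out, 4, 0b1010);
  for (double D : Out)
    std::printf("%g ", D); // 0 11 2 13
  std::printf("\n");
  return 0;
}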
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && @@ -13251,7 +13811,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (ISD::isBuildVectorAllZeros(V1.getNode())) V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V1.getNode())) - V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + V1 = getOnesVector(ExtVT, DAG, DL); else V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); @@ -13260,7 +13820,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, else if (ISD::isBuildVectorAllZeros(V2.getNode())) V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V2.getNode())) - V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + V2 = getOnesVector(ExtVT, DAG, DL); else V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); @@ -13392,8 +13952,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - if (Zeroable.all()) + APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); // Try to collapse shuffles into using a vector type with fewer elements but @@ -13474,6 +14034,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); + // If this VSELECT has a vector if i1 as a mask, it will be directly matched + // with patterns on the mask registers on AVX-512. + if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) + return Op; + // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) @@ -13483,10 +14048,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget.hasSSE41()) return SDValue(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition + // into an i1 condition so that we can use the mask-based 512-bit blend + // instructions. + if (VT.getSizeInBits() == 512) { + SDValue Cond = Op.getOperand(0); + // The vNi1 condition case should be handled above as it can be trivially + // lowered. + assert(Cond.getValueType().getScalarSizeInBits() == + VT.getScalarSizeInBits() && + "Should have a size-matched integer condition!"); + // Build a mask by testing the condition against itself (tests for zero). + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond); + // Now return a new VSELECT using the mask. + return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2)); + } + // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. - switch (Op.getSimpleValueType().SimpleTy) { + switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. 
return Op; @@ -13564,15 +14149,18 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); - assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, - // extend vector to VR512 + // extend vector to VR512/128 if (!isa<ConstantSDNode>(Idx)) { - MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); - SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); + unsigned NumElts = VecVT.getVectorNumElements(); + // Extending v8i1/v16i1 to 512-bit get better performance on KNL + // than extending to 128/256bit. + unsigned VecSize = (NumElts <= 4 ? 128 : 512); + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtVT.getVectorElementType(), Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); @@ -13590,12 +14178,12 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const } unsigned MaxSift = VecVT.getVectorNumElements() - 1; if (MaxSift - IdxVal) - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(MaxSift, dl, MVT::i8)); - return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, - DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec, + DAG.getIntPtrConstant(0, dl)); } SDValue @@ -13606,28 +14194,40 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); - if (Op.getSimpleValueType() == MVT::i1) + if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG); if (!isa<ConstantSDNode>(Idx)) { - if (VecVT.is512BitVector() || - (VecVT.is256BitVector() && Subtarget.hasInt256() && - VecVT.getScalarSizeInBits() == 32)) { - - MVT MaskEltVT = - MVT::getIntegerVT(VecVT.getScalarSizeInBits()); - MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / - MaskEltVT.getSizeInBits()); + // Its more profitable to go through memory (1 cycles throughput) + // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) + // IACA tool was used to get performance estimation + // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) + // + // example : extractelement <16 x i8> %a, i32 %i + // + // Block Throughput: 3.00 Cycles + // Throughput Bottleneck: Port5 + // + // | Num Of | Ports pressure in cycles | | + // | Uops | 0 - DV | 5 | 6 | 7 | | + // --------------------------------------------- + // | 1 | | 1.0 | | | CP | vmovd xmm1, edi + // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 + // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 + // Total Num Of Uops: 4 + // + // + // Block Throughput: 1.00 Cycles + // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 + // + // | | Ports pressure in cycles | | + // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | + // --------------------------------------------------------- + // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 + // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] + // |1 | 
|0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] + // Total Num Of Uops: 4 - Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, - getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, - DAG.getConstant(0, dl, PtrVT)); - SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, - DAG.getConstant(0, dl, PtrVT)); - } return SDValue(); } @@ -13675,7 +14275,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; - // TODO: handle v16i8. + // TODO: We only extract a single element from v16i8, we can probably afford + // to be more aggressive here before using the default approach of spilling to + // stack. + if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { + // Extract either the lowest i32 or any i16, and extract the sub-byte. + int DWordIdx = IdxVal / 4; + if (DWordIdx == 0) { + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), + DAG.getIntPtrConstant(DWordIdx, dl)); + int ShiftVal = (IdxVal % 4) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, + DAG.getConstant(ShiftVal, dl, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + int WordIdx = IdxVal / 2; + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + DAG.getBitcast(MVT::v8i16, Vec), + DAG.getIntPtrConstant(WordIdx, dl)); + int ShiftVal = (IdxVal % 2) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, + DAG.getConstant(ShiftVal, dl, MVT::i16)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) @@ -13734,31 +14360,35 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if(Vec.isUndef()) { if (IdxVal) - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return EltInVec; } - // Insertion of one bit into first or last position - // can be done with two SHIFTs + OR. + // Insertion of one bit into first position if (IdxVal == 0 ) { - // EltInVec already at correct index and other bits are 0. + // Clean top bits of vector. + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, + DAG.getConstant(NumElems - 1, dl, MVT::i8)); + EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec, + DAG.getConstant(NumElems - 1, dl, MVT::i8)); // Clean the first bit in source vector. - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } + // Insertion of one bit into last position if (IdxVal == NumElems -1) { // Move the bit to the last position inside the vector. - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Clean the last bit in the source vector. 
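// A standalone scalar sketch of what the KSHIFTL/KSHIFTR sequences above
// compute on a 16-lane mask register, modelled here as a plain 16-bit word.
// Helper names are illustrative, not LLVM APIs.
#include <cstdint>
static unsigned kextract_bit(uint16_t m, unsigned idx) {
  uint16_t t = (uint16_t)(m << (15 - idx)); // KSHIFTL: move bit idx to the MSB
  return t >> 15;                           // KSHIFTR: move the MSB down to bit 0
}
static uint16_t kinsert_last(uint16_t m, unsigned bit) {
  uint16_t e = (uint16_t)(bit << 15);               // KSHIFTL: bit into the last lane
  uint16_t v = (uint16_t)((uint16_t)(m << 1) >> 1); // KSHIFTL+KSHIFTR: clear that lane of m
  return (uint16_t)(v | e);                         // OR the two together
}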
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); @@ -13790,17 +14420,20 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, auto *N2C = cast<ConstantSDNode>(N2); unsigned IdxVal = N2C->getZExtValue(); - // If we are clearing out a element, we do this more efficiently with a - // blend shuffle than a costly integer insertion. - // TODO: would other rematerializable values (e.g. allbits) benefit as well? - // TODO: pre-SSE41 targets will tend to use bit masking - this could still - // be beneficial if we are inserting several zeros and can combine the masks. - if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) { - SmallVector<int, 8> ClearMask; + bool IsZeroElt = X86::isZeroNode(N1); + bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); + + // If we are inserting a element, see if we can do this more efficiently with + // a blend shuffle with a rematerializable vector than a costly integer + // insertion. + if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && + 16 <= EltVT.getSizeInBits()) { + SmallVector<int, 8> BlendMask; for (unsigned i = 0; i != NumElts; ++i) - ClearMask.push_back(i == IdxVal ? i + NumElts : i); - SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask); + BlendMask.push_back(i == IdxVal ? i + NumElts : i); + SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) + : DAG.getConstant(-1, dl, VT); + return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } // If the vector is wider than 128 bits, extract the 128-bit subvector, insert @@ -13837,25 +14470,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - if (Subtarget.hasSSE41()) { - if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { - unsigned Opc; - if (VT == MVT::v8i16) { - Opc = X86ISD::PINSRW; - } else { - assert(VT == MVT::v16i8); - Opc = X86ISD::PINSRB; - } - - // Transform it so it match pinsr{b,w} which expects a GR32 as its second - // argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(IdxVal, dl); - return DAG.getNode(Opc, dl, VT, N0, N1, N2); + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. SSE41 required for pinsrb. + if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { + unsigned Opc; + if (VT == MVT::v8i16) { + assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW"); + Opc = X86ISD::PINSRW; + } else { + assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector"); + assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB"); + Opc = X86ISD::PINSRB; } + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } + + if (Subtarget.hasSSE41()) { if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. 
The DAG Combiner may combine an extract_elt index into @@ -13885,36 +14520,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } - if (EltVT == MVT::i32 || EltVT == MVT::i64) { - // PINSR* works with constant index. + // PINSR* works with constant index. + if (EltVT == MVT::i32 || EltVT == MVT::i64) return Op; - } } - if (EltVT == MVT::i8) - return SDValue(); - - if (EltVT.getSizeInBits() == 16) { - // Transform it so it match pinsrw which expects a 16-bit value in a GR32 - // as its second argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(IdxVal, dl); - return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); - } return SDValue(); } -static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); + // It's always cheaper to replace a xor+movd with xorps and simplifies further + // combines. + if (X86::isZeroNode(Op.getOperand(0))) + return getZeroVector(OpVT, Subtarget, DAG, dl); + // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. - unsigned SizeFactor = OpVT.getSizeInBits()/128; + unsigned SizeFactor = OpVT.getSizeInBits() / 128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); @@ -13923,9 +14551,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // Insert the 128-bit vector. return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } + assert(OpVT.is128BitVector() && "Expected an SSE type!"); + + // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. + if (OpVT == MVT::v4i32) + return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); - assert(OpVT.is128BitVector() && "Expected an SSE type!"); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } @@ -13943,24 +14575,33 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); MVT ResVT = Op.getSimpleValueType(); + // When v1i1 is legal a scalarization of a vselect with a vXi1 Cond + // would result with: v1i1 = extract_subvector(vXi1, idx). + // Lower these into extract_vector_elt which is already selectable. + if (ResVT == MVT::v1i1) { + assert(Subtarget.hasAVX512() && + "Boolean EXTRACT_SUBVECTOR requires AVX512"); + + MVT EltVT = ResVT.getVectorElementType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT LegalVT = + (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT(); + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx); + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res); + } + assert((In.getSimpleValueType().is256BitVector() || In.getSimpleValueType().is512BitVector()) && "Can only extract from 256-bit or 512-bit vectors"); - if (ResVT.is128BitVector()) - return extract128BitVector(In, IdxVal, DAG, dl); - if (ResVT.is256BitVector()) - return extract256BitVector(In, IdxVal, DAG, dl); - - llvm_unreachable("Unimplemented!"); -} + // If the input is a buildvector just emit a smaller one. 
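// Scalar sketch of the v16i8 EXTRACT_VECTOR_ELT path a few hunks above: grab
// the 32-bit (or 16-bit) chunk that contains the byte, shift the byte down,
// then truncate. The vector is modelled as raw bytes with x86's little-endian
// lane layout; names are illustrative, not LLVM APIs.
#include <cstdint>
#include <cstring>
static uint8_t extract_byte(const uint8_t bytes[16], unsigned idx) {
  uint32_t dword;
  std::memcpy(&dword, bytes + (idx / 4) * 4, 4); // EXTRACT_VECTOR_ELT of the v4i32 view
  return (uint8_t)(dword >> ((idx % 4) * 8));    // SRL by (idx % 4) * 8, then TRUNCATE
}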
+ unsigned ElemsPerChunk = ResVT.getVectorNumElements(); + if (In.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getBuildVector( + ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); -static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) { - for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) - if (llvm::all_of(ValidUsers, - [&I](SDValue V) { return V.getNode() != *I; })) - return false; - return true; + // Everything else is legal. + return Op; } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a @@ -13968,83 +14609,9 @@ static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) { // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX"); - - SDLoc dl(Op); - SDValue Vec = Op.getOperand(0); - SDValue SubVec = Op.getOperand(1); - SDValue Idx = Op.getOperand(2); - - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - MVT OpVT = Op.getSimpleValueType(); - MVT SubVecVT = SubVec.getSimpleValueType(); - - if (OpVT.getVectorElementType() == MVT::i1) - return insert1BitVector(Op, DAG, Subtarget); - - assert((OpVT.is256BitVector() || OpVT.is512BitVector()) && - "Can only insert into 256-bit or 512-bit vectors"); - - // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte - // load: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr + 16), Elts/2) - // --> load32 addr - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr + 32), Elts/2) - // --> load64 addr - // or a 16-byte or 32-byte broadcast: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr), Elts/2) - // --> X86SubVBroadcast(load16 addr) - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr), Elts/2) - // --> X86SubVBroadcast(load32 addr) - if ((IdxVal == OpVT.getVectorNumElements() / 2) && - Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { - auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); - if (Idx2 && Idx2->getZExtValue() == 0) { - SDValue SubVec2 = Vec.getOperand(1); - // If needed, look through bitcasts to get to the load. - if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { - bool Fast; - unsigned Alignment = FirstLd->getAlignment(); - unsigned AS = FirstLd->getAddressSpace(); - const X86TargetLowering *TLI = Subtarget.getTargetLowering(); - if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - OpVT, AS, Alignment, &Fast) && Fast) { - SDValue Ops[] = {SubVec2, SubVec}; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) - return Ld; - } - } - // If lower/upper loads are the same and the only users of the load, then - // lower to a VBROADCASTF128/VBROADCASTI128/etc. - if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { - if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && - areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) { - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); - } - } - // If this is subv_broadcast insert into both halves, use a larger - // subv_broadcast. 
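// Sketch of the BUILD_VECTOR shortcut above: extracting a subvector of a
// build_vector is just a smaller build_vector over a slice of its operands
// (the makeArrayRef of op_begin() + Idx). Types and names are illustrative.
#include <array>
#include <cstddef>
template <std::size_t M, std::size_t N>
static std::array<int, M> extract_subvector(const std::array<int, N> &ops,
                                            std::size_t idx) {
  std::array<int, M> res{};
  for (std::size_t i = 0; i != M; ++i)
    res[i] = ops[idx + i];                 // slice [idx, idx + M) of the operands
  return res;
}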
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, - SubVec.getOperand(0)); - } - } - } + assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1); - if (SubVecVT.is128BitVector()) - return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); - - if (SubVecVT.is256BitVector()) - return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); - - llvm_unreachable("Unimplemented!"); + return insert1BitVector(Op, DAG, Subtarget); } // Returns the appropriate wrapper opcode for a global reference. @@ -14062,7 +14629,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const { } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as -// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is +// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected @@ -14417,7 +14984,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), @@ -14438,7 +15005,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget.isTargetWindowsItanium() || Subtarget.isTargetWindowsGNU()) { // Just use the implicit TLS architecture - // Need to generate someting similar to: + // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) @@ -15040,8 +15607,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); - SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, - Zero, Four); + SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. @@ -15313,7 +15879,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); - SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); + SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero); if (VT == ExtVT) return SelectedVal; return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal); @@ -15489,32 +16055,21 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // word to byte only under BWI if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8 return DAG.getNode(X86ISD::VTRUNC, DL, VT, - DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } - // Truncate with PACKSS if we are truncating a vector comparison result. 
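// Scalar sketch of why PACKSS can implement the truncate when every input
// lane is known to be all-sign-bits (ComputeNumSignBits equals the lane
// width, i.e. each lane is 0 or -1): signed saturation leaves 0 and -1
// unchanged, so packing equals truncation for such lanes. packss_lane models
// a single i16 -> i8 PACKSSWB lane; names are illustrative.
#include <cstdint>
static int8_t packss_lane(int16_t x) {
  if (x > INT8_MAX) return INT8_MAX;       // signed saturation, high clamp
  if (x < INT8_MIN) return INT8_MIN;       // signed saturation, low clamp
  return (int8_t)x;                        // 0 and -1 fall through unchanged
}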
- // TODO: We should be able to support other operations as long as we - // we are saturating+packing zero/all bits only. - auto IsPackableComparison = [](SDValue V) { - unsigned Opcode = V.getOpcode(); - return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ || - Opcode == X86ISD::CMPP); - }; - - if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS && - all_of(In->ops(), IsPackableComparison))) { + // Truncate with PACKSS if we are truncating a vector zero/all-bits result. + if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget)) return V; - } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); - In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), - ShufMask); + In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } @@ -15530,30 +16085,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { - // On AVX2, v8i32 -> v8i16 becomed PSHUFB. + // On AVX2, v8i32 -> v8i16 becomes PSHUFB. if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); - SmallVector<SDValue,32> pshufbMask; - for (unsigned i = 0; i < 2; ++i) { - pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8)); - for (unsigned j = 0; j < 8; ++j) - pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); - } - SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask); - In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); + // The PSHUFB mask: + static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1, + 16, 17, 20, 21, 24, 25, 28, 29, + -1, -1, -1, -1, -1, -1, -1, -1 }; + In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); - static const int ShufMask[] = {0, 2, -1, -1}; - In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), - ShufMask); + static const int ShufMask2[] = {0, 2, -1, -1}; + In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); @@ -15572,9 +16117,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; - SDValue Undef = DAG.getUNDEF(MVT::v16i8); - OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); + OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); @@ -15598,17 +16142,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG 
&DAG) const { // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; - SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), - DAG.getUNDEF(NVT), MaskVec); + In = DAG.getBitcast(NVT, In); + SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } -SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; - MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { @@ -15616,8 +16157,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SDValue Src = Op.getOperand(0); SDLoc dl(Op); if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { - return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, - dl, VT, + return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); } @@ -15701,7 +16241,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { unsigned EltBits = EltVT.getSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = - IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); + IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = EltVT == MVT::f64 ? APFloat::IEEEdouble() : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); @@ -15764,9 +16304,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // The mask constants are automatically splatted for vector types. unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue SignMask = DAG.getConstantFP( - APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT); + APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); SDValue MagMask = DAG.getConstantFP( - APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT); + APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). if (IsFakeVector) @@ -15891,7 +16431,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); - // If more than one full vectors are evaluated, OR them first before PTEST. + // If more than one full vector is evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. @@ -15900,8 +16440,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, - VecIns.back(), VecIns.back()); + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// \brief return true if \c Op has a use that doesn't just read flags. 
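// Scalar sketch of the mask-based FCOPYSIGN lowering earlier in this hunk:
// keep the magnitude bits of the first operand, the sign bit of the second,
// and OR them together. memcpy stands in for the bitcasts; names are
// illustrative, not LLVM APIs.
#include <cstdint>
#include <cstring>
static float copysign_sketch(float mag, float sgn) {
  uint32_t m, s;
  std::memcpy(&m, &mag, 4);
  std::memcpy(&s, &sgn, 4);
  uint32_t sign_mask = 0x80000000u;                // APInt::getSignMask(32)
  uint32_t r = (m & ~sign_mask) | (s & sign_mask); // clear sign of mag, keep sign of sgn, OR
  float out;
  std::memcpy(&out, &r, 4);
  return out;
}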
@@ -15970,11 +16509,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::ADD: case ISD::SUB: case ISD::MUL: - case ISD::SHL: { - const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode()); - if (BinNode->Flags.hasNoSignedWrap()) + case ISD::SHL: + if (Op.getNode()->getFlags().hasNoSignedWrap()) break; - } + LLVM_FALLTHROUGH; default: NeedOF = true; break; @@ -16366,7 +16904,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, } /// If we have at least two divisions that use the same divisor, convert to -/// multplication by a reciprocal. This may need to be adjusted for a given +/// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the @@ -16432,9 +16970,9 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { - APInt Zeros, Ones; - DAG.computeKnownBits(Op0, Zeros, Ones); - if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) + KnownBits Known; + DAG.computeKnownBits(Op0, Known); + if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) return SDValue(); } LHS = Op1; @@ -16682,7 +17220,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); - ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get(); bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); @@ -16709,18 +17247,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is // available. SDValue Cmp; - unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); + unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); if (SSECC == 8) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; - if (SetCCOpcode == ISD::SETUEQ) { + if (Cond == ISD::SETUEQ) { CC0 = 3; // UNORD CC1 = 0; // EQ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) : static_cast<unsigned>(ISD::OR); } else { - assert(SetCCOpcode == ISD::SETONE); + assert(Cond == ISD::SETONE); CC0 = 7; // ORD CC1 = 4; // NEQ CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) : @@ -16767,7 +17305,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // 2. The original operand type has been promoted to a 256-bit vector. // // Note that condition 2. only applies for AVX targets. - SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode); + SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond); return DAG.getZExtOrTrunc(NewOp, dl, VT); } @@ -16807,7 +17345,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. 
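// Scalar sketch of the SETUEQ/SETONE splitting shown earlier in this hunk:
// neither predicate maps to a single SSE compare immediate, so each is built
// from two compares that are then ORed (UNORD, EQ) or ANDed (ORD, NEQ).
// Names are illustrative.
#include <cmath>
static bool cmp_ueq(float a, float b) {           // SETUEQ
  bool unord = std::isnan(a) || std::isnan(b);    // CMPP imm 3 (UNORD)
  bool eq    = (a == b);                          // CMPP imm 0 (EQ)
  return unord || eq;                             // combined with (F)OR
}
static bool cmp_one(float a, float b) {           // SETONE
  bool ord = !std::isnan(a) && !std::isnan(b);    // CMPP imm 7 (ORD)
  bool neq = !(a == b);                           // CMPP imm 4 (NEQ)
  return ord && neq;                              // combined with (F)AND
}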
unsigned CmpMode = 0; - switch (SetCCOpcode) { + switch (Cond) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; @@ -16822,60 +17360,55 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } // Are we comparing unsigned or signed integers? - unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) - ? X86ISD::VPCOMU : X86ISD::VPCOM; + unsigned Opc = + ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CmpMode, dl, MVT::i8)); } - // We are handling one of the integer comparisons here. Since SSE only has + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. - unsigned Opc; - bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; - bool Subus = false; - - switch (SetCCOpcode) { - default: llvm_unreachable("Unexpected SETCC condition"); - case ISD::SETNE: Invert = true; - case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; - case ISD::SETLT: Swap = true; - case ISD::SETGT: Opc = X86ISD::PCMPGT; break; - case ISD::SETGE: Swap = true; - case ISD::SETLE: Opc = X86ISD::PCMPGT; - Invert = true; break; - case ISD::SETULT: Swap = true; - case ISD::SETUGT: Opc = X86ISD::PCMPGT; - FlipSigns = true; break; - case ISD::SETUGE: Swap = true; - case ISD::SETULE: Opc = X86ISD::PCMPGT; - FlipSigns = true; Invert = true; break; - } + unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ + : X86ISD::PCMPGT; + bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || + Cond == ISD::SETGE || Cond == ISD::SETUGE; + bool Invert = Cond == ISD::SETNE || + (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); + + // If both operands are known non-negative, then an unsigned compare is the + // same as a signed compare and there's no need to flip signbits. + // TODO: We could check for more general simplifications here since we're + // computing known bits. + bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && + !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); - bool hasMinMax = - (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) - || (Subtarget.hasSSE2() && (VET == MVT::i8)); - - if (hasMinMax) { - switch (SetCCOpcode) { + bool HasMinMax = + (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) || + (Subtarget.hasSSE2() && (VET == MVT::i8)); + bool MinMax = false; + if (HasMinMax) { + switch (Cond) { default: break; case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; } - if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } + if (MinMax) + Swap = Invert = FlipSigns = false; } - bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); - if (!MinMax && hasSubus) { + bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); + bool Subus = false; + if (!MinMax && HasSubus) { // As another special case, use PSUBUS[BW] when it's profitable. E.g. 
for // Op0 u<= Op1: // t = psubus Op0, Op1 // pcmpeq t, <0..0> - switch (SetCCOpcode) { + switch (Cond) { default: break; case ISD::SETULT: { // If the comparison is against a constant we can turn this into a @@ -16977,10 +17510,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); - SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, + SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT); - Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); - Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); @@ -17005,8 +17538,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); - assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) - && "SetCC type must be 8-bit or 1-bit integer"); + assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); @@ -17070,19 +17602,24 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } -SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); - assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); - assert(Carry.getOpcode() != ISD::CARRY_FALSE); + // Recreate the carry if needed. + EVT CarryVT = Carry.getValueType(); + APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); + Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), + Carry, DAG.getConstant(NegOne, DL, CarryVT)); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); - SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG); if (Op.getSimpleValueType() == MVT::i1) return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); @@ -17140,7 +17677,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SSECC != 8) { if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0, + SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); @@ -17176,7 +17713,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { MVT VCmpVT = VT == MVT::f32 ? 
MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); - SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); + SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); @@ -17188,9 +17725,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } // AVX512 fallback is to lower selects of scalar floats to masked moves. - if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) && - Subtarget.hasAVX512()) - return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2); + if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { + SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); + return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); + } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; @@ -17204,9 +17742,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, - Op1Scalar.getValueType(), - Cond, Op1Scalar, Op2Scalar); + SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, + Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); @@ -17221,8 +17758,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1, - Cond, Op1, Op2); + SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } @@ -17241,33 +17777,33 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y + // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y + // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); - - unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + unsigned CondCode = + cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? 
Op1 : Op2; - SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && - (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { - SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); - SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, - DAG.getConstant(0, DL, - CmpOp0.getValueType()), - CmpOp0); - SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), - SDValue(Neg.getNode(), 1)); - return Res; - } + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); @@ -17283,6 +17819,43 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; + } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && + Cmp.getOperand(0).getOpcode() == ISD::AND && + isOneConstant(Cmp.getOperand(0).getOperand(1))) { + SDValue CmpOp0 = Cmp.getOperand(0); + SDValue Src1, Src2; + // true if Op2 is XOR or OR operator and one of its operands + // is equal to Op1 + // ( a , a op b) || ( b , a op b) + auto isOrXorPattern = [&]() { + if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && + (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { + Src1 = + Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); + Src2 = Op1; + return true; + } + return false; + }; + + if (isOrXorPattern()) { + SDValue Neg; + unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); + // we need mask of all zeros or ones with same size of the other + // operands. 
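// Scalar sketch of the identity used at this step: with b = (x & 1),
//   select(b == 0, y, z ^ y)  ==  ((-b) & z) ^ y
//   select(b == 0, y, z | y)  ==  ((-b) & z) | y
// because -b is either all-zeros or all-ones, which is what avoids a CMOV on
// subtargets without one. Helper names are illustrative.
#include <cstdint>
static uint32_t select_xor(uint32_t x, uint32_t y, uint32_t z) {
  uint32_t b = x & 1u;                   // and (x, 0x1)
  uint32_t mask = 0u - b;                // -(and (x, 0x1)) : 0 or all-ones
  return (mask & z) ^ y;                 // (mask & z) ^ y  ==  b ? (z ^ y) : y
}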
+ if (CmpSz > VT.getSizeInBits()) + Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); + else if (CmpSz < VT.getSizeInBits()) + Neg = DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), + DAG.getConstant(1, DL, VT)); + else + Neg = CmpOp0; + SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Neg); // -(and (x, 0x1)) + SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z + return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y + } } } @@ -17423,17 +17996,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, // SKX processor if ((InVTElt == MVT::i1) && - (((Subtarget.hasBWI() && Subtarget.hasVLX() && - VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || - - ((Subtarget.hasBWI() && VT.is512BitVector() && - VTElt.getSizeInBits() <= 16)) || + (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) || - ((Subtarget.hasDQI() && Subtarget.hasVLX() && - VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32)))) - ((Subtarget.hasDQI() && VT.is512BitVector() && - VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); unsigned NumElts = VT.getVectorNumElements(); @@ -17441,8 +18007,8 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, if (VT.is512BitVector() && InVTElt != MVT::i1 && (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) - return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG); + return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG); } if (InVTElt != MVT::i1) @@ -17454,12 +18020,12 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SDValue V; if (Subtarget.hasDQI()) { - V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); + V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG); assert(!VT.is512BitVector() && "Unexpected vector type"); } else { - SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); + SDValue NegOne = getOnesVector(ExtVT, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); - V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); + V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero); if (ExtVT == VT) return V; } @@ -17506,11 +18072,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"); - // SSE41 targets can use the pmovsx* instructions directly. - unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? - X86ISD::VSEXT : X86ISD::VZEXT; - if (Subtarget.hasSSE41()) + // SSE41 targets can use the pmovsx* instructions directly for 128-bit results, + // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still + // need to be handled here for 256/512-bit results. + if (Subtarget.hasInt256()) { + assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); + unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? + X86ISD::VSEXT : X86ISD::VZEXT; return DAG.getNode(ExtOpc, dl, VT, In); + } // We should only get here for sign extend. 
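// Scalar sketch of the i1 sign-extension above (the non-DQI path): extending
// a mask bit is just a per-lane select between all-ones and zero. Names are
// illustrative.
#include <cstdint>
static int32_t sext_i1(bool b) {
  return b ? -1 : 0;                     // vselect(mask, NegOne, Zero)
}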
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && @@ -17595,8 +18165,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); - OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); - OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); + OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT); + OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } @@ -17674,7 +18244,8 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op, MVT VT = Op.getValueType().getSimpleVT(); unsigned NumElts = VT.getVectorNumElements(); - if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || + if ((Subtarget.hasBWI() && NumElts >= 32) || + (Subtarget.hasDQI() && NumElts < 16) || NumElts == 16) { // Load and extend - everything is legal if (NumElts < 8) { @@ -17703,7 +18274,7 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op, if (NumElts <= 8) { // A subset, assume that we have only AVX-512F - unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts; + unsigned NumBitsToLoad = 8; MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad); SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(), Ld->getBasePtr(), @@ -17911,7 +18482,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. if (Subtarget.hasSSE41()) { - SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } @@ -18243,8 +18814,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); + bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || - SplitStack; + SplitStack || EmitStackProbe; SDLoc dl(Op); // Get the inputs. @@ -18256,7 +18828,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); @@ -18469,6 +19041,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); + // Bitcast the source vector to the output type, this is mainly necessary for + // vXi8/vXi64 shifts. + if (VT != SrcOp.getSimpleValueType()) + SrcOp = DAG.getBitcast(VT, SrcOp); + // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; @@ -18485,9 +19062,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a - // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. - if (VT == SrcOp.getSimpleValueType() && - ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { + // vector of Constants or UNDEFs. 
+ if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector<SDValue, 8> Elts; unsigned NumElts = SrcOp->getNumOperands(); ConstantSDNode *ND; @@ -18578,11 +19154,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { ShAmt = ShAmt.getOperand(0); ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); - ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; @@ -18692,8 +19268,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is coming as MVT::i8 and it should be truncated -/// to MVT::i1 while lowering masking intrinsics. +/// The mask is coming as MVT::i8 and it should be transformed +/// to MVT::v1i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. @@ -18701,19 +19277,19 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (isAllOnesConstant(Mask)) - return Op; + + if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) + if (MaskConst->getZExtValue() & 0x1) + return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - // The mask should be of type MVT::i1 - SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); if (Op.getOpcode() == X86ISD::FSETCCM || Op.getOpcode() == X86ISD::FSETCCM_RND) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); - if (Op.getOpcode() == X86ISD::VFPCLASS || - Op.getOpcode() == X86ISD::VFPCLASSS) + if (Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); if (PreservedSrc.isUndef()) @@ -18762,7 +19338,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, // registration, or the .set_setframe offset. 
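// Scalar sketch of the predicated-scalar helper above: only bit 0 of the i8
// mask matters; if it is set the new result is taken, otherwise the
// pass-through value is kept (or zero when the pass-through is undef).
// Names are illustrative, not LLVM APIs.
#include <cstdint>
static float scalar_masking(uint8_t mask, float op, float passthru) {
  return (mask & 1u) ? op : passthru;    // X86select on a v1i1 mask
}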
MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( - GlobalValue::getRealLinkageName(Fn->getName())); + GlobalValue::dropLLVMManglingEscape(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); @@ -18853,6 +19429,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + if (!isRoundModeCurDirection(Rnd)) + return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, VT, Src1, Src2, Rnd), + Mask, passThru, Subtarget, DAG); + } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } @@ -19142,10 +19726,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); - SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask, + DAG.getIntPtrConstant(0, dl)); } case CMP_MASK: case CMP_MASK_CC: { @@ -19205,18 +19790,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) - Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) - Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask, + DAG.getIntPtrConstant(0, dl)); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; @@ -19264,13 +19849,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue FCmp; if (isRoundModeCurDirection(Sae)) - FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8)); + FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8)); else - FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8), Sae); - // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg" - return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp); + FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, + DAG.getConstant(CondVal, dl, MVT::i8), Sae); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp, + DAG.getIntPtrConstant(0, dl)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), @@ -19306,6 +19891,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget Src2, Src1); return DAG.getBitcast(VT, 
Res); } + case MASK_BINOP: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2); + return DAG.getBitcast(VT, Res); + } case FIXUPIMMS: case FIXUPIMMS_MASKZ: case FIXUPIMM: @@ -19347,12 +19941,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } - case CONVERT_MASK_TO_VEC: { - SDValue Mask = Op.getOperand(1); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); - SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getNode(IntrData->Opc0, dl, VT, VMask); - } case BRCST_SUBVEC_TO_VEC: { SDValue Src = Op.getOperand(1); SDValue Passthru = Op.getOperand(2); @@ -19478,6 +20066,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case Intrinsic::x86_avx512_knot_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1); + SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); + return DAG.getBitcast(MVT::i16, Res); + } + + case Intrinsic::x86_avx512_kandn_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + // Invert LHS for the not. + LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, + DAG.getConstant(1, dl, MVT::v16i1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); + SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS); + return DAG.getBitcast(MVT::i16, Res); + } + + case Intrinsic::x86_avx512_kxnor_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); + SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); + // Invert result for the not. + Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res, + DAG.getConstant(1, dl, MVT::v16i1)); + return DAG.getBitcast(MVT::i16, Res); + } + case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -19569,7 +20184,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Op1 = Op.getOperand(1); auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( - GlobalValue::getRealLinkageName(Fn->getName())); + GlobalValue::dropLLVMManglingEscape(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. @@ -19603,12 +20218,40 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget } } +static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { + SDLoc dl(Op); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. 
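// Scalar sketch of the KNOT/KANDN/KXNOR lowerings above on a 16-bit mask
// word: the "not" is realized as an XOR with an all-ones v16i1 constant.
// Names are illustrative.
#include <cstdint>
static uint16_t knot_w (uint16_t a)             { return (uint16_t)(a ^ 0xFFFFu); }
static uint16_t kandn_w(uint16_t a, uint16_t b) { return (uint16_t)((a ^ 0xFFFFu) & b); }
static uint16_t kxnor_w(uint16_t a, uint16_t b) { return (uint16_t)((a ^ b) ^ 0xFFFFu); }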
+ if (!C) + return SDValue(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + EVT MaskVT = Mask.getValueType(); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + // If source is undef or we know it won't be used, use a zero vector + // to break register dependency. + // TODO: use undef instead and let ExecutionDepsFix deal with it? + if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); + SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, dl); +} + static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); @@ -19617,7 +20260,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - if (Src.isUndef()) + // If source is undef or we know it won't be used, use a zero vector + // to break register dependency. + // TODO: use undef instead and let ExecutionDepsFix deal with it? + if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); @@ -19630,7 +20276,10 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -19649,14 +20298,16 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. 
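// Standalone sketch of the gather semantics being lowered here: each active
// lane loads from base + index[i] * scale, while inactive lanes keep the
// source value (zeroed above to break the register dependency when it is
// known to be unused). The scale has to be a compile-time constant because it
// is encoded in the instruction, hence the bail-out on a non-constant
// ScaleOp. Names are illustrative.
#include <cstdint>
#include <cstring>
static void gather_i32(int32_t dst[8], const int32_t src[8], const uint8_t *base,
                       const int32_t index[8], uint8_t mask, int scale) {
  for (int i = 0; i != 8; ++i) {
    if (mask & (1u << i))
      std::memcpy(&dst[i], base + (int64_t)index[i] * scale, 4); // masked load lane
    else
      dst[i] = src[i];                                           // keep pass-through
  }
}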
+ if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - //SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); @@ -19884,16 +20535,17 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); + const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { - if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) + switch (IntNo) { + case llvm::Intrinsic::x86_seh_ehregnode: return MarkEHRegistrationNode(Op, DAG); - if (IntNo == llvm::Intrinsic::x86_seh_ehguard) + case llvm::Intrinsic::x86_seh_ehguard: return MarkEHGuard(Op, DAG); - if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || - IntNo == llvm::Intrinsic::x86_flags_read_u64 || - IntNo == llvm::Intrinsic::x86_flags_write_u32 || - IntNo == llvm::Intrinsic::x86_flags_write_u64) { + case llvm::Intrinsic::x86_flags_read_u32: + case llvm::Intrinsic::x86_flags_read_u64: + case llvm::Intrinsic::x86_flags_write_u32: + case llvm::Intrinsic::x86_flags_write_u64: { // We need a frame pointer because this will get lowered to a PUSH/POP // sequence. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); @@ -19902,6 +20554,20 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, // during ExpandISelPseudos in EmitInstrWithCustomInserter. return SDValue(); } + case Intrinsic::x86_lwpins32: + case Intrinsic::x86_lwpins64: { + SDLoc dl(Op); + SDValue Chain = Op->getOperand(0); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + SDValue LwpIns = + DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2), + Op->getOperand(3), Op->getOperand(4)); + SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG); + SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, + LwpIns.getValue(1)); + } + } return SDValue(); } @@ -19928,6 +20594,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } + case GATHER_AVX2: { + SDValue Chain = Op.getOperand(0); + SDValue Src = Op.getOperand(2); + SDValue Base = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, + Scale, Chain, Subtarget); + } case GATHER: { //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); @@ -19953,8 +20629,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); - assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); - unsigned Opcode = (HintVal ? 
IntrData->Opc1 : IntrData->Opc0); + assert((HintVal == 2 || HintVal == 3) && + "Wrong prefetch hint in intrinsic: should be 2 or 3"); + unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); @@ -19994,8 +20671,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, } // ADC/ADCX/SBB case ADX: { - SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); - SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); + SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); + SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32); SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), DAG.getConstant(-1, dl, MVT::i8)); SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), @@ -20368,7 +21045,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // Check that ECX wasn't needed by an 'inreg' parameter. FunctionType *FTy = Func->getFunctionType(); - const AttributeSet &Attrs = Func->getAttributes(); + const AttributeList &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; @@ -20503,54 +21180,62 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } +// Split an unary integer op into 2 half sized ops. +static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + unsigned NumElems = VT.getVectorNumElements(); + unsigned SizeInBits = VT.getSizeInBits(); + + // Extract the Lo/Hi vectors + SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2); + SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2); + + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, Lo), + DAG.getNode(Op.getOpcode(), dl, NewVT, Hi)); +} + +// Decompose 256-bit ops into smaller 128-bit ops. +static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return LowerVectorIntUnary(Op, DAG); +} + +// Decompose 512-bit ops into smaller 256-bit ops. +static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType().is512BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 512-bit vector integer operation"); + return LowerVectorIntUnary(Op, DAG); +} + /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. // -// 1. i32/i64 128/256-bit vector (native support require VLX) are expended -// to 512-bit vector. -// 2. i8/i16 vector implemented using dword LZCNT vector instruction -// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, -// split the vector, perform operation on it's Lo a Hi part and -// concatenate the results. -static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { +// i8/i16 vector implemented using dword LZCNT vector instruction +// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, +// split the vector, perform operation on it's Lo a Hi part and +// concatenate the results. 
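// For a single i8 element this is easy to check by hand: with x = 0x10,
// zext32(x) = 0x00000010, lzcnt32 returns 27, and 27 - 24 = 3 == ctlz8(0x10);
// the zero extension contributes a fixed 24 (i8) or 16 (i16) extra leading
// zeros that the final 'sub' removes.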
+static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::CTLZ); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); - if (EltVT == MVT::i64 || EltVT == MVT::i32) { - // Extend to 512 bit vector. - assert((VT.is256BitVector() || VT.is128BitVector()) && - "Unsupported value type for operation"); - - MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); - SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, - DAG.getUNDEF(NewVT), - Op.getOperand(0), - DAG.getIntPtrConstant(0, dl)); - SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); - - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, - DAG.getIntPtrConstant(0, dl)); - } - assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"); - if (16 < NumElems) { - // Split vector, it's Lo and Hi parts will be handled in next iteration. - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); - MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); - - Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo); - Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); - } + // Split vector, it's Lo and Hi parts will be handled in next iteration. + if (16 < NumElems) + return LowerVectorIntUnary(Op, DAG); MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); - assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && "Unsupported value type for operation"); @@ -20637,23 +21322,17 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - SDValue Op0 = Op.getOperand(0); - if (Subtarget.hasAVX512()) - return LowerVectorCTLZ_AVX512(Op, DAG); + if (Subtarget.hasCDI()) + return LowerVectorCTLZ_AVX512CDI(Op, DAG); // Decompose 256-bit ops into smaller 128-bit ops. - if (VT.is256BitVector() && !Subtarget.hasInt256()) { - unsigned NumElems = VT.getVectorNumElements(); + if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntUnary(Op, DAG); - // Extract each 128-bit vector, perform ctlz and concat the result. - SDValue LHS = extract128BitVector(Op0, 0, DAG, DL); - SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL); - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, - DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS), - DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS)); - } + // Decompose 512-bit ops into smaller 256-bit ops. 
+ if (VT.is512BitVector() && !Subtarget.hasBWI()) + return Lower512IntUnary(Op, DAG); assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); @@ -20802,9 +21481,10 @@ static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } -static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() == MVT::i1) - return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), +static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getScalarType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), VT, Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && @@ -20812,14 +21492,11 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { return Lower256IntArith(Op, DAG); } -static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() == MVT::i1) - return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), - Op.getOperand(0), Op.getOperand(1)); +static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); - return Lower256IntArith(Op, DAG); + return Lower256IntUnary(Op, DAG); } static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { @@ -20834,7 +21511,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - if (VT == MVT::i1) + if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); // Decompose 256-bit ops into smaller 128-bit ops. @@ -20874,8 +21551,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // Extract the lo parts and sign extend to i16 SDValue ALo, BLo; if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); - BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); + ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT); + BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; @@ -20894,8 +21571,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); - BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); + AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT); + BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; @@ -21056,8 +21733,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); } - SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A); - SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B); + SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG); + SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, DAG.getConstant(8, dl, MVT::v16i16)); @@ -21073,8 +21750,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Extract the lo parts and zero/sign extend to i16. 
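  // (All of the real work happens in 16-bit lanes: for an unsigned example,
  // the bytes 200 and 37 give 200 * 37 = 7400 = 0x1CE8, and 0x1CE8 >> 8 = 0x1C
  // is the mulhu result for that byte. The signed case fits the same scheme
  // because any i8 * i8 product is representable in 16 bits.)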
SDValue ALo, BLo; if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(ExSSE41, dl, ExVT, A); - BLo = DAG.getNode(ExSSE41, dl, ExVT, B); + ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG); + BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; @@ -21093,8 +21770,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi); - BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi); + AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG); + BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; @@ -21148,8 +21825,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons MachinePointerInfo(), /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); } @@ -21157,11 +21834,15 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(getLibcallCallingConv(LC), - static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), - Callee, std::move(Args)) - .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(InChain) + .setLibCallee( + getLibcallCallingConv(LC), + static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, + std::move(Args)) + .setInRegister() + .setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); @@ -21269,15 +21950,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, if (VT.getScalarSizeInBits() < 16) return false; - if (VT.is512BitVector() && + if (VT.is512BitVector() && Subtarget.hasAVX512() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; - bool LShift = VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasInt256()); + bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || + (VT.is256BitVector() && Subtarget.hasInt256()); - bool AShift = LShift && (Subtarget.hasVLX() || - (VT != MVT::v2i64 && VT != MVT::v4i64)); + bool AShift = LShift && (Subtarget.hasAVX512() || + (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? 
AShift : LShift; } @@ -21301,7 +21982,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; - if (VT.is512BitVector() || Subtarget.hasVLX()) + if (Subtarget.hasAVX512()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); @@ -21324,6 +22005,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); SDValue Ex = DAG.getBitcast(ExVT, R); + // ashr(R, 63) === cmp_slt(R, 0) + if (ShiftAmt == 63 && Subtarget.hasSSE42()) { + assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && + "Unsupported PCMPGT op"); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, + getZeroVector(VT, Subtarget, DAG, dl), R); + } + if (ShiftAmt >= 32) { // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. SDValue Upper = @@ -21360,8 +22049,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); // i64 SRA needs to be performed as partial shifts. - if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) && - Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP()) + if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || + (Subtarget.hasInt256() && VT == MVT::v4i64)) && + Op.getOpcode() == ISD::SRA) return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || @@ -21422,10 +22112,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. + // TODO: Replace constant extraction with getTargetConstantBitsFromNode. if (!Subtarget.is64Bit() && !Subtarget.hasXOP() && (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) || (Subtarget.hasAVX512() && VT == MVT::v8i64))) { + // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant. + unsigned SubVectorScale = 1; + if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + SubVectorScale = + Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits(); + Amt = Amt.getOperand(0); + } + // Peek through any splat that was introduced for i64 shift vectorization. int SplatIndex = -1; if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) @@ -21442,7 +22141,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / - VT.getVectorNumElements(); + (SubVectorScale * VT.getVectorNumElements()); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; unsigned BaseOp = (SplatIndex < 0 ? 
0 : SplatIndex * Ratio); @@ -21610,11 +22309,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } // i64 vector arithmetic shift can be emulated with the transform: - // M = lshr(SIGN_BIT, Amt) + // M = lshr(SIGN_MASK, Amt) // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) && Op.getOpcode() == ISD::SRA) { - SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT); SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); R = DAG.getNode(ISD::XOR, dl, VT, R, M); @@ -21816,23 +22515,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); - return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + return DAG.getSelect(dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; @@ -21954,15 +22651,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); - return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue C = DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); - return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + return DAG.getSelect(dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; @@ -22017,10 +22713,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + + if (Subtarget.hasAVX512()) { + // Attempt to rotate by immediate. + APInt UndefElts; + SmallVector<APInt, 16> EltBits; + if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) { + if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) { + return EltBits[0] == V; + })) { + unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits); + return DAG.getNode(Op, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Else, fall-back on VPROLV/VPRORV. 
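    // (Returning the unmodified node keeps the generic ISD::ROTL/ROTR around
    // so that instruction selection can match it straight to the variable
    // per-element rotate instructions; no further DAG rewriting is needed for
    // the AVX-512 case.)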
+ return Op; + } assert(VT.isVector() && "Custom lowering only for vector rotates!"); assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); - assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right. @@ -22035,7 +22752,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); - assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + assert(RotateAmt < EltSizeInBits && "Rotation out of range"); return DAG.getNode(X86ISD::VPROTI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } @@ -22062,10 +22779,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. if (isOneConstant(RHS)) { - BaseOp = X86ISD::INC; - Cond = X86::COND_O; - break; - } + BaseOp = X86ISD::INC; + Cond = X86::COND_O; + break; + } BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; @@ -22077,10 +22794,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. if (isOneConstant(RHS)) { - BaseOp = X86ISD::DEC; - Cond = X86::COND_O; - break; - } + BaseOp = X86ISD::DEC; + Cond = X86::COND_O; + break; + } BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; @@ -22146,7 +22863,7 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { - auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); + auto PTy = cast<PointerType>(LI->getPointerOperandType()); return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } @@ -22202,7 +22919,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - auto SynchScope = AI->getSynchScope(); + auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); @@ -22224,7 +22941,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. - if (SynchScope == SingleThread) + if (SSID == SyncScope::SingleThread) // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; @@ -22243,7 +22960,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // Finally we can emit the atomic load. 
LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AI->getType()->getPrimitiveSizeInBits()); - Loaded->setAtomic(Order, SynchScope); + Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; @@ -22254,13 +22971,13 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); - SynchronizationScope FenceScope = static_cast<SynchronizationScope>( + SyncScope::ID FenceSSID = static_cast<SyncScope::ID>( cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && - FenceScope == CrossThread) { + FenceSSID == SyncScope::System) { if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); @@ -22470,7 +23187,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, // index into a in-register pre-computed pop count table. We then split up the // input vector in two new ones: (1) a vector with only the shifted-right // higher nibbles for each byte and (2) a vector with the lower nibbles (and - // masked out higher ones) for each byte. PSHUB is used separately with both + // masked out higher ones) for each byte. PSHUFB is used separately with both // to index the in-register table. Next, both are added and the result is a // i8 vector where each element contains the pop count for input byte. // @@ -22588,35 +23305,33 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); + // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. + if (Subtarget.hasVPOPCNTDQ()) { + if (VT == MVT::v8i16) { + Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + } + if (VT == MVT::v16i8 || VT == MVT::v16i16) { + Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + } + } + if (!Subtarget.hasSSSE3()) { // We can't use the fast LUT approach, so fall back on vectorized bitmath. assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); } - if (VT.is256BitVector() && !Subtarget.hasInt256()) { - unsigned NumElems = VT.getVectorNumElements(); - - // Extract each 128-bit vector, compute pop count and concat the result. - SDValue LHS = extract128BitVector(Op0, 0, DAG, DL); - SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL); - - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, - LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), - LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); - } - - if (VT.is512BitVector() && !Subtarget.hasBWI()) { - unsigned NumElems = VT.getVectorNumElements(); - - // Extract each 256-bit vector, compute pop count and concat the result. - SDValue LHS = extract256BitVector(Op0, 0, DAG, DL); - SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL); + // Decompose 256-bit ops into smaller 128-bit ops. 
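// (The PSHUFB-based LowerVectorCTPOPInRegLUT path above is the vector form of
// the classic 16-entry nibble lookup. A minimal scalar sketch of the same
// idea, for illustration only:
//
//   static const uint8_t PopLUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
//                                      1, 2, 2, 3, 2, 3, 3, 4};
//   uint8_t PopCnt8(uint8_t X) { return PopLUT[X & 0xF] + PopLUT[X >> 4]; }
//
// Each input byte indexes the table twice, once with its low nibble and once
// with its shifted-down high nibble, and the two lookups are added.)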
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntUnary(Op, DAG); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, - LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), - LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); - } + // Decompose 512-bit ops into smaller 256-bit ops. + if (VT.is512BitVector() && !Subtarget.hasBWI()) + return Lower512IntUnary(Op, DAG); return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } @@ -22643,20 +23358,12 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { DAG.getIntPtrConstant(0, DL)); } - MVT SVT = VT.getVectorElementType(); int NumElts = VT.getVectorNumElements(); int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; // Decompose 256-bit ops into smaller 128-bit ops. - if (VT.is256BitVector()) { - SDValue Lo = extract128BitVector(In, 0, DAG, DL); - SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL); - - MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, - DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo), - DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi)); - } + if (VT.is256BitVector()) + return Lower256IntUnary(Op, DAG); assert(VT.is128BitVector() && "Only 128-bit vector bitreverse lowering supported."); @@ -22697,14 +23404,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, "Only byte vector BITREVERSE supported"); // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. - if (VT.is256BitVector() && !Subtarget.hasInt256()) { - MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2); - SDValue Lo = extract128BitVector(In, 0, DAG, DL); - SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL); - Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo); - Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); - } + if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntUnary(Op, DAG); // Perform BITREVERSE using PSHUFB lookups. Each byte is split into // two nibbles and a PSHUFB lookup to find the bitreverse of each @@ -22824,30 +23525,33 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getNode()->getSimpleValueType(0); +static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { + SDNode *N = Op.getNode(); + MVT VT = N->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); + SDLoc DL(N); - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: llvm_unreachable("Invalid code"); - case ISD::ADDC: Opc = X86ISD::ADD; break; - case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; - case ISD::SUBC: Opc = X86ISD::SUB; break; - case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; - } + // Set the carry flag. + SDValue Carry = Op.getOperand(2); + EVT CarryVT = Carry.getValueType(); + APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); + Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), + Carry, DAG.getConstant(NegOne, DL, CarryVT)); + + unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? 
X86ISD::ADC : X86ISD::SBB; + SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0), + Op.getOperand(1), Carry.getValue(1)); + + SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG); + if (N->getValueType(1) == MVT::i1) + SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), - Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), - Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, @@ -22867,8 +23571,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, Entry.Node = Arg; Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); bool isF64 = ArgVT == MVT::f64; @@ -22880,13 +23584,13 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - Type *RetTy = isF64 - ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) - : (Type*)VectorType::get(ArgTy, 4); + Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) + : (Type *)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, RetTy, Callee, std::move(Args)); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@ -22923,8 +23627,6 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); - EVT EltVT = NVT.getVectorElementType(); - SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { @@ -22942,6 +23644,8 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); + EVT EltVT = InOp.getOperand(0).getValueType(); + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) @@ -23086,7 +23790,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, // Mask element has to be i1. MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && - "We handle 4x32, 4x64 and 2x64 vectors only in this casse"); + "We handle 4x32, 4x64 and 2x64 vectors only in this case"); MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); @@ -23142,7 +23846,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, // Mask element has to be i1. 
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && - "We handle 4x32, 4x64 and 2x64 vectors only in this casse"); + "We handle 4x32, 4x64 and 2x64 vectors only in this case"); MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); @@ -23202,7 +23906,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); - // The pass-thru value + // The pass-through value MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); Src0 = ExtendToType(Src0, NewVT, DAG); @@ -23216,6 +23920,57 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue RetOps[] = {Exract, NewGather.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } + if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) { + // There is a special case when the return type v2i32 is illegal and + // the type legalizer has extended it to v2i64. Without this conversion we end up + // with VPGATHERQQ (reading q-words from memory) instead of VPGATHERQD. + // In order to avoid this situation, we build an X86-specific gather node + // with index v2i64 and value type v4i32. + assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 && + "Unexpected type in masked gather"); + Src0 = DAG.getVectorShuffle(MVT::v4i32, dl, + DAG.getBitcast(MVT::v4i32, Src0), + DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); + // The mask should match the destination type. Extending the mask with zeroes + // is not necessary since the instruction itself reads only two values from + // memory. + Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(), + N->getMemOperand()); + + SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64, + NewGather.getValue(0), DAG); + SDValue RetOps[] = { Sext, NewGather.getValue(1) }; + return DAG.getMergeValues(RetOps, dl); + } + if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) { + // This transformation is for optimization only. + // The type legalizer extended the mask and index to a 4-element vector + // in order to match the requirements of the common gather node: the same + // vector width for index and value. The X86 gather node allows a mismatch + // of vector widths in order to select a more optimal instruction at the + // end. 
+ assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 && + "Unexpected type in masked gather"); + if (Mask.getOpcode() == ISD::CONCAT_VECTORS && + ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) && + Index.getOpcode() == ISD::CONCAT_VECTORS && + Index.getOperand(1).isUndef()) { + Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false); + Index = Index.getOperand(0); + } else + return Op; + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(), + N->getMemOperand()); + + SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) }; + return DAG.getMergeValues(RetOps, dl); + + } return Op; } @@ -23284,7 +24039,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); - case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); @@ -23303,7 +24058,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); case ISD::FABS: @@ -23311,7 +24066,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); - case ISD::SETCCE: return LowerSETCCE(Op, DAG); + case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -23344,7 +24099,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); - case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); + case ISD::ROTL: + case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -23356,16 +24112,15 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); - case ISD::ADD: return LowerADD(Op, DAG); - case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::ADDCARRY: + case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); + case ISD::ADD: + case ISD::SUB: return LowerADD_SUB(Op, DAG); case ISD::SMAX: case ISD::SMIN: case 
ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); + case ISD::ABS: return LowerABS(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); @@ -23768,7 +24523,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; - case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; @@ -23779,16 +24533,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; - case X86ISD::ABS: return "X86ISD::ABS"; case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMAXS: return "X86ISD::FMAXS"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; + case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMINS: return "X86ISD::FMINS"; case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; + case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; - case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; + case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::FRCPS: return "X86ISD::FRCPS"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; @@ -23827,7 +24584,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; - case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; @@ -23876,6 +24632,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; + case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; + case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; @@ -23976,9 +24734,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; @@ -24012,6 +24774,8 @@ const char 
*X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; + case X86ISD::LWPINS: return "X86ISD::LWPINS"; + case X86ISD::MGATHER: return "X86ISD::MGATHER"; } return nullptr; } @@ -24236,16 +25000,22 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, // xbegin sinkMBB // // mainMBB: - // eax = -1 + // s0 = -1 + // + // fallBB: + // eax = # XABORT_DEF + // s1 = eax // // sinkMBB: - // v = eax + // v = phi(s0/mainBB, s1/fallBB) MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); + MF->insert(I, fallMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. @@ -24253,25 +25023,40 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned DstReg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned fallDstReg = MRI.createVirtualRegister(RC); + // thisMBB: - // xbegin sinkMBB + // xbegin fallMBB // # fallthrough to mainMBB - // # abortion to sinkMBB - BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); + // # abortion to fallMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); thisMBB->addSuccessor(mainMBB); - thisMBB->addSuccessor(sinkMBB); + thisMBB->addSuccessor(fallMBB); // mainMBB: - // EAX = -1 - BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); + // mainDstReg := -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); + BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); mainMBB->addSuccessor(sinkMBB); - // sinkMBB: - // EAX is live into the sinkMBB - sinkMBB->addLiveIn(X86::EAX); - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY), - MI.getOperand(0).getReg()) + // fallMBB: + // ; pseudo instruction to model hardware's definition from XABORT + // EAX := XABORT_DEF + // fallDstReg := EAX + BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); + BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) .addReg(X86::EAX); + fallMBB->addSuccessor(sinkMBB); + + // sinkMBB: + // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(fallDstReg).addMBB(fallMBB); MI.eraseFromParent(); return sinkMBB; @@ -24302,7 +25087,7 @@ static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) - MIB.addOperand(Op); + MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -24338,7 +25123,7 @@ static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) - MIB.addOperand(Op); + MIB.add(Op); } if (MI.hasOneMemOperand()) 
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -24398,7 +25183,7 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) @@ -24413,6 +25198,26 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, return BB; } +static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget &Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + // Address into RAX/EAX + unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.add(MI->getOperand(i)); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + + + MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { @@ -24536,12 +25341,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, UseFPOffset ? 4 : 0) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) @@ -24561,12 +25366,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 16) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 16) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); @@ -24588,13 +25393,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, UseFPOffset ? 4 : 0) - .addOperand(Segment) - .addReg(NextOffsetReg) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .add(Segment) + .addReg(NextOffsetReg) + .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) @@ -24608,12 +25413,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Load the overflow_area address into a register. 
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 8) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 8) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. @@ -24644,13 +25449,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 8) - .addOperand(Segment) - .addReg(NextAddrReg) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 8) + .add(Segment) + .addReg(NextAddrReg) + .setMemRefs(MMOBegin, MMOEnd); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { @@ -24867,7 +25672,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, // // (CMOV (CMOV F, T, cc1), T, cc2) // - // to two successives branches. For that, we look for another CMOV as the + // to two successive branches. For that, we look for another CMOV as the // following instruction. // // Without this, we would add a PHI between the two jumps, which ends up @@ -25123,12 +25928,12 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, // instruction using the same address operands. if (Operand.isReg()) Operand.setIsKill(false); - MIB.addOperand(Operand); + MIB.add(Operand); } MachineInstr *FOpMI = MIB; MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; @@ -25331,7 +26136,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = - BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0); + BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. 
@@ -25414,6 +26219,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -25430,7 +26236,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - assert(RC->hasType(MVT::i32) && "Invalid destination!"); + assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); + (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); @@ -25508,7 +26315,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else - MIB.addOperand(MI.getOperand(MemOpndSlot + i)); + MIB.add(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); @@ -25591,7 +26398,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, // Reload FP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); @@ -25599,7 +26406,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), LabelOffset); else - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP @@ -25608,7 +26415,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); else - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Jump @@ -25625,7 +26432,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -25644,8 +26451,6 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, VR = MRI->createVirtualRegister(TRC); Op = (PVT == MVT::i64) ? 
X86::MOV64mr : X86::MOV32mr; - /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */ - if (Subtarget.is64Bit()) BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) @@ -25655,7 +26460,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, .addReg(0); else BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) - .addReg(0) /* XII->getGlobalBaseReg(MF) */ + .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) @@ -25677,7 +26482,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); int FI = MFI.getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're @@ -25749,9 +26554,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MF->getOrCreateJumpTableInfo(getJumpTableEncoding()); unsigned MJTI = JTI->createJumpTableIndex(LPadList); - const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); - const X86RegisterInfo &RI = XII->getRegisterInfo(); - + const X86RegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. if (RI.hasBasePointer(*MF)) { @@ -25799,8 +26602,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // N.B. the order the invoke BBs are processed in doesn't matter here. SmallVector<MachineBasicBlock *, 64> MBBLPads; - const MCPhysReg *SavedRegs = - Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF); + const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. @@ -26033,6 +26835,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); case X86::MONITORX: return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); + + // Cache line zero + case X86::CLZERO: + return emitClzero(&MI, BB, Subtarget); + // PKU feature case X86::WRPKRU: return emitWRPKRU(MI, BB, Subtarget); @@ -26068,6 +26875,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); + case TargetOpcode::PATCHABLE_EVENT_CALL: + // Do nothing here, handle in xray instrumentation pass. 
+ return BB; + case X86::LCMPXCHG8B: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B @@ -26135,12 +26946,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, //===----------------------------------------------------------------------===// void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - unsigned BitWidth = KnownZero.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || @@ -26148,7 +26960,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. + Known.resetAll(); switch (Opc) { default: break; case X86ISD::ADD: @@ -26167,44 +26979,101 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; LLVM_FALLTHROUGH; case X86ISD::SETCC: - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + Known.Zero.setBitsFrom(1); break; case X86ISD::MOVMSK: { unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); + Known.Zero.setBitsFrom(NumLoBits); + break; + } + case X86ISD::VSHLI: + case X86ISD::VSRLI: { + if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { + Known.setAllZero(); + break; + } + + DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1); + unsigned ShAmt = ShiftImm->getZExtValue(); + if (Opc == X86ISD::VSHLI) { + Known.Zero <<= ShAmt; + Known.One <<= ShAmt; + // Low bits are known zero. + Known.Zero.setLowBits(ShAmt); + } else { + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); + // High bits are known zero. + Known.Zero.setHighBits(ShAmt); + } + } break; } case X86ISD::VZEXT: { SDValue N0 = Op.getOperand(0); - unsigned NumElts = Op.getValueType().getVectorNumElements(); - unsigned InNumElts = N0.getValueType().getVectorNumElements(); - unsigned InBitWidth = N0.getValueType().getScalarSizeInBits(); - - KnownZero = KnownOne = APInt(InBitWidth, 0); - APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts); - DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1); - KnownOne = KnownOne.zext(BitWidth); - KnownZero = KnownZero.zext(BitWidth); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth); + unsigned NumElts = VT.getVectorNumElements(); + + EVT SrcVT = N0.getValueType(); + unsigned InNumElts = SrcVT.getVectorNumElements(); + unsigned InBitWidth = SrcVT.getScalarSizeInBits(); + assert(InNumElts >= NumElts && "Illegal VZEXT input"); + + Known = KnownBits(InBitWidth); + APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts); + DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1); + Known = Known.zext(BitWidth); + Known.Zero.setBitsFrom(InBitWidth); break; } } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, const SelectionDAG &DAG, unsigned Depth) const { - // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 
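A quick standalone sanity check of the known-bits bookkeeping used in the VSHLI/VSRLI cases above: the known zeros and ones move with the shift, and the vacated bits become known zero. Illustrative only, with plain integer masks instead of the KnownBits class; the helper name is made up.

#include <cassert>
#include <cstdint>

// Known-zero / known-one masks for a single 16-bit lane.
struct Known16 {
  uint16_t Zero = 0; // bits known to be 0
  uint16_t One = 0;  // bits known to be 1
};

// Propagate known bits across a constant left shift, the way the VSHLI case
// above does per lane: known bits shift along with the value and the vacated
// low bits are known zero.
Known16 shiftLeftKnown(Known16 K, unsigned ShAmt) {
  Known16 R;
  R.Zero = uint16_t(K.Zero << ShAmt);
  R.One = uint16_t(K.One << ShAmt);
  R.Zero |= uint16_t((1u << ShAmt) - 1);
  return R;
}

int main() {
  Known16 K;
  K.Zero = 0xFF00; // high byte known zero
  K.One = 0x0001;  // lowest bit known one
  Known16 S = shiftLeftKnown(K, 4);
  assert(S.Zero == 0xF00F && S.One == 0x0010);
  return 0;
}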
- if (Op.getOpcode() == X86ISD::SETCC_CARRY) - return Op.getScalarValueSizeInBits(); + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + unsigned Depth) const { + unsigned VTBits = Op.getScalarValueSizeInBits(); + unsigned Opcode = Op.getOpcode(); + switch (Opcode) { + case X86ISD::SETCC_CARRY: + // SETCC_CARRY sets the dest to ~0 for true or 0 for false. + return VTBits; - if (Op.getOpcode() == X86ISD::VSEXT) { - EVT VT = Op.getValueType(); - EVT SrcVT = Op.getOperand(0).getValueType(); - unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); - Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits(); + case X86ISD::VSEXT: { + SDValue Src = Op.getOperand(0); + unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + Tmp += VTBits - Src.getScalarValueSizeInBits(); return Tmp; } + case X86ISD::VSHLI: { + SDValue Src = Op.getOperand(0); + unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + if (ShiftVal.uge(VTBits)) + return VTBits; // Shifted all bits out --> zero. + if (ShiftVal.uge(Tmp)) + return 1; // Shifted all sign bits out --> unknown. + return Tmp - ShiftVal.getZExtValue(); + } + + case X86ISD::VSRAI: { + SDValue Src = Op.getOperand(0); + unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + ShiftVal += Tmp; + return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); + } + + case X86ISD::PCMPGT: + case X86ISD::PCMPEQ: + case X86ISD::CMPP: + case X86ISD::VPCOM: + case X86ISD::VPCOMU: + // Vector compares return zero/all-bits result values. + return VTBits; + } + // Fallback case. return 1; } @@ -26228,24 +27097,17 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, // instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). - if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && - isUndefOrEqual(Mask[0], 0) && - isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { - Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; - return true; - } - - // Match against a VZEXT instruction. - // TODO: Add 256/512-bit vector support. - if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) { + // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. + // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). 
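The VSHLI/VSRAI cases of ComputeNumSignBitsForTargetNode above adjust the sign-bit count arithmetically: a left shift by s consumes up to s redundant sign bits, while an arithmetic right shift by s adds s, saturating at the element width. A minimal standalone check of that arithmetic (illustrative only; helper names are made up):

#include <algorithm>
#include <cassert>

// Sign-bit count after a logical left shift of an element with 'Bits' bits,
// mirroring the VSHLI case: shifting everything out yields zero (all sign
// bits), shifting all known sign bits out leaves only the guaranteed one.
unsigned signBitsAfterShl(unsigned Known, unsigned ShAmt, unsigned Bits) {
  if (ShAmt >= Bits) return Bits;
  if (ShAmt >= Known) return 1;
  return Known - ShAmt;
}

// Sign-bit count after an arithmetic right shift, mirroring the VSRAI case:
// the sign bit is duplicated ShAmt more times, capped at the element width.
unsigned signBitsAfterAshr(unsigned Known, unsigned ShAmt, unsigned Bits) {
  return std::min(Bits, Known + ShAmt);
}

int main() {
  // A 16-bit lane known to have 5 redundant sign bits.
  assert(signBitsAfterShl(5, 3, 16) == 2);
  assert(signBitsAfterShl(5, 7, 16) == 1);
  assert(signBitsAfterAshr(5, 4, 16) == 9);
  assert(signBitsAfterAshr(5, 40, 16) == 16);
  return 0;
}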
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || + (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { bool Match = true; @@ -26255,19 +27117,32 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } if (Match) { - SrcVT = MaskVT; + unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); + SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize); + if (SrcVT != MaskVT) + V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); - Shuffle = X86ISD::VZEXT; + Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT) + : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); return true; } } } + // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). + if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && + isUndefOrEqual(Mask[0], 0) && + isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { + Shuffle = X86ISD::VZEXT_MOVL; + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + return true; + } + // Check if we have SSE3 which will let us use MOVDDUP etc. The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. - if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) { + if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { if (isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; @@ -26285,7 +27160,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - if (MaskVT.is256BitVector() && FloatDomain) { + if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVDDUP; @@ -26304,7 +27179,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - if (MaskVT.is512BitVector() && FloatDomain) { + if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { @@ -26343,47 +27218,78 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, + const APInt &Zeroable, + bool AllowFloatDomain, + bool AllowIntDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); + unsigned InputSizeInBits = MaskVT.getSizeInBits(); + unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; + MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); - bool ContainsZeros = false; - SmallBitVector Zeroable(NumMaskElts, false); - for (unsigned i = 0; i != NumMaskElts; ++i) { - int M = Mask[i]; - Zeroable[i] = isUndefOrZero(M); - ContainsZeros |= (M == SM_SentinelZero); - } + bool ContainsZeros = + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); - // Attempt to match against byte/bit shifts. - // FIXME: Add 512-bit support. 
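The scale loop above recognizes a zero extension as a shuffle whose mask places source element i at position i*Scale and requires zero (or undef) everywhere in between. A self-contained version of that predicate, with -1 standing in for the undef sentinel and -2 for the zero sentinel; the helper name is made up:

#include <cassert>
#include <vector>

// Illustrative only: does Mask describe a zero extension by 'Scale'?
// Source element i must land at position i*Scale, and every position in
// between must be zero or undef.
bool isZExtMask(const std::vector<int> &Mask, unsigned Scale) {
  for (unsigned i = 0, e = Mask.size() / Scale; i != e; ++i) {
    int M = Mask[i * Scale];
    if (M != -1 && M != (int)i)
      return false;
    for (unsigned j = 1; j != Scale; ++j) {
      int Z = Mask[i * Scale + j];
      if (Z != -1 && Z != -2)
        return false;
    }
  }
  return true;
}

int main() {
  // A v8i16 -> v4i32 zero extension expressed as a v8i16 shuffle.
  assert(isZExtMask({0, -2, 1, -2, 2, -2, 3, -2}, 2));
  assert(!isZExtMask({0, -2, 2, -2, 1, -2, 3, -2}, 2));
  return 0;
}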
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, - MaskVT.getScalarSizeInBits(), Mask, - 0, Zeroable, Subtarget); - if (0 < ShiftAmt) { - PermuteImm = (unsigned)ShiftAmt; + // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. + if (!ContainsZeros && MaskScalarSizeInBits == 64) { + // Check for lane crossing permutes. + if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { + // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). + if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { + Shuffle = X86ISD::VPERMI; + ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); + PermuteImm = getV4X86ShuffleImm(Mask); + return true; + } + if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { + SmallVector<int, 4> RepeatedMask; + if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { + Shuffle = X86ISD::VPERMI; + ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); + PermuteImm = getV4X86ShuffleImm(RepeatedMask); + return true; + } + } + } else if (AllowFloatDomain && Subtarget.hasAVX()) { + // VPERMILPD can permute with a non-repeating shuffle. + Shuffle = X86ISD::VPERMILPI; + ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); + PermuteImm = 0; + for (int i = 0, e = Mask.size(); i != e; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); + PermuteImm |= (M & 1) << i; + } return true; } } - // Ensure we don't contain any zero elements. - if (ContainsZeros) - return false; - - assert(llvm::all_of(Mask, [&](int M) { - return SM_SentinelUndef <= M && M < (int)NumMaskElts; - }) && "Expected unary shuffle"); - - unsigned InputSizeInBits = MaskVT.getSizeInBits(); - unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size(); - MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); + // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. + // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we + // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). + if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && + !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { + // Narrow the repeated mask to create 32-bit element permutes. + SmallVector<int, 4> WordMask = RepeatedMask; + if (MaskScalarSizeInBits == 64) + scaleShuffleMask(2, RepeatedMask, WordMask); + + Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); + ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); + ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); + PermuteImm = getV4X86ShuffleImm(WordMask); + return true; + } + } - // Handle PSHUFLW/PSHUFHW repeated patterns. - if (MaskScalarSizeInBits == 16) { + // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. + if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef<int> LoMask(Mask.data() + 0, 4); @@ -26411,110 +27317,59 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } - - return false; } - return false; } - // We only support permutation of 32/64 bit elements after this. 
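The VPERMILPD path above builds its immediate with one bit per 64-bit element: bit i picks the odd or even element of the 128-bit lane that position i lives in, which is why the mask must not cross lanes. A standalone illustration with a hypothetical helper name:

#include <cassert>
#include <vector>

// Illustrative only: build a VPERMILPD-style immediate from a per-element
// f64 shuffle mask that never crosses 128-bit lanes (-1 = undef).
unsigned permilpdImm(const std::vector<int> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0, e = (unsigned)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M == -1)
      continue;
    assert(M / 2 == (int)i / 2 && "mask must stay within its 128-bit lane");
    Imm |= (M & 1) << i;
  }
  return Imm;
}

int main() {
  // v4f64 {1,0,3,2}: swap the two elements inside each 128-bit lane.
  assert(permilpdImm({1, 0, 3, 2}) == 0b0101);
  return 0;
}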
- if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64) - return false; - - // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we - // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). - if (FloatDomain && !Subtarget.hasAVX()) - return false; - - // Pre-AVX2 we must use float shuffles on 256-bit vectors. - if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) - FloatDomain = true; - - // Check for lane crossing permutes. - if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { - // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). - if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) { - Shuffle = X86ISD::VPERMI; - ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); - PermuteImm = getV4X86ShuffleImm(Mask); + // Attempt to match against byte/bit shifts. + // FIXME: Add 512-bit support. + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, + MaskScalarSizeInBits, Mask, + 0, Zeroable, Subtarget); + if (0 < ShiftAmt) { + PermuteImm = (unsigned)ShiftAmt; return true; } - if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) { - SmallVector<int, 4> RepeatedMask; - if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { - Shuffle = X86ISD::VPERMI; - ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); - PermuteImm = getV4X86ShuffleImm(RepeatedMask); - return true; - } - } - return false; - } - - // VPERMILPD can permute with a non-repeating shuffle. - if (FloatDomain && MaskScalarSizeInBits == 64) { - Shuffle = X86ISD::VPERMILPI; - ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); - PermuteImm = 0; - for (int i = 0, e = Mask.size(); i != e; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) - continue; - assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); - PermuteImm |= (M & 1) << i; - } - return true; } - // We need a repeating shuffle mask for VPERMILPS/PSHUFD. - SmallVector<int, 4> RepeatedMask; - if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) - return false; - - // Narrow the repeated mask for 32-bit element permutes. - SmallVector<int, 4> WordMask = RepeatedMask; - if (MaskScalarSizeInBits == 64) - scaleShuffleMask(2, RepeatedMask, WordMask); - - Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); - ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32); - ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); - PermuteImm = getV4X86ShuffleImm(WordMask); - return true; + return false; } // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. 
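matchVectorShuffleAsShift itself is not shown in this hunk, but the shape the relocated call above is looking for is easy to state: a mask that is some number of zeros followed by 0, 1, 2, ... can be implemented as a left byte/element shift. A rough standalone sketch of that one direction only (illustrative, not the real matcher, which also handles right shifts and bit-level shifts):

#include <cassert>
#include <vector>

// Illustrative only: if Mask is "Shift zeros, then 0, 1, 2, ...", return the
// element shift amount, else -1.  Sentinels: -1 = undef, -2 = zero.
int matchShiftLeftAmt(const std::vector<int> &Mask) {
  unsigned e = (unsigned)Mask.size();
  for (unsigned Shift = 1; Shift != e; ++Shift) {
    bool Match = true;
    for (unsigned i = 0; i != e && Match; ++i) {
      int M = Mask[i];
      if (M == -1)
        continue;
      Match = (i < Shift) ? (M == -2) : (M == (int)(i - Shift));
    }
    if (Match)
      return (int)Shift;
  }
  return -1;
}

int main() {
  assert(matchShiftLeftAmt({-2, -2, 0, 1}) == 2); // v4i32 shifted up 2 elements
  assert(matchShiftLeftAmt({0, 1, 2, 3}) == -1);  // identity, not a shift
  return 0;
}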
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, SDValue &V1, SDValue &V2, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDValue &V2, SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { - if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) { + if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVLHPS; ShuffleVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) { + if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVHLPS; ShuffleVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && - (FloatDomain || !Subtarget.hasSSE41())) { + (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; ShuffleVT = MaskVT; return true; } if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && - (FloatDomain || !Subtarget.hasSSE41())) { + (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; ShuffleVT = MaskVT; return true; @@ -26527,57 +27382,12 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { - MVT LegalVT = MaskVT; - if (LegalVT.is256BitVector() && !Subtarget.hasAVX2()) - LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); - - SmallVector<int, 64> Unpckl, Unpckh; - if (IsUnary) { - createUnpackShuffleMask(MaskVT, Unpckl, true, true); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - V2 = V1; - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - createUnpackShuffleMask(MaskVT, Unpckh, false, true); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - V2 = V1; - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } - } else { - createUnpackShuffleMask(MaskVT, Unpckl, true, false); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - createUnpackShuffleMask(MaskVT, Unpckh, false, false); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } - - ShuffleVectorSDNode::commuteMask(Unpckl); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - std::swap(V1, V2); - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - ShuffleVectorSDNode::commuteMask(Unpckh); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - std::swap(V1, V2); - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } + if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, + DAG, Subtarget)) { + ShuffleVT = MaskVT; + if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2()) + ShuffleVT = (32 == EltSizeInBits ? 
MVT::v8f32 : MVT::v4f64); + return true; } } @@ -26585,17 +27395,20 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, - SDValue &V1, SDValue &V2, - SDLoc &DL, SelectionDAG &DAG, + const APInt &Zeroable, + bool AllowFloatDomain, + bool AllowIntDomain, + SDValue &V1, SDValue &V2, SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); + unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); // Attempt to match against PALIGNR byte rotate. - if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; @@ -26606,77 +27419,69 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Attempt to combine to X86ISD::BLENDI. - if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || - (Subtarget.hasAVX() && MaskVT.is256BitVector()))) { - // Determine a type compatible with X86ISD::BLENDI. - // TODO - add 16i16 support (requires lane duplication). - MVT BlendVT = MaskVT; - if (Subtarget.hasAVX2()) { - if (BlendVT == MVT::v4i64) - BlendVT = MVT::v8i32; - else if (BlendVT == MVT::v2i64) - BlendVT = MVT::v4i32; - } else { - if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32) - BlendVT = MVT::v8i16; - else if (BlendVT == MVT::v4i64) - BlendVT = MVT::v4f64; - else if (BlendVT == MVT::v8i32) - BlendVT = MVT::v8f32; - } - - unsigned BlendSize = BlendVT.getVectorNumElements(); - unsigned MaskRatio = BlendSize / NumMaskElts; - - // Can we blend with zero? - if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, - /*Low*/ 0) && - NumMaskElts <= BlendVT.getVectorNumElements()) { - PermuteImm = 0; - for (unsigned i = 0; i != BlendSize; ++i) - if (Mask[i / MaskRatio] < 0) - PermuteImm |= 1u << i; - - V2 = getZeroVector(BlendVT, Subtarget, DAG, DL); - Shuffle = X86ISD::BLENDI; - ShuffleVT = BlendVT; - return true; - } - - // Attempt to match as a binary blend. - if (NumMaskElts <= BlendVT.getVectorNumElements()) { - bool MatchBlend = true; - for (int i = 0; i != (int)NumMaskElts; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) - continue; - else if (M == SM_SentinelZero) - MatchBlend = false; - else if ((M != i) && (M != (i + (int)NumMaskElts))) - MatchBlend = false; - } + if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || + (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || + (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { + uint64_t BlendMask = 0; + bool ForceV1Zero = false, ForceV2Zero = false; + SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end()); + if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, + BlendMask)) { + if (MaskVT == MVT::v16i16) { + // We can only use v16i16 PBLENDW if the lanes are repeated. + SmallVector<int, 8> RepeatedMask; + if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, + RepeatedMask)) { + assert(RepeatedMask.size() == 8 && + "Repeated mask size doesn't match!"); + PermuteImm = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 8) + PermuteImm |= 1 << i; + V1 = ForceV1Zero ? 
getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; + Shuffle = X86ISD::BLENDI; + ShuffleVT = MaskVT; + return true; + } + } else { + // Determine a type compatible with X86ISD::BLENDI. + ShuffleVT = MaskVT; + if (Subtarget.hasAVX2()) { + if (ShuffleVT == MVT::v4i64) + ShuffleVT = MVT::v8i32; + else if (ShuffleVT == MVT::v2i64) + ShuffleVT = MVT::v4i32; + } else { + if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) + ShuffleVT = MVT::v8i16; + else if (ShuffleVT == MVT::v4i64) + ShuffleVT = MVT::v4f64; + else if (ShuffleVT == MVT::v8i32) + ShuffleVT = MVT::v8f32; + } - if (MatchBlend) { - PermuteImm = 0; - for (unsigned i = 0; i != BlendSize; ++i) - if ((int)NumMaskElts <= Mask[i / MaskRatio]) - PermuteImm |= 1u << i; + if (!ShuffleVT.isFloatingPoint()) { + int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); + BlendMask = + scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); + ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); + ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); + } + V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; + PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; - ShuffleVT = BlendVT; return true; } } } // Attempt to combine to INSERTPS. - if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) { - SmallBitVector Zeroable(4, false); - for (unsigned i = 0; i != NumMaskElts; ++i) - if (Mask[i] < 0) - Zeroable[i] = true; - - if (Zeroable.any() && + if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && + MaskVT.is128BitVector()) { + if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; @@ -26685,22 +27490,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Attempt to combine to SHUFPD. - if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) || - (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) || - (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) { + if (AllowFloatDomain && EltSizeInBits == 64 && + ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; - ShuffleVT = MaskVT; + ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; } } // Attempt to combine to SHUFPS. - if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || - (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || - (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) { + if (AllowFloatDomain && EltSizeInBits == 32 && + ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { SmallVector<int, 4> RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { + // Match each half of the repeated mask, to determine if its just + // referencing one of the vectors, is zeroable or entirely undef. 
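The BLENDI retyping above rescales the blend mask whenever the blend is re-expressed on narrower elements. Assuming the scaling step simply repeats each select bit Scale times (the helper name is taken from the hunk, but its body is not shown here), the operation looks like this:

#include <cassert>
#include <cstdint>

// Illustrative only: when a blend of NumElts wide elements is recast on
// elements that are Scale times narrower, each select bit expands into
// Scale consecutive bits.
uint64_t scaleBlendMask(uint64_t BlendMask, unsigned NumElts, unsigned Scale) {
  uint64_t Scaled = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  // A v2i64 blend selecting element 1 from V2 (0b10), recast as a v4i32 blend.
  assert(scaleBlendMask(0b10, 2, 2) == 0b1100);
  return 0;
}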
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { int M0 = RepeatedMask[Offset]; int M1 = RepeatedMask[Offset + 1]; @@ -26732,7 +27541,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, V1 = Lo; V2 = Hi; Shuffle = X86ISD::SHUFP; - ShuffleVT = MaskVT; + ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); PermuteImm = getV4X86ShuffleImm(ShufMask); return true; } @@ -26764,7 +27573,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); SDValue V1 = peekThroughBitcasts(Inputs[0]); - SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1])); + SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) + : peekThroughBitcasts(Inputs[1])); MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); @@ -26853,6 +27663,18 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; + // Which shuffle domains are permitted? + // Permit domain crossing at higher combine depths. + bool AllowFloatDomain = FloatDomain || (Depth > 3); + bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && + (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); + + // Determine zeroable mask elements. + APInt Zeroable(NumMaskElts, 0); + for (unsigned i = 0; i != NumMaskElts; ++i) + if (isUndefOrZero(Mask[i])) + Zeroable.setBit(i); + if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load // directly if we don't shuffle the lower element and we shuffle the upper @@ -26869,8 +27691,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT)) { + if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, + V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26884,8 +27707,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, - Shuffle, ShuffleVT, PermuteImm)) { + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, + AllowIntDomain, Subtarget, Shuffle, + ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26901,8 +27725,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget, - Shuffle, ShuffleVT, UnaryShuffle)) { + if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, + V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, + UnaryShuffle)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! 
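The Zeroable value threaded through the matchers above just records which mask positions are undef or forced to zero. As a plain bitmask the computation is trivial (illustrative only, -1/-2 standing in for the undef/zero sentinels):

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative only: collect the zeroable lanes of a shuffle mask into a
// bitmask, as the Zeroable APInt above does with setBit().
uint64_t zeroableLanes(const std::vector<int> &Mask) {
  uint64_t Bits = 0;
  for (unsigned i = 0, e = (unsigned)Mask.size(); i != e; ++i)
    if (Mask[i] == -1 || Mask[i] == -2)
      Bits |= 1ull << i;
  return Bits;
}

int main() {
  assert(zeroableLanes({0, -2, 3, -1}) == 0b1010);
  return 0;
}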
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26918,8 +27743,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL, - DAG, Subtarget, Shuffle, ShuffleVT, + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, + AllowIntDomain, V1, V2, DL, DAG, + Subtarget, Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! @@ -26937,6 +27763,45 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } + // Typically from here on, we need an integer version of MaskVT. + MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); + IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); + + // Annoyingly, SSE4A instructions don't map into the above match helpers. + if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { + uint64_t BitLen, BitIdx; + if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, + Zeroable)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) + return false; // Nothing to do! + V1 = DAG.getBitcast(IntMaskVT, V1); + DCI.AddToWorklist(V1.getNode()); + Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) + return false; // Nothing to do! + V1 = DAG.getBitcast(IntMaskVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(IntMaskVT, V2); + DCI.AddToWorklist(V2.getNode()); + Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + } + // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. 
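EXTRQI takes a bit length and a bit index and extracts that field from the low 64-bit lane, zero-filling the remaining low-lane bits, which is what lets the SSE4A block above express it as a shuffle with zeroable elements. A scalar sketch of the low-lane effect, as I understand the instruction (illustrative only):

#include <cassert>
#include <cstdint>

// Illustrative only: the low-lane effect of an EXTRQI-style extraction:
// take BitLen bits starting at BitIdx and zero-extend them.
uint64_t extrq(uint64_t Src, unsigned BitLen, unsigned BitIdx) {
  uint64_t FieldMask = (BitLen == 64) ? ~0ull : ((1ull << BitLen) - 1);
  return (Src >> BitIdx) & FieldMask;
}

int main() {
  // Pull a 16-bit field that starts at bit 32.
  assert(extrq(0x1122334455667788ull, 16, 32) == 0x3344);
  return 0;
}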
if (Depth < 2) @@ -26957,9 +27822,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -26988,9 +27851,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -27015,9 +27876,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); V1 = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(V1.getNode()); @@ -27039,12 +27898,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { APInt Zero = APInt::getNullValue(MaskEltSizeInBits); APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits); - SmallBitVector UndefElts(NumMaskElts, false); + APInt UndefElts(NumMaskElts, 0); SmallVector<APInt, 64> EltBits(NumMaskElts, Zero); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } if (M == SM_SentinelZero) @@ -27076,8 +27935,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, M < 0 ? 
DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); VPermIdx.push_back(Idx); } - MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts); - SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx); + SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -27100,8 +27958,6 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned NumLanes = MaskVT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumMaskElts / NumLanes; SmallVector<int, 8> VPerm2Idx; - MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits()); - MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts); unsigned M2ZImm = 0; for (int M : Mask) { if (M == SM_SentinelUndef) { @@ -27121,7 +27977,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(MaskVT, V2); DCI.AddToWorklist(V2.getNode()); - SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true); + SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPerm2MaskOp.getNode()); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getConstant(M2ZImm, DL, MVT::i8)); @@ -27228,8 +28084,8 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, // Extract constant bits from each source op. bool OneUseConstantOp = false; - SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps); - SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps); + SmallVector<APInt, 16> UndefEltsOps(NumOps); + SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps); for (unsigned i = 0; i != NumOps; ++i) { SDValue SrcOp = Ops[i]; OneUseConstantOp |= SrcOp.hasOneUse(); @@ -27245,18 +28101,18 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, return false; // Shuffle the constant bits according to the mask. - SmallBitVector UndefElts(NumMaskElts, false); - SmallBitVector ZeroElts(NumMaskElts, false); - SmallBitVector ConstantElts(NumMaskElts, false); + APInt UndefElts(NumMaskElts, 0); + APInt ZeroElts(NumMaskElts, 0); + APInt ConstantElts(NumMaskElts, 0); SmallVector<APInt, 8> ConstantBitData(NumMaskElts, APInt::getNullValue(MaskSizeInBits)); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } else if (M == SM_SentinelZero) { - ZeroElts[i] = true; + ZeroElts.setBit(i); continue; } assert(0 <= M && M < (int)(NumMaskElts * NumOps)); @@ -27266,21 +28122,21 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; if (SrcUndefElts[SrcMaskIdx]) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } auto &SrcEltBits = RawBitsOps[SrcOpIdx]; APInt &Bits = SrcEltBits[SrcMaskIdx]; if (!Bits) { - ZeroElts[i] = true; + ZeroElts.setBit(i); continue; } - ConstantElts[i] = true; + ConstantElts.setBit(i); ConstantBitData[i] = Bits; } - assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts); + assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()); // Create the constant data. 
MVT MaskSVT; @@ -27330,6 +28186,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root, ArrayRef<int> RootMask, + ArrayRef<const SDNode*> SrcNodes, int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -27353,13 +28210,17 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. - SDValue Input0, Input1; - SmallVector<int, 16> OpMask; - if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask)) + SmallVector<int, 64> OpMask; + SmallVector<SDValue, 2> OpInputs; + if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) return false; + assert(OpInputs.size() <= 2 && "Too many shuffle inputs"); + SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); + SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue()); + // Add the inputs to the Ops list, avoiding duplicates. - SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end()); + SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); int InputIdx0 = -1, InputIdx1 = -1; for (int i = 0, e = Ops.size(); i < e; ++i) { @@ -27385,52 +28246,70 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); - int MaskWidth = std::max<int>(OpMask.size(), RootMask.size()); - int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); - int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); - assert(((RootRatio == 1 && OpRatio == 1) || - (RootRatio == 1) != (OpRatio == 1)) && + + // This function can be performance-critical, so we rely on the power-of-2 + // knowledge that we have about the mask sizes to replace div/rem ops with + // bit-masks and shifts. + assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); + unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); + unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); + + unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size()); + unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2); + unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2); + assert((RootRatio == 1 || OpRatio == 1) && "Must not have a ratio for both incoming and op masks!"); - SmallVector<int, 16> Mask; - Mask.reserve(MaskWidth); + assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); + unsigned RootRatioLog2 = countTrailingZeros(RootRatio); + unsigned OpRatioLog2 = countTrailingZeros(OpRatio); + + SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the // root mask to get us all the way to the root value arrangement. The reason // for this order is that we are recursing up the operation chain. 
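The comment above is the whole story: because every mask size on this path is a power of two, the divisions and remainders in the merging loop can be replaced with shifts and masks. A tiny standalone check of that equivalence (the sizes chosen here are just examples):

#include <cassert>

int main() {
  // e.g. a 16-element operand mask being merged under a 4-element root mask.
  const unsigned OpMaskSize = 16, RootMaskSize = 4;
  const unsigned RootRatio = OpMaskSize / RootMaskSize; // 4
  const unsigned RootRatioLog2 = 2;                     // countTrailingZeros(4)
  for (unsigned i = 0; i != OpMaskSize; ++i) {
    assert(i / RootRatio == (i >> RootRatioLog2));
    assert(i % RootRatio == (i & (RootRatio - 1)));
    // The scaled root index is identical whichever form is used.
    const unsigned M = 3; // some in-range root mask entry
    assert(M * RootRatio + (i % RootRatio) ==
           ((M << RootRatioLog2) + (i & (RootRatio - 1))));
  }
  return 0;
}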
- for (int i = 0; i < MaskWidth; ++i) { - int RootIdx = i / RootRatio; + for (unsigned i = 0; i < MaskWidth; ++i) { + unsigned RootIdx = i >> RootRatioLog2; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. - Mask.push_back(RootMask[RootIdx]); + Mask[i] = RootMask[RootIdx]; continue; } - int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + unsigned RootMaskedIdx = + RootRatio == 1 + ? RootMask[RootIdx] + : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); // Just insert the scaled root mask value if it references an input other // than the SrcOp we're currently inserting. if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { - Mask.push_back(RootMaskedIdx); + Mask[i] = RootMaskedIdx; continue; } - RootMaskedIdx %= MaskWidth; - - int OpIdx = RootMaskedIdx / OpRatio; + RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); + unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. - Mask.push_back(OpMask[OpIdx]); + Mask[i] = OpMask[OpIdx]; continue; } // Ok, we have non-zero lanes, map them through to one of the Op's inputs. - int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio; - OpMaskedIdx %= MaskWidth; + unsigned OpMaskedIdx = + OpRatio == 1 + ? OpMask[OpIdx] + : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); + OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); if (OpMask[OpIdx] < (int)OpMask.size()) { assert(0 <= InputIdx0 && "Unknown target shuffle input"); OpMaskedIdx += InputIdx0 * MaskWidth; @@ -27439,7 +28318,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, OpMaskedIdx += InputIdx1 * MaskWidth; } - Mask.push_back(OpMaskedIdx); + Mask[i] = OpMaskedIdx; } // Handle the all undef/zero cases early. @@ -27457,28 +28336,25 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, } // Remove unused shuffle source ops. - SmallVector<SDValue, 8> UsedOps; - for (int i = 0, e = Ops.size(); i < e; ++i) { - int lo = UsedOps.size() * MaskWidth; - int hi = lo + MaskWidth; - if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { - UsedOps.push_back(Ops[i]); - continue; - } - for (int &M : Mask) - if (lo <= M) - M -= MaskWidth; - } - assert(!UsedOps.empty() && "Shuffle with no inputs detected"); - Ops = UsedOps; + resolveTargetShuffleInputsAndMask(Ops, Mask); + assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); - // See if we can recurse into each shuffle source op (if it's a target shuffle). + // Update the list of shuffle nodes that have been combined so far. + SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(), + SrcNodes.end()); + CombinedNodes.push_back(Op.getNode()); + + // See if we can recurse into each shuffle source op (if it's a target + // shuffle). The source op should only be combined if it either has a + // single use (i.e. current Op) or all its users have already been combined. 
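The removed loop above (its job is presumably taken over by the resolveTargetShuffleInputsAndMask call) dropped any source operand that the merged mask never references and renumbered the surviving indices. A standalone rendering of that clean-up, with plain ints standing in for the SDValue operands (illustrative only):

#include <cassert>
#include <vector>

// Illustrative only: remove unused shuffle source operands and shift the
// surviving mask indices down, as the removed hand-written loop did.
void dropUnusedOps(std::vector<int> &Ops, std::vector<int> &Mask,
                   int MaskWidth) {
  std::vector<int> Used;
  for (int Op : Ops) {
    int Lo = (int)Used.size() * MaskWidth, Hi = Lo + MaskWidth;
    bool Referenced = false;
    for (int M : Mask)
      Referenced |= (Lo <= M && M < Hi);
    if (Referenced) {
      Used.push_back(Op);
      continue;
    }
    for (int &M : Mask)
      if (M >= Lo)
        M -= MaskWidth;
  }
  Ops = Used;
}

int main() {
  std::vector<int> Ops = {10, 20, 30};  // stand-ins for three source values
  std::vector<int> Mask = {0, 1, 8, 9}; // only the first and third are used
  dropUnusedOps(Ops, Mask, 4);
  assert(Ops == std::vector<int>({10, 30}));
  assert(Mask == std::vector<int>({0, 1, 4, 5}));
  return 0;
}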
for (int i = 0, e = Ops.size(); i < e; ++i) - if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode())) - if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1, - HasVariableMask, DAG, DCI, Subtarget)) + if (Ops[i].getNode()->hasOneUse() || + SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) + if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes, + Depth + 1, HasVariableMask, DAG, DCI, + Subtarget)) return true; // Attempt to constant fold all of the constant source ops. @@ -27495,7 +28371,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. - SmallVector<int, 16> WidenedMask; + SmallVector<int, 64> WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); } @@ -27561,8 +28437,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); @@ -27842,19 +28717,20 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } case X86ISD::MOVSD: case X86ISD::MOVSS: { - bool isFloat = VT.isFloatingPoint(); SDValue V0 = peekThroughBitcasts(N->getOperand(0)); SDValue V1 = peekThroughBitcasts(N->getOperand(1)); - bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); - bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode()); bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode()); - assert(!(isZero0 && isZero1) && "Zeroable shuffle detected."); + if (isZero0 && isZero1) + return SDValue(); // We often lower to MOVSD/MOVSS from integer as well as native float // types; remove unnecessary domain-crossing bitcasts if we can to make it // easier to combine shuffles later on. We've already accounted for the // domain switching cost when we decided to lower with it. + bool isFloat = VT.isFloatingPoint(); + bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); + bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) { MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32) : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32); @@ -28025,7 +28901,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFD: - if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) return NewN; break; @@ -28173,12 +29049,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); - - // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) - return SDValue(); - // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. 
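The shrinking loop above keeps halving the mask while canWidenShuffleElements succeeds, so the final shuffle is emitted at the widest equivalent element size. A simplified standalone version of one widening step; here a pair only widens when both entries are defined and aligned, or both are undef, while the real helper is more permissive (illustrative only, even-sized masks assumed):

#include <cassert>
#include <vector>

// Illustrative only: try to rewrite Mask on elements twice as wide.
// -1 = undef; zero handling is omitted for brevity.
bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  Widened.clear();
  for (unsigned i = 0, e = (unsigned)Mask.size(); i != e; i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == -1 && M1 == -1)
      Widened.push_back(-1);
    else if (M0 != -1 && (M0 % 2) == 0 && M1 == M0 + 1)
      Widened.push_back(M0 / 2);
    else
      return false;
  }
  return true;
}

int main() {
  std::vector<int> W;
  assert(widenMask({0, 1, 6, 7}, W) && W == std::vector<int>({0, 3}));
  assert(!widenMask({1, 2, 4, 5}, W)); // pairs are not aligned
  return 0;
}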
if (TLI.isTypeLegal(VT)) @@ -28249,11 +29120,19 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) - Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { + Elts.push_back(Elt); + continue; + } + Elts.clear(); + break; + } - if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) - return LD; + if (Elts.size() == VT.getVectorNumElements()) + if (SDValue LD = + EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true)) + return LD; // For AVX2, we sometimes want to combine // (vector_shuffle <mask> (concat_vectors t1, undef) @@ -28276,7 +29155,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // a particular chain. SmallVector<int, 1> NonceMask; // Just a placeholder. NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. @@ -28303,18 +29182,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EVT OriginalVT = InVec.getValueType(); - if (InVec.getOpcode() == ISD::BITCAST) { - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - EVT BCVT = InVec.getOperand(0).getValueType(); - if (!BCVT.isVector() || - BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) - return SDValue(); - InVec = InVec.getOperand(0); - } + // Peek through bitcasts, don't duplicate a load with other uses. + InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); + if (!CurrentVT.isVector() || + CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); @@ -28389,23 +29263,151 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } +// Try to match patterns such as +// (i16 bitcast (v16i1 x)) +// -> +// (i16 movmsk (16i8 sext (v16i1 x))) +// before the illegal vector is scalarized on subtargets that don't have legal +// vxi1 types. +static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, + const X86Subtarget &Subtarget) { + EVT VT = BitCast.getValueType(); + SDValue N0 = BitCast.getOperand(0); + EVT VecVT = N0->getValueType(0); + + if (!VT.isScalarInteger() || !VecVT.isSimple()) + return SDValue(); + + // With AVX512 vxi1 types are legal and we prefer using k-regs. + // MOVMSK is supported in SSE2 or later. + if (Subtarget.hasAVX512() || !Subtarget.hasSSE2()) + return SDValue(); + + // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and + // v8f64. So all legal 128-bit and 256-bit vectors are covered except for + // v8i16 and v16i16. + // For these two cases, we can shuffle the upper element bytes to a + // consecutive sequence at the start of the vector and treat the results as + // v16i8 or v32i8, and for v61i8 this is the preferable solution. However, + // for v16i16 this is not the case, because the shuffle is expensive, so we + // avoid sign-extending to this type entirely. 
+ // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: + // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) + MVT SExtVT; + MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE; + switch (VecVT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v2i1: + SExtVT = MVT::v2i64; + FPCastVT = MVT::v2f64; + break; + case MVT::v4i1: + SExtVT = MVT::v4i32; + FPCastVT = MVT::v4f32; + // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) + // sign-extend to a 256-bit operation to avoid truncation. + if (N0->getOpcode() == ISD::SETCC && + N0->getOperand(0)->getValueType(0).is256BitVector() && + Subtarget.hasInt256()) { + SExtVT = MVT::v4i64; + FPCastVT = MVT::v4f64; + } + break; + case MVT::v8i1: + SExtVT = MVT::v8i16; + // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), + // sign-extend to a 256-bit operation to match the compare. + // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over + // 256-bit because the shuffle is cheaper than sign extending the result of + // the compare. + if (N0->getOpcode() == ISD::SETCC && + N0->getOperand(0)->getValueType(0).is256BitVector() && + Subtarget.hasInt256()) { + SExtVT = MVT::v8i32; + FPCastVT = MVT::v8f32; + } + break; + case MVT::v16i1: + SExtVT = MVT::v16i8; + // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), + // it is not profitable to sign-extend to 256-bit because this will + // require an extra cross-lane shuffle which is more expensive than + // truncating the result of the compare to 128-bits. + break; + case MVT::v32i1: + // TODO: Handle pre-AVX2 cases by splitting to two v16i1's. + if (!Subtarget.hasInt256()) + return SDValue(); + SExtVT = MVT::v32i8; + break; + }; + + SDLoc DL(BitCast); + SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT); + if (SExtVT == MVT::v8i16) { + V = DAG.getBitcast(MVT::v16i8, V); + V = DAG.getVectorShuffle( + MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8), + {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1}); + } else + assert(SExtVT.getScalarType() != MVT::i16 && + "Vectors of i16 must be shuffled"); + if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + V = DAG.getBitcast(FPCastVT, V); + V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); + return DAG.getZExtOrTrunc(V, DL, VT); +} + static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + EVT SrcVT = N0.getValueType(); + + // Try to match patterns such as + // (i16 bitcast (v16i1 x)) + // -> + // (i16 movmsk (16i8 sext (v16i1 x))) + // before the setcc result is scalarized on subtargets that don't have legal + // vxi1 types. + if (DCI.isBeforeLegalize()) + if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) + return V; + // Since MMX types are special and don't usually play with other vector types, + // it's better to handle them early to be sure we emit efficient code by + // avoiding store-load conversions. - // Detect bitcasts between i32 to x86mmx low word. Since MMX types are - // special and don't usually play with other vector types, it's better to - // handle them early to be sure we emit efficient code by avoiding - // store-load conversions. + // Detect bitcasts between i32 to x86mmx low word. 
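The transform above rests on a scalar equivalence: on a little-endian target, the integer produced by bitcasting a <N x i1> value is the same as the MOVMSK of the lanes once each i1 has been sign-extended to a full element. A standalone check of that equivalence for N = 8 (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  bool Lanes[8] = {true, false, true, true, false, false, false, true};

  // (i8 bitcast (v8i1 x)): lane i becomes bit i.
  uint8_t Bitcast = 0;
  for (int i = 0; i != 8; ++i)
    Bitcast |= uint8_t(Lanes[i]) << i;

  // movmsk of the sign-extended lanes (after packing to bytes): collect the
  // sign bit of each element into bit i.
  uint8_t Movmsk = 0;
  for (int i = 0; i != 8; ++i) {
    int8_t Extended = Lanes[i] ? -1 : 0; // sext of the i1 lane
    Movmsk |= uint8_t((Extended >> 7) & 1) << i;
  }

  assert(Bitcast == Movmsk && Bitcast == 0b10001101);
  return 0;
}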
if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && - N0.getValueType() == MVT::v2i32 && - isNullConstant(N0.getOperand(1))) { + SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); if (N00.getValueType() == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); } + // Detect bitcasts between element or subvector extraction to x86mmx. + if (VT == MVT::x86mmx && + (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && + isNullConstant(N0.getOperand(1))) { + SDValue N00 = N0->getOperand(0); + if (N00.getValueType().is128BitVector()) + return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, + DAG.getBitcast(MVT::v2i64, N00)); + } + + // Detect bitcasts from FP_TO_SINT to x86mmx. + if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 && + N0.getOpcode() == ISD::FP_TO_SINT) { + SDLoc DL(N0); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, + DAG.getUNDEF(MVT::v2i32)); + return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, + DAG.getBitcast(MVT::v2i64, Res)); + } + // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand into a floating-point logic operation. This may // create a load of a constant, but that is cheaper than materializing the @@ -28511,12 +29513,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, if (SetCC.getOpcode() != ISD::SETCC) return false; ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); - if (CC != ISD::SETGT) + if (CC != ISD::SETGT && CC != ISD::SETLT) return false; SDValue SelectOp1 = Select->getOperand(1); SDValue SelectOp2 = Select->getOperand(2); + // The following instructions assume SelectOp1 is the subtraction operand + // and SelectOp2 is the negation operand. + // In the case of SETLT this is the other way around. + if (CC == ISD::SETLT) + std::swap(SelectOp1, SelectOp2); + // The second operand of the select should be the negation of the first // operand, which is implemented as 0 - SelectOp1. if (!(SelectOp2.getOpcode() == ISD::SUB && @@ -28529,8 +29537,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, if (SetCC.getOperand(0) != SelectOp1) return false; - // The second operand of the comparison can be either -1 or 0. - if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || + // In SetLT case, The second operand of the comparison can be either 1 or 0. + APInt SplatVal; + if ((CC == ISD::SETLT) && + !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal, + /*AllowShrink*/false) && + SplatVal.isOneValue()) || + (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) + return false; + + // In SetGT case, The second operand of the comparison can be either -1 or 0. + if ((CC == ISD::SETGT) && + !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) return false; @@ -28576,17 +29594,92 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); } +// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. +static SDValue combineHorizontalPredicateResult(SDNode *Extract, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Bail without SSE2 or with AVX512VL (which uses predicate registers). 
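detectZextAbsDiff above now accepts both spellings of the absolute-difference select: compare-greater-than with (sub, -sub) arms, or compare-less-than with the arms swapped, which is why the operands are exchanged in the SETLT case. The scalar shapes being matched look like this (illustrative only):

#include <cassert>
#include <cstdint>

// Illustrative only: the two equivalent scalar forms of |A - B| that the
// SETGT/SETLT handling above recognizes per vector lane.
int64_t absdiffGT(int64_t A, int64_t B) {
  int64_t Sub = A - B;
  return (Sub > -1) ? Sub : (0 - Sub); // select(setgt(sub, -1), sub, 0-sub)
}
int64_t absdiffLT(int64_t A, int64_t B) {
  int64_t Sub = A - B;
  return (Sub < 1) ? (0 - Sub) : Sub;  // select(setlt(sub, 1), 0-sub, sub)
}

int main() {
  assert(absdiffGT(7, 30) == 23 && absdiffLT(7, 30) == 23);
  assert(absdiffGT(30, 7) == 23 && absdiffLT(30, 7) == 23);
  return 0;
}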
+ if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) + return SDValue(); + + EVT ExtractVT = Extract->getValueType(0); + unsigned BitWidth = ExtractVT.getSizeInBits(); + if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && + ExtractVT != MVT::i8) + return SDValue(); + + // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. + for (ISD::NodeType Op : {ISD::OR, ISD::AND}) { + SDValue Match = matchBinOpReduction(Extract, Op); + if (!Match) + continue; + + // EXTRACT_VECTOR_ELT can require implicit extension of the vector element + // which we can't support here for now. + if (Match.getScalarValueSizeInBits() != BitWidth) + continue; + + // We require AVX2 for PMOVMSKB for v16i16/v32i8; + unsigned MatchSizeInBits = Match.getValueSizeInBits(); + if (!(MatchSizeInBits == 128 || + (MatchSizeInBits == 256 && + ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) + return SDValue(); + + // Don't bother performing this for 2-element vectors. + if (Match.getValueType().getVectorNumElements() <= 2) + return SDValue(); + + // Check that we are extracting a reduction of all sign bits. + if (DAG.ComputeNumSignBits(Match) != BitWidth) + return SDValue(); + + // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. + MVT MaskVT; + if (64 == BitWidth || 32 == BitWidth) + MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), + MatchSizeInBits / BitWidth); + else + MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); + + APInt CompareBits; + ISD::CondCode CondCode; + if (Op == ISD::OR) { + // any_of -> MOVMSK != 0 + CompareBits = APInt::getNullValue(32); + CondCode = ISD::CondCode::SETNE; + } else { + // all_of -> MOVMSK == ((1 << NumElts) - 1) + CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); + CondCode = ISD::CondCode::SETEQ; + } + + // Perform the select as i32/i64 and then truncate to avoid partial register + // stalls. + unsigned ResWidth = std::max(BitWidth, 32u); + EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); + SDLoc DL(Extract); + SDValue Zero = DAG.getConstant(0, DL, ResVT); + SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); + SDValue Res = DAG.getBitcast(MaskVT, Match); + Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); + Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), + Ones, Zero, CondCode); + return DAG.getSExtOrTrunc(Res, DL, ExtractVT); + } + + return SDValue(); +} + static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. if (!Subtarget.hasSSE2()) return SDValue(); - // Verify the type we're extracting from is appropriate - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. + // Verify the type we're extracting from is any integer type above i16. EVT VT = Extract->getOperand(0).getValueType(); - if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) + if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) return SDValue(); unsigned RegSize = 128; @@ -28595,15 +29688,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, else if (Subtarget.hasAVX2()) RegSize = 256; - // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. + // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. 
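The combineHorizontalPredicateResult hunk above maps OR (any_of) and AND (all_of) reductions of a compare mask onto a single MOVMSK plus a scalar compare. A sketch of the equivalent intrinsic-level idiom, assuming SSE2 only; the helper names are not from the patch.

#include <emmintrin.h>

// any_of(a[i] == b[i])  ->  MOVMSK != 0
static bool anyLaneEqual(__m128i a, __m128i b) {
  return _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) != 0;
}

// all_of(a[i] == b[i])  ->  MOVMSK == ((1 << NumElts) - 1), 0xFFFF for 16 lanes
static bool allLanesEqual(__m128i a, __m128i b) {
  return _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xFFFF;
}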
- if (VT.getSizeInBits() / 4 > RegSize) + if (RegSize / VT.getVectorNumElements() < 8) return SDValue(); // Match shuffle + add pyramid. SDValue Root = matchBinOpReduction(Extract, ISD::ADD); + // The operand is expected to be zero extended from i8 + // (verified in detectZextAbsDiff). + // In order to convert to i64 and above, additional any/zero/sign + // extend is expected. + // The zero extend from 32 bit has no mathematical effect on the result. + // Also the sign extend is basically zero extend + // (extends the sign bit which is zero). + // So it is correct to skip the sign/zero extend instruction. + if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) + Root = Root.getOperand(0); + // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. if (!Root || (Root.getOpcode() != ISD::VSELECT)) @@ -28614,7 +29720,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, if (!detectZextAbsDiff(Root, Zext0, Zext1)) return SDValue(); - // Create the SAD instruction + // Create the SAD instruction. SDLoc DL(Extract); SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL); @@ -28636,13 +29742,103 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, } } - // Return the lowest i32. - MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32); + MVT Type = Extract->getSimpleValueType(0); + unsigned TypeSizeInBits = Type.getSizeInBits(); + // Return the lowest TypeSizeInBits bits. + MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, Extract->getOperand(1)); } +// Attempt to peek through a target shuffle and extract the scalar from the +// source. +static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Src = N->getOperand(0); + SDValue Idx = N->getOperand(1); + + EVT VT = N->getValueType(0); + EVT SrcVT = Src.getValueType(); + EVT SrcSVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + + // Don't attempt this for boolean mask vectors or unknown extraction indices. + if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Resolve the target shuffle inputs and mask. + SmallVector<int, 16> Mask; + SmallVector<SDValue, 2> Ops; + if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG)) + return SDValue(); + + // Attempt to narrow/widen the shuffle mask to the correct size. + if (Mask.size() != NumSrcElts) { + if ((NumSrcElts % Mask.size()) == 0) { + SmallVector<int, 16> ScaledMask; + int Scale = NumSrcElts / Mask.size(); + scaleShuffleMask(Scale, Mask, ScaledMask); + Mask = std::move(ScaledMask); + } else if ((Mask.size() % NumSrcElts) == 0) { + SmallVector<int, 16> WidenedMask; + while (Mask.size() > NumSrcElts && + canWidenShuffleElements(Mask, WidenedMask)) + Mask = std::move(WidenedMask); + // TODO - investigate support for wider shuffle masks with known upper + // undef/zero elements for implicit zero-extension. + } + } + + // Check if narrowing/widening failed. 
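The detectZextAbsDiff/combineBasicSADPattern hunks above recognize a sum of absolute byte differences and emit PSADBW. A sketch of what that computes, written directly with the SSE2 intrinsic; the helper name is not from the patch.

#include <emmintrin.h>
#include <cstdint>

// Sum of |a[i] - b[i]| over 16 unsigned bytes, i.e. the reduction the combine
// replaces with a single PSADBW plus a horizontal add of its two partial sums.
static uint32_t sumAbsDiff16(const uint8_t *a, const uint8_t *b) {
  __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a));
  __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b));
  __m128i sad = _mm_sad_epu8(va, vb);  // two partial sums, one per 64-bit half
  return static_cast<uint32_t>(_mm_cvtsi128_si32(sad)) +
         static_cast<uint32_t>(_mm_extract_epi16(sad, 4));
}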
+ if (Mask.size() != NumSrcElts) + return SDValue(); + + int SrcIdx = Mask[N->getConstantOperandVal(1)]; + SDLoc dl(N); + + // If the shuffle source element is undef/zero then we can just accept it. + if (SrcIdx == SM_SentinelUndef) + return DAG.getUNDEF(VT); + + if (SrcIdx == SM_SentinelZero) + return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) + : DAG.getConstant(0, dl, VT); + + SDValue SrcOp = Ops[SrcIdx / Mask.size()]; + SrcOp = DAG.getBitcast(SrcVT, SrcOp); + SrcIdx = SrcIdx % Mask.size(); + + // We can only extract other elements from 128-bit vectors and in certain + // circumstances, depending on SSE-level. + // TODO: Investigate using extract_subvector for larger vectors. + // TODO: Investigate float/double extraction if it will be just stored. + if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && + ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { + assert(SrcSVT == VT && "Unexpected extraction type"); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, + DAG.getIntPtrConstant(SrcIdx, dl)); + } + + if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || + (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { + assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && + "Unexpected extraction type"); + unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); + SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, + DAG.getIntPtrConstant(SrcIdx, dl)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp, + DAG.getValueType(SrcSVT)); + return DAG.getZExtOrTrunc(Assert, dl, VT); + } + + return SDValue(); +} + /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading @@ -28653,14 +29849,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; + if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) + return NewOp; + SDValue InputVector = N->getOperand(0); + SDValue EltIdx = N->getOperand(1); + + EVT SrcVT = InputVector.getValueType(); + EVT VT = N->getValueType(0); SDLoc dl(InputVector); + + // Detect mmx extraction of all bits as a i64. It works better as a bitcast. + if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && + VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { + SDValue MMXSrc = InputVector.getOperand(0); + + // The bitcast source is a direct mmx result. + if (MMXSrc.getValueType() == MVT::x86mmx) + return DAG.getBitcast(VT, InputVector); + } + // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && - N->getValueType(0) == MVT::i32 && - InputVector.getValueType() == MVT::v2i32 && - isa<ConstantSDNode>(N->getOperand(1)) && - N->getConstantOperandVal(1) == 0) { + VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. 
@@ -28668,15 +29879,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } - EVT VT = N->getValueType(0); - - if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) && - InputVector.getOpcode() == ISD::BITCAST && + if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && + isa<ConstantSDNode>(EltIdx) && isa<ConstantSDNode>(InputVector.getOperand(0))) { - uint64_t ExtractedElt = - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - uint64_t InputValue = - cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); + uint64_t ExtractedElt = N->getConstantOperandVal(1); + uint64_t InputValue = InputVector.getConstantOperandVal(0); uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } @@ -28687,9 +29894,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) return SAD; + // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. + if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) + return Cmp; + // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. - if (InputVector.getValueType() != MVT::v4i32) + if (SrcVT != MVT::v4i32) return SDValue(); // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a @@ -28717,9 +29928,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SDValue(); // Record which element was extracted. - ExtractedElements |= - 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); - + ExtractedElements |= 1 << Extract->getConstantOperandVal(1); Uses.push_back(Extract); } @@ -28752,11 +29961,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); } else { // Store the value to a temporary stack slot. - SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue StackPtr = DAG.CreateStackTemporary(SrcVT); SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, MachinePointerInfo()); - EVT ElementType = InputVector.getValueType().getVectorElementType(); + EVT ElementType = SrcVT.getVectorElementType(); unsigned EltSize = ElementType.getSizeInBits() / 8; // Replace each use (extract) with a load of the appropriate element. @@ -28779,8 +29988,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; - SDValue Idx = Extract->getOperand(1); - uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + uint64_t IdxVal = Extract->getConstantOperandVal(1); DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } @@ -28788,6 +29996,16 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// TODO - merge with combineExtractVectorElt once it can handle the implicit +// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in: +// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and +// combineBasicSADPattern. +static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + return combineExtractWithShuffle(N, DAG, DCI, Subtarget); +} + /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. 
static SDValue @@ -28812,12 +30030,11 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, // This situation only applies to avx512. if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { - //Invert the cond to not(cond) : xor(op,allones)=not(op) - SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()), - DL, CondVT)); - //Vselect cond, op1, op2 = Vselect not(cond), op2, op1 - return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); + // Invert the cond to not(cond) : xor(op,allones)=not(op) + SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond, + DAG.getAllOnesConstant(DL, CondVT)); + // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 + return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that @@ -28920,18 +30137,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(ShAmt, DL, MVT::i8)); } - // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. - if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) { - if (NeedsCondInvert) // Invert the condition if needed. - Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); - - // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); - return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, - SDValue(FalseC, 0)); - } - // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { @@ -28939,7 +30144,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; - bool isFastMultiplier = false; + bool IsFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: @@ -28951,12 +30156,12 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) - isFastMultiplier = true; + IsFastMultiplier = true; break; } } - if (isFastMultiplier) { + if (IsFastMultiplier) { APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); if (NeedsCondInvert) // Invert the condition if needed. Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, @@ -29049,7 +30254,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, return false; MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); // Only change element size, not type. 
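The combineSelectOfTwoConstants hunk above keeps the branch-free form for constant arms: with a 0/1 condition, cond ? T : F equals F + cond * (T - F), and a difference of 3, 5, 9 and so on folds into an LEA. A small sketch with wrapping unsigned arithmetic; the names are not from the patch.

#include <cassert>
#include <cstdint>

// cond ? T : F  ==  F + cond * (T - F) when cond is 0 or 1 (wrapping arithmetic).
static uint32_t selectConstant(bool Cond, uint32_t T, uint32_t F) {
  return F + static_cast<uint32_t>(Cond) * (T - F);
}

int main() {
  assert(selectConstant(true, 9, 0) == 9);   // diff 9: lea base(cond, cond*8)
  assert(selectConstant(false, 9, 0) == 0);
  assert(selectConstant(true, 2, 7) == 2);   // T < F still works via wrapping
  assert(selectConstant(false, 2, 7) == 7);
}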
- if (VT.isInteger() != OpEltVT.isInteger()) + if (EltVT.isInteger() != OpEltVT.isInteger()) return false; uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; @@ -29063,7 +30268,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DCI.AddToWorklist(Op1.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0, Op1, - DAG.getConstant(Imm, DL, MVT::i8))); + DAG.getIntPtrConstant(Imm, DL))); return true; } case ISD::EXTRACT_SUBVECTOR: { @@ -29072,7 +30277,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, return false; MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); // Only change element size, not type. - if (VT.isInteger() != OpEltVT.isInteger()) + if (EltVT.isInteger() != OpEltVT.isInteger()) return false; uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; @@ -29084,7 +30289,23 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DCI.AddToWorklist(Op0.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0, - DAG.getConstant(Imm, DL, MVT::i8))); + DAG.getIntPtrConstant(Imm, DL))); + return true; + } + case X86ISD::SUBV_BROADCAST: { + unsigned EltSize = EltVT.getSizeInBits(); + if (EltSize != 32 && EltSize != 64) + return false; + // Only change element size, not type. + if (VT.isInteger() != Op.getSimpleValueType().isInteger()) + return false; + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = MVT::getVectorVT(EltVT, + Op0.getSimpleValueType().getSizeInBits() / EltSize); + Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0)); + DCI.AddToWorklist(Op0.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0)); return true; } } @@ -29147,6 +30368,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: @@ -29177,6 +30399,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: @@ -29211,6 +30434,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: @@ -29239,6 +30463,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); + LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: @@ -29296,7 +30521,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); - return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); + return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } } @@ -29355,7 +30580,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // x s< 0 ? 
x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && - OpRHSConst->getAPIntValue().isSignBit()) + OpRHSConst->getAPIntValue().isSignMask()) // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. return DAG.getNode( @@ -29370,8 +30595,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If this is a *dynamic* select (non-constant condition) and we can match // this node with one of the variable blend instructions, restructure the - // condition so that the blends can use the high bit of each element and use - // SimplifyDemandedBits to simplify the condition operand. + // condition so that blends can use the high (sign) bit of each element and + // use SimplifyDemandedBits to simplify the condition operand. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { @@ -29404,51 +30629,49 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Byte blends are only available in AVX2 if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) return SDValue(); + // There are no 512-bit blend instructions that use sign bits. + if (VT.is512BitVector()) + return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); - APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); - - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), - DCI.isBeforeLegalizeOps()); - if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || - TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, - TLO)) { - // If we changed the computation somewhere in the DAG, this change - // will affect all users of Cond. - // Make sure it is fine and update all the nodes so that we do not - // use the generic VSELECT anymore. Otherwise, we may perform - // wrong optimizations as we messed up with the actual expectation + APInt DemandedMask(APInt::getSignMask(BitWidth)); + KnownBits Known; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) || + TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) { + // If we changed the computation somewhere in the DAG, this change will + // affect all users of Cond. Make sure it is fine and update all the nodes + // so that we do not use the generic VSELECT anymore. Otherwise, we may + // perform wrong optimizations as we messed with the actual expectation // for the vector boolean values. if (Cond != TLO.Old) { - // Check all uses of that condition operand to check whether it will be - // consumed by non-BLEND instructions, which may depend on all bits are - // set properly. - for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); - I != E; ++I) - if (I->getOpcode() != ISD::VSELECT) - // TODO: Add other opcodes eventually lowered into BLEND. + // Check all uses of the condition operand to check whether it will be + // consumed by non-BLEND instructions. Those may require that all bits + // are set properly. + for (SDNode *U : Cond->uses()) { + // TODO: Add other opcodes eventually lowered into BLEND. + if (U->getOpcode() != ISD::VSELECT) return SDValue(); + } - // Update all the users of the condition, before committing the change, - // so that the VSELECT optimizations that expect the correct vector - // boolean value will not be triggered. 
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); - I != E; ++I) - DAG.ReplaceAllUsesOfValueWith( - SDValue(*I, 0), - DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), - Cond, I->getOperand(1), I->getOperand(2))); + // Update all users of the condition before committing the change, so + // that the VSELECT optimizations that expect the correct vector boolean + // value will not be triggered. + for (SDNode *U : Cond->uses()) { + SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), + U->getValueType(0), Cond, U->getOperand(1), + U->getOperand(2)); + DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); + } DCI.CommitTargetLoweringOpt(TLO); return SDValue(); } - // At this point, only Cond is changed. Change the condition - // just for N to keep the opportunity to optimize all other - // users their own way. - DAG.ReplaceAllUsesOfValueWith( - SDValue(N, 0), - DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), - TLO.New, N->getOperand(1), N->getOperand(2))); + // Only Cond (rather than other nodes in the computation chain) was + // changed. Change the condition just for N to keep the opportunity to + // optimize all other users their own way. + SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB); return SDValue(); } } @@ -29456,7 +30679,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Look for vselects with LHS/RHS being bitcasted from an operation that // can be executed on another type. Push the bitcast to the inputs of // the operation. This exposes opportunities for using masking instructions. - if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() && + if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() && CondVT.getVectorElementType() == MVT::i1) { if (combineBitcastForMaskedOp(LHS, DAG, DCI)) return SDValue(N, 0); @@ -29464,6 +30687,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return SDValue(N, 0); } + // Custom action for SELECT MMX + if (VT == MVT::x86mmx) { + LHS = DAG.getBitcast(MVT::i64, LHS); + RHS = DAG.getBitcast(MVT::i64, RHS); + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS); + return DAG.getBitcast(VT, newSelect); + } + return SDValue(); } @@ -29711,11 +30942,40 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, return true; } +// When legalizing carry, we create carries via add X, -1 +// If that comes from an actual carry, via setcc, we use the +// carry directly. +static SDValue combineCarryThroughADD(SDValue EFLAGS) { + if (EFLAGS.getOpcode() == X86ISD::ADD) { + if (isAllOnesConstant(EFLAGS.getOperand(1))) { + SDValue Carry = EFLAGS.getOperand(0); + while (Carry.getOpcode() == ISD::TRUNCATE || + Carry.getOpcode() == ISD::ZERO_EXTEND || + Carry.getOpcode() == ISD::SIGN_EXTEND || + Carry.getOpcode() == ISD::ANY_EXTEND || + (Carry.getOpcode() == ISD::AND && + isOneConstant(Carry.getOperand(1)))) + Carry = Carry.getOperand(0); + if (Carry.getOpcode() == X86ISD::SETCC || + Carry.getOpcode() == X86ISD::SETCC_CARRY) { + if (Carry.getConstantOperandVal(0) == X86::COND_B) + return Carry.getOperand(1); + } + } + } + + return SDValue(); +} + /// Optimize an EFLAGS definition used according to the condition code \p CC /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. 
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG) { + if (CC == X86::COND_B) + if (SDValue Flags = combineCarryThroughADD(EFLAGS)) + return Flags; + if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; return combineSetCCAtomicArith(EFLAGS, CC, DAG); @@ -30141,6 +31401,77 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, } } +static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, + EVT VT, SDLoc DL) { + + auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { + SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), + DAG.getConstant(Mult, DL, VT)); + Result = DAG.getNode(ISD::SHL, DL, VT, Result, + DAG.getConstant(Shift, DL, MVT::i8)); + Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, + N->getOperand(0)); + return Result; + }; + + auto combineMulMulAddOrSub = [&](bool isAdd) { + SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), + DAG.getConstant(9, DL, VT)); + Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT)); + Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, + N->getOperand(0)); + return Result; + }; + + switch (MulAmt) { + default: + break; + case 11: + // mul x, 11 => add ((shl (mul x, 5), 1), x) + return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); + case 21: + // mul x, 21 => add ((shl (mul x, 5), 2), x) + return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); + case 22: + // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) + return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); + case 19: + // mul x, 19 => sub ((shl (mul x, 5), 2), x) + return combineMulShlAddOrSub(5, 2, /*isAdd*/ false); + case 13: + // mul x, 13 => add ((shl (mul x, 3), 2), x) + return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); + case 23: + // mul x, 13 => sub ((shl (mul x, 3), 3), x) + return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); + case 14: + // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x) + return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + combineMulShlAddOrSub(3, 2, /*isAdd*/ true)); + case 26: + // mul x, 26 => sub ((mul (mul x, 9), 3), x) + return combineMulMulAddOrSub(/*isAdd*/ false); + case 28: + // mul x, 28 => add ((mul (mul x, 9), 3), x) + return combineMulMulAddOrSub(/*isAdd*/ true); + case 29: + // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) + return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + combineMulMulAddOrSub(/*isAdd*/ true)); + case 30: + // mul x, 30 => sub (sub ((shl x, 5), x), x) + return DAG.getNode( + ISD::SUB, DL, VT, + DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(5, DL, MVT::i8)), + N->getOperand(0)), + N->getOperand(0)); + } + return SDValue(); +} + /// Optimize a single multiply with constant into two operations in order to /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. static SDValue combineMul(SDNode *N, SelectionDAG &DAG, @@ -30150,6 +31481,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); + if (!MulConstantOptimization) + return SDValue(); // An imul is usually smaller than the alternative sequence. 
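The combineMulSpecial hunk above rewrites multiplies by constants such as 11, 21, 23, 29 or 30 into LEA-friendly multiplies by 3, 5 or 9 combined with shifts and adds or subs. The identities are easy to spot-check with plain wrapping arithmetic; this is a sketch and the names are not from the patch.

#include <cassert>
#include <cstdint>

static uint64_t mul11(uint64_t x) { return ((x * 5) << 1) + x; }   // 11 = 5*2 + 1
static uint64_t mul21(uint64_t x) { return ((x * 5) << 2) + x; }   // 21 = 5*4 + 1
static uint64_t mul23(uint64_t x) { return ((x * 3) << 3) - x; }   // 23 = 3*8 - 1
static uint64_t mul29(uint64_t x) { return (x * 9) * 3 + x + x; }  // 29 = 9*3 + 2
static uint64_t mul30(uint64_t x) { return (x << 5) - x - x; }     // 30 = 32 - 2

int main() {
  for (uint64_t x : {0ull, 1ull, 7ull, 123456789ull}) {
    assert(mul11(x) == x * 11 && mul21(x) == x * 21 && mul23(x) == x * 23);
    assert(mul29(x) == x * 29 && mul30(x) == x * 30);
  }
}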
if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); @@ -30205,25 +31538,41 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); - } + } else if (!Subtarget.slowLEA()) + NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); if (!NewMul) { - assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) - && "Both cases that could cause potential overflows should have " - "already been handled."); - if (isPowerOf2_64(MulAmt - 1)) - // (mul x, 2^N + 1) => (add (shl x, N), x) - NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(MulAmt - 1), DL, - MVT::i8))); - - else if (isPowerOf2_64(MulAmt + 1)) - // (mul x, 2^N - 1) => (sub (shl x, N), x) - NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, - N->getOperand(0), - DAG.getConstant(Log2_64(MulAmt + 1), - DL, MVT::i8)), N->getOperand(0)); + assert(MulAmt != 0 && + MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && + "Both cases that could cause potential overflows should have " + "already been handled."); + int64_t SignMulAmt = C->getSExtValue(); + if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) && + (SignMulAmt != -INT64_MAX)) { + int NumSign = SignMulAmt > 0 ? 1 : -1; + bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1); + bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1); + if (IsPowerOf2_64PlusOne) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode( + ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL, + MVT::i8))); + } else if (IsPowerOf2_64MinusOne) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode( + ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL, + MVT::i8)), + N->getOperand(0)); + } + // To negate, subtract the number from zero + if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1) + NewMul = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); + } } if (NewMul) @@ -30246,8 +31595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); - const APInt &ShAmt = N1C->getAPIntValue(); - Mask = Mask.shl(ShAmt); + Mask <<= N1C->getAPIntValue(); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if // we carefully interrogate the mask to make sure we are semantics @@ -30396,42 +31744,95 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) && - "Unexpected opcode"); +static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || + X86ISD::VSRLI == Opcode) && + "Unexpected shift opcode"); + bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); + SDValue N0 = 
N->getOperand(0); + SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - - // This fails for mask register (vXi1) shifts. - if ((NumBitsPerElt % 8) != 0) - return SDValue(); + assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && + "Unexpected value type"); // Out of range logical bit shifts are guaranteed to be zero. - APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); - if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) - return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + // Out of range arithmetic bit shifts splat the sign bit. + APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue(); + if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) { + if (LogicalShift) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + else + ShiftVal = NumBitsPerElt - 1; + } // Shift N0 by zero -> N0. if (!ShiftVal) - return N->getOperand(0); + return N0; // Shift zero -> zero. - if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + if (ISD::isBuildVectorAllZeros(N0.getNode())) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31). + // This VSRLI only looks at the sign bit, which is unmodified by VSRAI. + // TODO - support other sra opcodes as needed. + if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt && + N0.getOpcode() == X86ISD::VSRAI) + return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1); + // We can decode 'whole byte' logical bit shifts as shuffles. - if ((ShiftVal.getZExtValue() % 8) == 0) { + if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) { SDValue Op(N, 0); SmallVector<int, 1> NonceMask; // Just a placeholder. NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. } + // Constant Folding. + APInt UndefElts; + SmallVector<APInt, 32> EltBits; + if (N->isOnlyUserOf(N0.getNode()) && + getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { + assert(EltBits.size() == VT.getVectorNumElements() && + "Unexpected shift value type"); + unsigned ShiftImm = ShiftVal.getZExtValue(); + for (APInt &Elt : EltBits) { + if (X86ISD::VSHLI == Opcode) + Elt <<= ShiftImm; + else if (X86ISD::VSRAI == Opcode) + Elt.ashrInPlace(ShiftImm); + else + Elt.lshrInPlace(ShiftImm); + } + return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); + } + + return SDValue(); +} + +static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + assert( + ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) || + (N->getOpcode() == X86ISD::PINSRW && + N->getValueType(0) == MVT::v8i16)) && + "Unexpected vector insertion"); + + // Attempt to combine PINSRB/PINSRW patterns to a shuffle. + SDValue Op(N, 0); + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, + DCI, Subtarget); return SDValue(); } @@ -30495,13 +31896,11 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 
0 : 4; if (Subtarget.hasAVX512()) { - SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00, - CMP01, - DAG.getConstant(x86cc, DL, MVT::i8)); - if (N->getValueType(0) != MVT::i1) - return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), - FSetCC); - return FSetCC; + SDValue FSetCC = + DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, + DAG.getConstant(x86cc, DL, MVT::i8)); + return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0), + FSetCC, DAG.getIntPtrConstant(0, DL)); } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, @@ -30550,33 +31949,15 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64) return SDValue(); - // Canonicalize XOR to the left. - if (N1.getOpcode() == ISD::XOR) - std::swap(N0, N1); - - if (N0.getOpcode() != ISD::XOR) - return SDValue(); + if (N0.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); - - N01 = peekThroughBitcasts(N01); - - // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an - // insert_subvector building a 256-bit AllOnes vector. - if (!ISD::isBuildVectorAllOnes(N01.getNode())) { - if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR) - return SDValue(); + if (N1.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); - SDValue V1 = N01->getOperand(0); - SDValue V2 = N01->getOperand(1); - if (V1.getOpcode() != ISD::INSERT_SUBVECTOR || - !V1.getOperand(0).isUndef() || - !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) || - !ISD::isBuildVectorAllOnes(V2.getNode())) - return SDValue(); - } - return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1); + return SDValue(); } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized @@ -30696,38 +32077,35 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is -/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to -/// eliminate loading the vector constant mask value. This relies on the fact -/// that a PCMP always creates an all-ones or all-zeros bitmask per element. -static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) { +/// If this is a zero/all-bits result that is bitwise-anded with a low bits +/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' +/// with a shift-right to eliminate loading the vector constant mask value. +static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); + EVT VT0 = Op0.getValueType(); + EVT VT1 = Op1.getValueType(); - // TODO: Use AssertSext to mark any nodes that have the property of producing - // all-ones or all-zeros. Then check for that node rather than particular - // opcodes. 
- if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT) + if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger()) return SDValue(); - // The existence of the PCMP node guarantees that we have the required SSE2 or - // AVX2 for a shift of this vector type, but there is no vector shift by - // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the - // masked compare nodes, so they should not make it here. - EVT VT0 = Op0.getValueType(); - EVT VT1 = Op1.getValueType(); - unsigned EltBitWidth = VT0.getScalarSizeInBits(); - if (VT0 != VT1 || EltBitWidth == 8) + APInt SplatVal; + if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal, + /*AllowShrink*/false) || + !SplatVal.isMask()) return SDValue(); - assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256); + if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)) + return SDValue(); - APInt SplatVal; - if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1) + unsigned EltBitWidth = VT0.getScalarSizeInBits(); + if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) return SDValue(); SDLoc DL(N); - SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8); + unsigned ShiftVal = SplatVal.countTrailingOnes(); + SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } @@ -30747,7 +32125,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) return R; - if (SDValue ShiftRight = combinePCMPAnd1(N, DAG)) + if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; EVT VT = N->getValueType(0); @@ -30760,7 +32138,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SDValue Op(N, 0); SmallVector<int, 1> NonceMask; // Just a placeholder. NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. @@ -30802,20 +32180,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // (sub (xor X, M), M) static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - assert(N->getOpcode() == ISD::OR); + assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256()))) + if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || + (VT.is256BitVector() && Subtarget.hasInt256()))) return SDValue(); - assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!"); - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) + // Canonicalize AND to LHS. + if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); + // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for + // ANDNP combine allows other combines to happen that prevent matching. if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) return SDValue(); @@ -30837,21 +32217,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, Y = peekThroughBitcasts(Y); EVT MaskVT = Mask.getValueType(); - - // Validate that the Mask operand is a vector sra node. 
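The combineAndMaskToShift hunk above generalizes the old PCMP-and-1 fold: when every lane is known to be all-ones or all-zeros, ANDing with a mask of k low bits is the same as a logical shift right by EltBits - k. A scalar sketch of that identity; the names are not from the patch.

#include <cassert>
#include <cstdint>

// For a lane that is all-ones or all-zeros (a compare result), AND with a
// low-bits mask equals a logical shift right that drags the sign bit down.
static uint32_t andLowMask(int32_t CmpLane, unsigned MaskBits) {
  return static_cast<uint32_t>(CmpLane) >> (32 - MaskBits);
}

int main() {
  for (int32_t Lane : {0, -1}) {
    assert(andLowMask(Lane, 1) == (static_cast<uint32_t>(Lane) & 0x1u));
    assert(andLowMask(Lane, 4) == (static_cast<uint32_t>(Lane) & 0xFu));
  }
}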
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but - // there is no psrai.b unsigned EltBits = MaskVT.getScalarSizeInBits(); - unsigned SraAmt = ~0; - if (Mask.getOpcode() == ISD::SRA) { - if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) - if (auto *AmtConst = AmtBV->getConstantSplatNode()) - SraAmt = AmtConst->getZExtValue(); - } else if (Mask.getOpcode() == X86ISD::VSRAI) { - SDValue SraC = Mask.getOperand(1); - SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); - } - if ((SraAmt + 1) != EltBits) + + // TODO: Attempt to handle floating point cases as well? + if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); @@ -30872,7 +32241,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, // (add (xor X, M), (and M, 1)) // And further to: // (sub (xor X, M), M) - if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && + DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { auto IsNegV = [](SDNode *N, SDValue V) { return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); @@ -30884,7 +32254,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, V = Y; if (V) { - assert(EltBits == 8 || EltBits == 16 || EltBits == 32); SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); SDValue SubOp2 = Mask; @@ -30901,8 +32270,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, if (V == Y) std::swap(SubOp1, SubOp2); - return DAG.getBitcast(VT, - DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2)); + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); } } @@ -30915,7 +32284,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } @@ -30969,7 +32338,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && N->getOperand(1).getOpcode() == X86ISD::CMP && - N->getOperand(1).getConstantOperandVal(1) == 0 && + isNullConstant(N->getOperand(1).getOperand(1)) && N->getOperand(1).getValueType().bitsGE(MVT::i32); }; @@ -31272,6 +32641,75 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } +/// Check if truncation with saturation form type \p SrcVT to \p DstVT +/// is valid for the given \p Subtarget. +static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX512()) + return false; + + // FIXME: Scalar type may be supported if we move it to vector register. 
+ if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512) + return false; + + EVT SrcElVT = SrcVT.getScalarType(); + EVT DstElVT = DstVT.getScalarType(); + if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) + return false; + if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32) + return false; + if (SrcVT.is512BitVector() || Subtarget.hasVLX()) + return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); + return false; +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectUSatPattern(SDValue In, EVT VT) { + if (In.getOpcode() != ISD::UMIN) + return SDValue(); + + //Saturation with truncation. We truncate from InVT to VT. + assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && + "Unexpected types for truncate operation"); + + APInt C; + if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C, + /*AllowShrink*/false)) { + // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according + // the element size of the destination type. + return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : + SDValue(); + } + return SDValue(); +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// The types should allow to use VPMOVUS* instruction on AVX512. +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectAVX512USatPattern(SDValue In, EVT VT, + const X86Subtarget &Subtarget) { + if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return SDValue(); + return detectUSatPattern(In, VT); +} + +static SDValue +combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT)) + return SDValue(); + if (auto USatVal = detectUSatPattern(In, VT)) + if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + return SDValue(); +} + /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. @@ -31327,8 +32765,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); if (!BV || !BV->isConstant()) return false; - for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); + for (SDValue Op : V->ops()) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); if (!C) return false; uint64_t Val = C->getZExtValue(); @@ -31403,15 +32841,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // For chips with slow 32-byte unaligned loads, break the 32-byte operation - // into two 16-byte operations. + // into two 16-byte operations. Also split non-temporal aligned loads on + // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. 
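The detectUSatPattern/combineTruncateWithUSat hunks above match trunc(umin(x, unsigned_max_of_dest_type)), which is the portable way to write an unsigned saturating narrow and what the VPMOVUS* forms perform on AVX-512. A scalar sketch of the matched pattern; the names are not from the patch.

#include <algorithm>
#include <cassert>
#include <cstdint>

// truncate(umin(x, UINT8_MAX)): the scalar shape of the pattern matched above.
static uint8_t truncateUSat8(uint32_t x) {
  return static_cast<uint8_t>(std::min<uint32_t>(x, 0xFFu));
}

int main() {
  assert(truncateUSat8(7) == 7);
  assert(truncateUSat8(255) == 255);
  assert(truncateUSat8(300) == 255);  // saturates instead of wrapping to 44
}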
ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && - TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, - AddressSpace, Alignment, &Fast) && !Fast) { + ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || + (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, + AddressSpace, Alignment, &Fast) && !Fast))) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -31664,7 +33104,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, Mld->getBasePtr(), NewMask, WideSrc0, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } @@ -31838,6 +33278,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); + if (SDValue Val = + detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); @@ -31975,7 +33421,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); - SDValue NewChain = NewLd.getValue(1); + // Make sure new load is placed in same chain order. + SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd); if (TokenFactorIndex >= 0) { Ops.push_back(NewChain); NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); @@ -31996,11 +33443,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, Ld->getPointerInfo().getWithOffset(4), MinAlign(Ld->getAlignment(), 4), Ld->getMemOperand()->getFlags()); + // Make sure new loads are placed in same chain order. + SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd); + NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd); - SDValue NewChain = LoLd.getValue(1); if (TokenFactorIndex >= 0) { - Ops.push_back(LoLd); - Ops.push_back(HiLd); + Ops.push_back(NewChain); NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); } @@ -32198,13 +33646,30 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); - auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) { - // TODO: Add extra cases where we can truncate both inputs for the - // cost of one (or none). - // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y ) + auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) { + unsigned TruncSizeInBits = VT.getScalarSizeInBits(); + + // Repeated operand, so we are only trading one output truncation for + // one input truncation. if (Op0 == Op1) return true; + // See if either operand has been extended from a smaller/equal size to + // the truncation size, allowing a truncation to combine with the extend. 
+ unsigned Opcode0 = Op0.getOpcode(); + if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND || + Opcode0 == ISD::ZERO_EXTEND) && + Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) + return true; + + unsigned Opcode1 = Op1.getOpcode(); + if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND || + Opcode1 == ISD::ZERO_EXTEND) && + Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) + return true; + + // See if either operand is a single use constant which can be constant + // folded. SDValue BC0 = peekThroughOneUseBitcasts(Op0); SDValue BC1 = peekThroughOneUseBitcasts(Op1); return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) || @@ -32236,7 +33701,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegalOrPromote(Opcode, VT) && - IsRepeatedOpOrOneUseConstant(Op0, Op1)) + IsRepeatedOpOrFreeTruncation(Op0, Op1)) return TruncateArithmetic(Op0, Op1); break; } @@ -32252,7 +33717,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(Opcode, VT) && - IsRepeatedOpOrOneUseConstant(Op0, Op1)) + IsRepeatedOpOrFreeTruncation(Op0, Op1)) return TruncateArithmetic(Op0, Op1); break; } @@ -32458,6 +33923,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try to combine truncation with unsigned saturation. + if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget)) + return Val; + // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { @@ -32494,8 +33963,8 @@ static SDValue isFNEG(SDNode *N) { SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); unsigned EltBits = Op1.getScalarValueSizeInBits(); - auto isSignBitValue = [&](const ConstantFP *C) { - return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits); + auto isSignMask = [&](const ConstantFP *C) { + return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits); }; // There is more than one way to represent the same constant on @@ -32506,21 +33975,21 @@ static SDValue isFNEG(SDNode *N) { // We check all variants here. if (Op1.getOpcode() == X86ISD::VBROADCAST) { if (auto *C = getTargetConstantFromNode(Op1.getOperand(0))) - if (isSignBitValue(cast<ConstantFP>(C))) + if (isSignMask(cast<ConstantFP>(C))) return Op0; } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) { if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode()) - if (isSignBitValue(CN->getConstantFPValue())) + if (isSignMask(CN->getConstantFPValue())) return Op0; } else if (auto *C = getTargetConstantFromNode(Op1)) { if (C->getType()->isVectorTy()) { if (auto *SplatV = C->getSplatValue()) - if (isSignBitValue(cast<ConstantFP>(SplatV))) + if (isSignMask(cast<ConstantFP>(SplatV))) return Op0; } else if (auto *FPConst = dyn_cast<ConstantFP>(C)) - if (isSignBitValue(FPConst)) + if (isSignMask(FPConst)) return Op0; } return SDValue(); @@ -32545,7 +34014,7 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, // use of a constant by performing (-0 - A*B) instead. // FIXME: Check rounding control flags as well once it becomes available. 
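The IsRepeatedOpOrFreeTruncation hunk above relies on the usual modular-arithmetic fact that truncating after a wrapping binop on extended operands gives the same result as doing the binop in the narrow type. A small sketch; the names are not from the patch.

#include <cassert>
#include <cstdint>

// trunc(binop(ext(x), ext(y))) == binop(x, y) for wrapping add and bitwise ops.
int main() {
  for (uint32_t x : {0u, 1u, 200u, 255u})
    for (uint32_t y : {0u, 3u, 129u, 255u}) {
      uint8_t nx = static_cast<uint8_t>(x), ny = static_cast<uint8_t>(y);
      assert(static_cast<uint8_t>(nx + ny) == static_cast<uint8_t>(x + y));
      assert(static_cast<uint8_t>(nx ^ ny) == static_cast<uint8_t>(x ^ y));
    }
}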
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && - Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) { + Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Zero); @@ -32800,8 +34269,35 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. - auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; - return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); + return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); +} + +/// Do target-specific dag combines on X86ISD::ANDNP nodes. +static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + // ANDNP(0, x) -> x + if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + return N->getOperand(1); + + // ANDNP(x, 0) -> 0 + if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) + return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N)); + + EVT VT = N->getValueType(0); + + // Attempt to recursively combine a bitmask ANDNP with shuffles. + if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + SDValue Op(N, 0); + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. + } + + return SDValue(); } static SDValue combineBT(SDNode *N, SelectionDAG &DAG, @@ -32811,12 +34307,12 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG, if (Op1.hasOneUse()) { unsigned BitWidth = Op1.getValueSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); - APInt KnownZero, KnownOne; + KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || - TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) + if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) || + TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) DCI.CommitTargetLoweringOpt(TLO); } return SDValue(); @@ -32878,8 +34374,8 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, return SDValue(); bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND; - bool NSW = Add->getFlags()->hasNoSignedWrap(); - bool NUW = Add->getFlags()->hasNoUnsignedWrap(); + bool NSW = Add->getFlags().hasNoSignedWrap(); + bool NUW = Add->getFlags().hasNoUnsignedWrap(); // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding // into the 'zext' @@ -32919,7 +34415,7 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, SDNodeFlags Flags; Flags.setNoSignedWrap(NSW); Flags.setNoUnsignedWrap(NUW); - return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags); + return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags); } /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> @@ -33065,13 +34561,22 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue AllOnes = - 
DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); - return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); + return DAG.getSelect(DL, VT, N0, AllOnes, Zero); } return SDValue(); } + if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && + isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { + // Invert and sign-extend a boolean is the same as zero-extend and subtract + // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently + // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1. + // sext (xor Bool, -1) --> sub (zext Bool), 1 + SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); + } + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -33212,8 +34717,47 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Optimize x == -y --> x+y == 0 -/// x != -y --> x+y != 0 +/// Try to map a 128-bit or larger integer comparison to vector instructions +/// before type legalization splits it up into chunks. +static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); + + // We're looking for an oversized integer equality comparison, but ignore a + // comparison with zero because that gets special treatment in EmitTest(). + SDValue X = SetCC->getOperand(0); + SDValue Y = SetCC->getOperand(1); + EVT OpVT = X.getValueType(); + unsigned OpSize = OpVT.getSizeInBits(); + if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y)) + return SDValue(); + + // TODO: Use PXOR + PTEST for SSE4.1 or later? + // TODO: Add support for AVX-512. + EVT VT = SetCC->getValueType(0); + SDLoc DL(SetCC); + if ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX2())) { + EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; + SDValue VecX = DAG.getBitcast(VecVT, X); + SDValue VecY = DAG.getBitcast(VecVT, Y); + + // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. + // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq + // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne + // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq + // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne + SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); + SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); + SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 
0xFFFF : 0xFFFFFFFF, DL, + MVT::i32); + return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); + } + + return SDValue(); +} + static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); @@ -33222,21 +34766,27 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDLoc DL(N); - if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) - if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); + if (CC == ISD::SETNE || CC == ISD::SETEQ) { + EVT OpVT = LHS.getValueType(); + // 0-x == y --> x+y == 0 + // 0-x != y --> x+y != 0 + if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && + LHS.hasOneUse()) { + SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); + return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } - if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) - if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); + // x == 0-y --> x+y == 0 + // x != 0-y --> x+y != 0 + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && + RHS.hasOneUse()) { + SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); + return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } + if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) + return V; + } + if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { bool IsSEXT0 = @@ -33293,56 +34843,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Helper function of performSETCCCombine. It is to materialize "setb reg" -// as "sbb reg,reg", since it can be extended without zext and produces -// an all-ones bit which is more useful than 0/1 in some cases. -static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS, - SelectionDAG &DAG, MVT VT) { - if (VT == MVT::i8) - return DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86::COND_B, DL, MVT::i8), - EFLAGS), - DAG.getConstant(1, DL, VT)); - assert (VT == MVT::i1 && "Unexpected type for SECCC node"); - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, - DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86::COND_B, DL, MVT::i8), - EFLAGS)); -} - // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); - if (CC == X86::COND_A) { - // Try to convert COND_A into COND_B in an attempt to facilitate - // materializing "setb reg". - // - // Do not flip "e > c", where "c" is a constant, because Cmp instruction - // cannot take an immediate as its first operand. 
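For reference, the pcmpeqb + pmovmskb idiom that combineVectorSizedSetCCEquality emits for an i128 equality can be written directly with SSE2 intrinsics. The sketch below is standalone and not part of the patch; equal128 is an illustrative helper name, and an SSE2-capable x86 host is assumed.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <emmintrin.h>

static bool equal128(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Cmp = _mm_cmpeq_epi8(VA, VB);     // pcmpeqb: 0xFF per matching byte
  return _mm_movemask_epi8(Cmp) == 0xFFFF;  // pmovmskb: all 16 bytes matched
}

int main() {
  uint8_t X[16] = {1, 2, 3}, Y[16] = {1, 2, 3}, Z[16] = {1, 2, 4};
  assert(equal128(X, Y) && !equal128(X, Z));
  assert(equal128(X, Y) == (std::memcmp(X, Y, 16) == 0));
  return 0;
}

The i256 form is the same shape with 32-byte vectors and a 0xFFFFFFFF mask, which matches the AVX2 branch of the combine.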
- // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), - EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); - return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0)); - } - } - - // Materialize "setb reg" as "sbb reg,reg", since it can be extended without - // a zext and produces an all-ones bit which is more useful than 0/1 in some - // cases. - if (CC == X86::COND_B) - return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); - // Try to simplify the EFLAGS and condition code operands. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) return getSETCC(CC, Flags, DL, DAG); @@ -33352,7 +34859,6 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, /// Optimize branch condition evaluation. static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue EFLAGS = N->getOperand(3); @@ -33512,6 +35018,18 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { + if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { + MVT VT = N->getSimpleValueType(0); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, + N->getOperand(0), N->getOperand(1), + Flags); + } + + return SDValue(); +} + // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -33535,48 +35053,237 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, Res1, CarryOut); } + if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { + MVT VT = N->getSimpleValueType(0); + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, + N->getOperand(0), N->getOperand(1), + Flags); + } + return SDValue(); } -/// fold (add Y, (sete X, 0)) -> adc 0, Y -/// (add Y, (setne X, 0)) -> sbb -1, Y -/// (sub (sete X, 0), Y) -> sbb 0, Y -/// (sub (setne X, 0), Y) -> adc -1, Y -static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { +/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit +/// which is more useful than 0/1 in some cases. +static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) { SDLoc DL(N); + // "Condition code B" is also known as "the carry flag" (CF). + SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8); + SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS); + MVT VT = N->getSimpleValueType(0); + if (VT == MVT::i8) + return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT)); - // Look through ZExts. - SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); - if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) - return SDValue(); + assert(VT == MVT::i1 && "Unexpected type for SETCC node"); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB); +} + +/// If this is an add or subtract where one operand is produced by a cmp+setcc, +/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} +/// with CMP+{ADC, SBB}. 
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { + bool IsSub = N->getOpcode() == ISD::SUB; + SDValue X = N->getOperand(0); + SDValue Y = N->getOperand(1); + + // If this is an add, canonicalize a zext operand to the RHS. + // TODO: Incomplete? What if both sides are zexts? + if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && + Y.getOpcode() != ISD::ZERO_EXTEND) + std::swap(X, Y); - SDValue SetCC = Ext.getOperand(0); - if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) + // Look through a one-use zext. + bool PeekedThroughZext = false; + if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { + Y = Y.getOperand(0); + PeekedThroughZext = true; + } + + // If this is an add, canonicalize a setcc operand to the RHS. + // TODO: Incomplete? What if both sides are setcc? + // TODO: Should we allow peeking through a zext of the other operand? + if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && + Y.getOpcode() != X86ISD::SETCC) + std::swap(X, Y); + + if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) return SDValue(); - X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); + SDLoc DL(N); + EVT VT = N->getValueType(0); + X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); + + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. + auto *ConstantX = dyn_cast<ConstantSDNode>(X); + if (ConstantX) { + if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) || + (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) { + // This is a complicated way to get -1 or 0 from the carry flag: + // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax + // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + Y.getOperand(1)); + } + + if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) || + (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) { + SDValue EFLAGS = Y->getOperand(1); + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + // Swap the operands of a SUB, and we have the same pattern as above. + // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB + // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + NewEFLAGS); + } + } + } + + if (CC == X86::COND_B) { + // X + SETB Z --> X + (mask SBB Z, Z) + // X - SETB Z --> X - (mask SBB Z, Z) + // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY? + SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG); + if (SBB.getValueSizeInBits() != VT.getSizeInBits()) + SBB = DAG.getZExtOrTrunc(SBB, DL, VT); + return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); + } + + if (CC == X86::COND_A) { + SDValue EFLAGS = Y->getOperand(1); + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. 
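The SETCC_CARRY folds in combineAddOrSubToADCOrSBB rely on small scalar identities that relate a compare's carry flag to a 0/-1 mask. A standalone C++ check of the two identities quoted in the comments above follows; it is illustrative only, and the 64x64 search space is arbitrary.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 64; ++A) {
    for (uint32_t B = 0; B < 64; ++B) {
      int32_t Mask = (A < B) ? -1 : 0;  // what "sbb %eax, %eax" materializes
      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0
      assert(-1 + static_cast<int32_t>(A >= B) == Mask);
      // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0
      assert(0 - static_cast<int32_t>(A < B) == Mask);
    }
  }
  return 0;
}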
+ // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), + EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG); + if (SBB.getValueSizeInBits() != VT.getSizeInBits()) + SBB = DAG.getZExtOrTrunc(SBB, DL, VT); + return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); + } + } + if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); - SDValue Cmp = SetCC.getOperand(1); + SDValue Cmp = Y.getOperand(1); if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || !X86::isZeroNode(Cmp.getOperand(1)) || !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); - SDValue CmpOp0 = Cmp.getOperand(0); - SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, - DAG.getConstant(1, DL, CmpOp0.getValueType())); + SDValue Z = Cmp.getOperand(0); + EVT ZVT = Z.getValueType(); + + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. + if (ConstantX) { + // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with + // fake operands: + // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) + // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) + if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) || + (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) { + SDValue Zero = DAG.getConstant(0, DL, ZVT); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + } + + // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' + // with fake operands: + // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) + // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) + if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || + (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { + SDValue One = DAG.getConstant(1, DL, ZVT); + SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); + } + } + + // (cmp Z, 1) sets the carry flag if Z is 0. + SDValue One = DAG.getConstant(1, DL, ZVT); + SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); + + // Add the flags type for ADC/SBB nodes. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); - SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); + // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) + // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) - return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, - DL, OtherVal.getValueType(), OtherVal, - DAG.getConstant(-1ULL, DL, OtherVal.getValueType()), - NewCmp); - return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, - DL, OtherVal.getValueType(), OtherVal, - DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); + return DAG.getNode(IsSub ? 
X86ISD::ADC : X86ISD::SBB, DL, VTs, X, + DAG.getConstant(-1ULL, DL, VT), Cmp1); + + // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) + // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, + DAG.getConstant(0, DL, VT), Cmp1); +} + +static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue MulOp = N->getOperand(0); + SDValue Phi = N->getOperand(1); + + if (MulOp.getOpcode() != ISD::MUL) + std::swap(MulOp, Phi); + if (MulOp.getOpcode() != ISD::MUL) + return SDValue(); + + ShrinkMode Mode; + if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16) + return SDValue(); + + EVT VT = N->getValueType(0); + + unsigned RegSize = 128; + if (Subtarget.hasBWI()) + RegSize = 512; + else if (Subtarget.hasAVX2()) + RegSize = 256; + unsigned VectorSize = VT.getVectorNumElements() * 16; + // If the vector size is less than 128, or greater than the supported RegSize, + // do not use PMADD. + if (VectorSize < 128 || VectorSize > RegSize) + return SDValue(); + + SDLoc DL(N); + EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + VT.getVectorNumElements()); + EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + VT.getVectorNumElements() / 2); + + // Shrink the operands of mul. + SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); + + // Madd vector size is half of the original vector size + SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1); + // Fill the rest of the output with 0 + SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); + return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, @@ -33650,12 +35357,41 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); } +/// Convert vector increment or decrement to sub/add with an all-ones constant: +/// add X, <1, 1...> --> sub X, <-1, -1...> +/// sub X, <1, 1...> --> add X, <-1, -1...> +/// The all-ones vector constant can be materialized using a pcmpeq instruction +/// that is commonly recognized as an idiom (has no register dependency), so +/// that's better/smaller than loading a splat 1 constant. +static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { + assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && + "Unexpected opcode for increment/decrement transform"); + + // Pseudo-legality check: getOnesVector() expects one of these types, so bail + // out and wait for legalization if we have an unsupported vector length. + EVT VT = N->getValueType(0); + if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) + return SDValue(); + + SDNode *N1 = N->getOperand(1).getNode(); + APInt SplatVal; + if (!ISD::isConstantSplatVector(N1, SplatVal, /*AllowShrink*/false) || + !SplatVal.isOneValue()) + return SDValue(); + + SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); + unsigned NewOpcode = N->getOpcode() == ISD::ADD ? 
ISD::SUB : ISD::ADD; + return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); +} + static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; - if (Flags->hasVectorReduction()) { + const SDNodeFlags Flags = N->getFlags(); + if (Flags.hasVectorReduction()) { if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) return Sad; + if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) + return MAdd; } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); @@ -33667,7 +35403,10 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); - return OptimizeConditionalInDecrement(N, DAG); + if (SDValue V = combineIncDecVector(N, DAG)) + return V; + + return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, @@ -33700,36 +35439,47 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); - return OptimizeConditionalInDecrement(N, DAG); + if (SDValue V = combineIncDecVector(N, DAG)) + return V; + + return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalize()) + return SDValue(); + SDLoc DL(N); unsigned Opcode = N->getOpcode(); MVT VT = N->getSimpleValueType(0); MVT SVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = SVT.getSizeInBits(); + SDValue Op = N->getOperand(0); MVT OpVT = Op.getSimpleValueType(); MVT OpEltVT = OpVT.getVectorElementType(); - unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + unsigned OpEltSizeInBits = OpEltVT.getSizeInBits(); + unsigned InputBits = OpEltSizeInBits * NumElts; // Perform any constant folding. // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled. - if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - unsigned NumDstElts = VT.getVectorNumElements(); - SmallBitVector Undefs(NumDstElts, false); - SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0)); - for (unsigned i = 0; i != NumDstElts; ++i) { - SDValue OpElt = Op.getOperand(i); - if (OpElt.getOpcode() == ISD::UNDEF) { - Undefs[i] = true; + APInt UndefElts; + SmallVector<APInt, 64> EltBits; + if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) { + APInt Undefs(NumElts, 0); + SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0)); + bool IsZEXT = + (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG); + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { + Undefs.setBit(i); continue; } - APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue(); - Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits()) - : Cst.sextOrTrunc(SVT.getSizeInBits()); + Vals[i] = IsZEXT ? 
EltBits[i].zextOrTrunc(EltSizeInBits) + : EltBits[i].sextOrTrunc(EltSizeInBits); } return getConstVector(Vals, Undefs, VT, DAG, DL); } @@ -33829,7 +35579,7 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, if (N->getOperand(0) == N->getOperand(1)) { if (N->getOpcode() == X86ISD::PCMPEQ) - return getOnesVector(VT, Subtarget, DAG, DL); + return getOnesVector(VT, DAG, DL); if (N->getOpcode() == X86ISD::PCMPGT) return getZeroVector(VT, Subtarget, DAG, DL); } @@ -33837,6 +35587,99 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDLoc dl(N); + SDValue Vec = N->getOperand(0); + SDValue SubVec = N->getOperand(1); + SDValue Idx = N->getOperand(2); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT OpVT = N->getSimpleValueType(0); + MVT SubVecVT = SubVec.getSimpleValueType(); + + // If this is an insert of an extract, combine to a shuffle. Don't do this + // if the insert or extract can be represented with a subvector operation. + if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && + SubVec.getOperand(0).getSimpleValueType() == OpVT && + (IdxVal != 0 || !Vec.isUndef())) { + int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue(); + if (ExtIdxVal != 0) { + int VecNumElts = OpVT.getVectorNumElements(); + int SubVecNumElts = SubVecVT.getVectorNumElements(); + SmallVector<int, 64> Mask(VecNumElts); + // First create an identity shuffle mask. + for (int i = 0; i != VecNumElts; ++i) + Mask[i] = i; + // Now insert the extracted portion. + for (int i = 0; i != SubVecNumElts; ++i) + Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; + + return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); + } + } + + // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte + // load: + // (insert_subvector (insert_subvector undef, (load16 addr), 0), + // (load16 addr + 16), Elts/2) + // --> load32 addr + // or: + // (insert_subvector (insert_subvector undef, (load32 addr), 0), + // (load32 addr + 32), Elts/2) + // --> load64 addr + // or a 16-byte or 32-byte broadcast: + // (insert_subvector (insert_subvector undef, (load16 addr), 0), + // (load16 addr), Elts/2) + // --> X86SubVBroadcast(load16 addr) + // or: + // (insert_subvector (insert_subvector undef, (load32 addr), 0), + // (load32 addr), Elts/2) + // --> X86SubVBroadcast(load32 addr) + if ((IdxVal == OpVT.getVectorNumElements() / 2) && + Vec.getOpcode() == ISD::INSERT_SUBVECTOR && + OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through bitcasts to get to the load. 
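Two of the folds above lean on the same x86 idiom: PCMPEQ of a register with itself produces all-ones without a load and without a real input dependency. That is why PCMPEQ(x, x) folds to getOnesVector, and why combineIncDecVector prefers "sub X, <-1,...>" over "add X, <1,...>", where the splat-1 constant would need a constant-pool load. A standalone SSE2 sketch, illustrative only and assuming an SSE2 host:

#include <cassert>
#include <cstdint>
#include <emmintrin.h>

int main() {
  __m128i X = _mm_set1_epi32(42);
  __m128i AllOnes = _mm_cmpeq_epi32(X, X);       // pcmpeqd x, x -> all-ones
  assert(_mm_movemask_epi8(AllOnes) == 0xFFFF);  // every byte is 0xFF
  // add X, <1,1,1,1> rewritten as sub X, <-1,-1,-1,-1> gives the same result.
  __m128i IncA = _mm_add_epi32(X, _mm_set1_epi32(1));
  __m128i IncB = _mm_sub_epi32(X, AllOnes);
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(IncA, IncB)) == 0xFFFF);
  return 0;
}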
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget.getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = {SubVec2, SubVec}; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, + Subtarget, false)) + return Ld; + } + } + // If lower/upper loads are the same and the only users of the load, then + // lower to a VBROADCASTF128/VBROADCASTI128/etc. + if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { + if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && + SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); + } + } + // If this is subv_broadcast insert into both halves, use a larger + // subv_broadcast. + if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, + SubVec.getOperand(0)); + } + } + } + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -33845,13 +35688,19 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, default: break; case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI, Subtarget); + case X86ISD::PEXTRW: + case X86ISD::PEXTRB: + return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget); + case ISD::INSERT_SUBVECTOR: + return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); - case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget); + case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); + case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: @@ -33870,6 +35719,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); + case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: @@ -33884,14 +35734,22 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); - case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget); - case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget); + case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); + case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::VSHLI: - case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget); + case X86ISD::VSRAI: + case X86ISD::VSRLI: + return combineVectorShiftImm(N, DAG, DCI, Subtarget); + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VSEXT: case X86ISD::VZEXT: return 
combineVSZext(N, DAG, DCI, Subtarget); + case X86ISD::PINSRB: + case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: + case X86ISD::EXTRQI: + case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -33969,14 +35827,21 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { /// know that the code that lowers COPY of EFLAGS has to use the stack, and if /// we don't adjust the stack we clobber the first frame index. /// See X86InstrInfo::copyPhysReg. -bool X86TargetLowering::hasCopyImplyingStackAdjustment( - MachineFunction *MF) const { - const MachineRegisterInfo &MRI = MF->getRegInfo(); - +static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); return any_of(MRI.reg_instructions(X86::EFLAGS), [](const MachineInstr &RI) { return RI.isCopy(); }); } +void X86TargetLowering::finalizeLowering(MachineFunction &MF) const { + if (hasCopyImplyingStackAdjustment(MF)) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + MFI.setHasCopyImplyingStackAdjustment(true); + } + + TargetLoweringBase::finalizeLowering(MF); +} + /// This method query the target whether it is beneficial for dag combiner to /// promote the specified node. If true, it should return the desired promotion /// type by reference. @@ -34217,6 +36082,7 @@ TargetLowering::ConstraintWeight switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + LLVM_FALLTHROUGH; case 'R': case 'q': case 'Q': @@ -34568,6 +36434,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::GR64RegClass); break; } + LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) @@ -34737,7 +36604,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. - if (Res.second->hasType(VT) || VT == MVT::Other) + if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should @@ -34775,11 +36642,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; - else if (X86::VR128RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) Res.second = &X86::VR128RegClass; - else if (X86::VR256RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) Res.second = &X86::VR256RegClass; - else if (X86::VR512RegClass.hasType(VT)) + else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; @@ -34819,7 +36686,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } -bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { +bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. 
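The "alternative sequence" referred to above is the usual multiply-by-magic-constant expansion of division by a constant. A standalone illustration for unsigned divide-by-10 follows; it is not part of the patch, DivBy10 is an illustrative name, and the constant/shift pair is the standard one for 32-bit unsigned division by 10.

#include <cassert>
#include <cstdint>

// floor(X / 10) computed as a widening multiply and shift, the kind of
// sequence the backend emits instead of a div when optimizing for speed.
static uint32_t DivBy10(uint32_t X) {
  return static_cast<uint32_t>((static_cast<uint64_t>(X) * 0xCCCCCCCDull) >> 35);
}

int main() {
  for (uint64_t X = 0; X <= 0xFFFFFFFFull; X += 9973)  // sampled check
    assert(DivBy10(static_cast<uint32_t>(X)) == static_cast<uint32_t>(X) / 10);
  assert(DivBy10(0xFFFFFFFFu) == 0xFFFFFFFFu / 10);
  return 0;
}

At minimum-size the single div instruction wins, which is what isIntDivCheap reports; for vector types the div would have to be scalarized, so the expansion stays preferable even then, as the comment notes.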
@@ -34827,8 +36694,8 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
   // integer division, leaving the division as-is is a loss even in terms of
   // size, because it will have to be scalarized, while the alternative code
   // sequence can be performed in vector form.
-  bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
-                                   Attribute::MinSize);
+  bool OptSize =
+      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
   return OptSize && !VT.isVector();
 }
 
@@ -34884,3 +36751,22 @@ void X86TargetLowering::insertCopiesSplitCSR(
 bool X86TargetLowering::supportSwiftError() const {
   return Subtarget.is64Bit();
 }
+
+/// Returns the name of the symbol used to emit stack probes or the empty
+/// string if not applicable.
+StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+  // If the function specifically requests stack probes, emit them.
+  if (MF.getFunction()->hasFnAttribute("probe-stack"))
+    return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
+
+  // Generally, if we aren't on Windows, the platform ABI does not include
+  // support for stack probes, so don't emit them.
+  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
+    return "";
+
+  // We need a stack probe to conform to the Windows ABI. Choose the right
+  // symbol.
+  if (Subtarget.is64Bit())
+    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
+  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
+}
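For context, the "probe-stack" string attribute consulted by getStackProbeSymbolName is set by the front end. A minimal sketch of that producer side, assuming the LLVM C++ API of this tree; the module name, function name and probe symbol (__my_probe_symbol) are illustrative, and this is not part of the patch.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("probe-demo", Ctx);
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "needs_probes", &M);
  // Request stack probes by symbol name; per the lowering above, the x86
  // backend honors this attribute even on non-Windows targets.
  F->addFnAttr("probe-stack", "__my_probe_symbol");
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);
  B.CreateRetVoid();
  M.print(outs(), nullptr);  // emit the annotated IR for inspection
  return 0;
}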