Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.cpp  6572
1 files changed, 4229 insertions, 2343 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index de6c9a6..957b46c40 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
@@ -52,7 +53,9 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
@@ -70,6 +73,30 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);
+static cl::opt<int> ExperimentalPrefLoopAlignment(
+ "x86-experimental-pref-loop-alignment", cl::init(4),
+ cl::desc("Sets the preferable loop alignment for experiments "
+ "(the last x86-experimental-pref-loop-alignment bits"
+ " of the loop header PC will be 0)."),
+ cl::Hidden);
+
+static cl::opt<bool> MulConstantOptimization(
+ "mul-constant-optimization", cl::init(true),
+ cl::desc("Replace 'mul x, Const' with more effective instructions like "
+ "SHIFT, LEA, etc."),
+ cl::Hidden);
+
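As an illustration of the rewrite this flag controls (a sketch, not part of the patch; the function name is hypothetical), a multiply by a constant such as 9 can lower to a single LEA instead of an IMUL:

    // x * 9 == x + 8*x, which fits LEA's base + scaled-index form:
    //   leal (%rax,%rax,8), %eax
    static int mulByNine(int X) {
      return X * 9;
    }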
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
+ const char *Msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
+}
+
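For context, a minimal sketch of how a client could observe these diagnostics instead of crashing, assuming the LLVM 5.x-era LLVMContext handler API (handleDiag is a hypothetical name):

    #include "llvm/IR/DiagnosticInfo.h"
    #include "llvm/IR/DiagnosticPrinter.h"
    #include "llvm/Support/raw_ostream.h"

    // Print unsupported-feature diagnostics and keep compiling.
    static void handleDiag(const llvm::DiagnosticInfo &DI, void *) {
      llvm::DiagnosticPrinterRawOStream DP(llvm::errs());
      DI.print(DP);
      llvm::errs() << "\n";
    }
    // Installed once per context:
    //   Context.setDiagnosticHandler(handleDiag);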
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -290,16 +317,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UREM, VT, Expand);
}
- for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
- if (VT == MVT::i64 && !Subtarget.is64Bit())
- continue;
- // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
- setOperationAction(ISD::ADDC, VT, Custom);
- setOperationAction(ISD::ADDE, VT, Custom);
- setOperationAction(ISD::SUBC, VT, Custom);
- setOperationAction(ISD::SUBE, VT, Custom);
- }
-
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
@@ -401,8 +418,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::SETCCE, VT, Custom);
}
+
+ // Custom action for SELECT MMX and expand action for SELECT_CC MMX
+ setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
+
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
// SjLj exception handling but a light-weight setjmp/longjmp replacement to
@@ -427,7 +448,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
- // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+
+ // 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
@@ -648,7 +670,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FPOWI, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
@@ -775,29 +796,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
- setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
- setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
- setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
-
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
- setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
- setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
- setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
-
- setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
- setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
- setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
- setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
- // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -872,22 +882,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
- for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- }
-
- // In the customized shift lowering, the legal cases in AVX2 will be
- // recognized.
- for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
+ // In the customized shift lowering, the legal v4i32/v2i64 cases
+ // in AVX2 will be recognized.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+ setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+ setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
@@ -922,6 +929,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
+ for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
+ }
+
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
@@ -929,19 +941,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
-
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+ for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+ setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
+ }
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
@@ -1005,36 +1012,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
- for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
+ // In the customized shift lowering, the legal v8i32/v4i64 cases
+ // in AVX2 will be recognized.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
- setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
- setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
- setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
-
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
+ for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ }
+
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
@@ -1065,6 +1067,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
@@ -1081,27 +1084,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
- setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
-
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
- }
-
- // In the customized shift lowering, the legal cases in AVX2 will be
- // recognized.
- for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
+ for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+ setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
+ }
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
@@ -1126,7 +1116,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
@@ -1149,7 +1139,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
- addRegisterClass(MVT::i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
@@ -1164,16 +1154,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::SETCC, MVT::i1, Custom);
- setOperationAction(ISD::SETCCE, MVT::i1, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::XOR, MVT::i1, Legal);
- setOperationAction(ISD::OR, MVT::i1, Legal);
- setOperationAction(ISD::AND, MVT::i1, Legal);
- setOperationAction(ISD::SUB, MVT::i1, Custom);
- setOperationAction(ISD::ADD, MVT::i1, Custom);
- setOperationAction(ISD::MUL, MVT::i1, Custom);
for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
@@ -1242,27 +1222,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
- setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
- setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
- if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ if (Subtarget.hasDQI()) {
+ for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
+ setOperationAction(ISD::SINT_TO_FP, VT, Legal);
+ setOperationAction(ISD::UINT_TO_FP, VT, Legal);
+ setOperationAction(ISD::FP_TO_SINT, VT, Legal);
+ setOperationAction(ISD::FP_TO_UINT, VT, Legal);
+ }
if (Subtarget.hasVLX()) {
// Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
@@ -1296,8 +1265,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
}
- setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
@@ -1310,11 +1277,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
}
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
@@ -1330,48 +1297,54 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
-
- setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
- setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
- setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
-
- setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
- setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
- setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
- setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
- setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
- setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
- setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
- setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
-
- setOperationAction(ISD::ADD, MVT::v8i1, Expand);
- setOperationAction(ISD::ADD, MVT::v16i1, Expand);
- setOperationAction(ISD::SUB, MVT::v8i1, Expand);
- setOperationAction(ISD::SUB, MVT::v16i1, Expand);
- setOperationAction(ISD::MUL, MVT::v8i1, Expand);
- setOperationAction(ISD::MUL, MVT::v16i1, Expand);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ setOperationAction(ISD::ABS, MVT::v4i64, Legal);
+ setOperationAction(ISD::ABS, MVT::v2i64, Legal);
+
+ for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ }
+
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
+
+ // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
+ MVT::v8i64}) {
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
}
// Need to promote to 64-bit even though we have 32-bit masked instructions
@@ -1382,33 +1355,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
if (Subtarget.hasCDI()) {
- setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
- setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
-
- setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
- setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
- setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
- setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
-
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
-
- if (Subtarget.hasVLX()) {
- setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
- setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
- setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
- setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
- } else {
- setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
- setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
- setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
- setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
+ MVT::v4i64, MVT::v8i64}) {
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
}
-
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
} // Subtarget.hasCDI()
if (Subtarget.hasDQI()) {
@@ -1418,6 +1370,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
+ if (Subtarget.hasVPOPCNTDQ()) {
+ // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
+ // version of popcntd/q.
+ for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
+ MVT::v4i32, MVT::v2i64})
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ }
+
// Custom lower several nodes.
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
@@ -1428,7 +1388,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Custom under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v8f32, MVT::v4f64 })
+ MVT::v8f32, MVT::v4f64, MVT::v1i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
MVT::v16i1, MVT::v32i1, MVT::v64i1 })
@@ -1438,10 +1398,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Legal);
@@ -1460,12 +1420,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
- setOperationAction(ISD::ADD, MVT::v32i1, Expand);
- setOperationAction(ISD::ADD, MVT::v64i1, Expand);
- setOperationAction(ISD::SUB, MVT::v32i1, Expand);
- setOperationAction(ISD::SUB, MVT::v64i1, Expand);
- setOperationAction(ISD::MUL, MVT::v32i1, Expand);
- setOperationAction(ISD::MUL, MVT::v64i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v32i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v64i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v64i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v32i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v64i1, Custom);
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
@@ -1479,8 +1439,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
@@ -1502,8 +1462,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
- setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
@@ -1515,15 +1473,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
- setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
- setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
- setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
- setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
- setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
- setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
-
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
@@ -1545,7 +1494,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
@@ -1553,6 +1503,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
@@ -1574,9 +1528,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
- setOperationAction(ISD::ADD, VT, Expand);
- setOperationAction(ISD::SUB, VT, Expand);
- setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::TRUNCATE, VT, Custom);
@@ -1626,6 +1580,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
+
+ // Support carry in as value rather than glue.
+ setOperationAction(ISD::ADDCARRY, VT, Custom);
+ setOperationAction(ISD::SUBCARRY, VT, Custom);
+ setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
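A sketch of source that exercises these new nodes, under the assumption that an i128 addition legalizes to a UADDO/ADDCARRY chain and then to ADD/ADC (add128 is an illustrative name):

    #include <cstdint>

    // The low words add with ADD; the carry flows into the high words via ADC.
    void add128(uint64_t A[2], const uint64_t B[2]) {
      unsigned __int128 Sum = ((unsigned __int128)A[1] << 64 | A[0]) +
                              ((unsigned __int128)B[1] << 64 | B[0]);
      A[0] = (uint64_t)Sum;
      A[1] = (uint64_t)(Sum >> 64);
    }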
if (!Subtarget.is64Bit()) {
@@ -1671,6 +1630,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
@@ -1696,6 +1656,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -1712,7 +1674,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
- setPrefLoopAlignment(4); // 2^4 bytes.
+
+ // TODO: These control memcmp expansion in CGP and could be raised higher, but
+ // that needs to be benchmarked and balanced with the potential use of vector
+ // load/store types (PR33329, PR33914).
+ MaxLoadsPerMemcmp = 2;
+ MaxLoadsPerMemcmpOptSize = 2;
+
+ // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
+ setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
@@ -1742,7 +1712,7 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
- return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
+ return MVT::i8;
if (VT.isSimple()) {
MVT VVT = VT.getSimpleVT();
@@ -1933,6 +1903,34 @@ bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
+void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const {
+
+ // Only relabel X86-32 for C / Stdcall CCs.
+ if (Subtarget.is64Bit())
+ return;
+ if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
+ return;
+ unsigned ParamRegs = 0;
+ if (auto *M = MF->getFunction()->getParent())
+ ParamRegs = M->getNumberRegisterParameters();
+
+ // Mark the first N int arguments as having reg
+ for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
+ Type *T = Args[Idx].Ty;
+ if (T->isPointerTy() || T->isIntegerTy())
+ if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
+ unsigned numRegs = 1;
+ if (MF->getDataLayout().getTypeAllocSize(T) > 4)
+ numRegs = 2;
+ if (ParamRegs < numRegs)
+ return;
+ ParamRegs -= numRegs;
+ Args[Idx].IsInReg = true;
+ }
+ }
+}
+
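getNumberRegisterParameters() reads a module flag that -mregparm-style frontends can set; a sketch of how that might look (setRegParm is an illustrative helper, and the flag semantics are an assumption based on this hook):

    #include "llvm/IR/Module.h"

    // Request that the first N integer libcall arguments go in registers.
    static void setRegParm(llvm::Module &M, uint32_t N) {
      M.addModuleFlag(llvm::Module::Error, "NumRegisterParameters", N);
    }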
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
@@ -2001,21 +1999,37 @@ unsigned X86TargetLowering::getAddressSpace() const {
return 256;
}
-Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
- // glibc has a special slot for the stack guard in tcbhead_t, use it instead
- // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
- if (!Subtarget.isTargetGlibc())
- return TargetLowering::getIRStackGuard(IRB);
-
- // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
- // %gs:0x14 on i386
- unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
- unsigned AddressSpace = getAddressSpace();
+static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
+ return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
+ (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
+}
+
+static Constant* SegmentOffset(IRBuilder<> &IRB,
+ unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard in
+ // tcbhead_t; use it instead of the usual global variable (see
+ // sysdeps/{i386,x86_64}/nptl/tls.h)
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
+ if (Subtarget.isTargetFuchsia()) {
+ // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+ return SegmentOffset(IRB, 0x10, getAddressSpace());
+ } else {
+ // %fs:0x28, unless we're using a Kernel code model, in which case
+ // it's %gs:0x28. gs:0x14 on i386.
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
+ }
+
+ return TargetLowering::getIRStackGuard(IRB);
+}
+
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
@@ -2027,13 +2041,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
auto *SecurityCheckCookie = cast<Function>(
M.getOrInsertFunction("__security_check_cookie",
Type::getVoidTy(M.getContext()),
- Type::getInt8PtrTy(M.getContext()), nullptr));
+ Type::getInt8PtrTy(M.getContext())));
SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
return;
}
- // glibc has a special slot for the stack guard.
- if (Subtarget.isTargetGlibc())
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard.
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
@@ -2056,21 +2070,23 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
- if (!Subtarget.isTargetAndroid())
- return TargetLowering::getSafeStackPointerLocation(IRB);
-
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- unsigned AddressSpace, Offset;
+ if (Subtarget.isTargetAndroid()) {
+ // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+ // %gs:0x24 on i386
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
- // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
- // %gs:0x24 on i386
- Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
- AddressSpace = getAddressSpace();
- return ConstantExpr::getIntToPtr(
- ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
- Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+ // Fuchsia is similar.
+ if (Subtarget.isTargetFuchsia()) {
+ // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+ return SegmentOffset(IRB, 0x18, getAddressSpace());
+ }
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -2160,6 +2176,12 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ // In some cases we need to disable registers from the default CSR list.
+ // For example, when they are used for argument passing.
+ bool ShouldDisableCalleeSavedRegister =
+ CallConv == CallingConv::X86_RegCall ||
+ MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
+
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
@@ -2179,6 +2201,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // Add the register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
+
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
@@ -2203,15 +2230,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (ValVT == MVT::f64 &&
+ (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
+ // Likewise we can't return F64 values with SSE1 only. gcc does so, but
+ // llvm-gcc has never done it right and no one has noticed, so this
+ // should be OK for now.
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
- // Likewise we can't return F64 values with SSE1 only. gcc does so, but
- // llvm-gcc has never done it right and no one has noticed, so this
- // should be OK for now.
- if (ValVT == MVT::f64 &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
- report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
@@ -2253,6 +2282,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
+
+ // Add the second register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
@@ -2309,6 +2342,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+
+ // Add the returned register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -2444,7 +2481,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
// Convert the i32 type into v32i1 type
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
- // Concantenate the two values together
+ // Concatenate the two values together
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
@@ -2456,6 +2493,9 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
+ if (ValVT == MVT::v1i1)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
+
if (ValVT == MVT::v64i1) {
// In 32 bit machine, this case is handled by getv64i1Argument
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
@@ -2478,7 +2518,6 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
}
-
return DAG.getBitcast(ValVT, ValReturned);
}
@@ -2488,8 +2527,10 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget.is64Bit();
@@ -2503,10 +2544,19 @@ SDValue X86TargetLowering::LowerCallResult(
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
+ // In some calling conventions we need to remove the used registers
+ // from the register mask.
+ if (RegMask) {
+ for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+ }
+
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -2624,7 +2674,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
// C calling conventions:
case CallingConv::C:
- case CallingConv::X86_64_Win64:
+ case CallingConv::Win64:
case CallingConv::X86_64_SysV:
// Callee pop conventions:
case CallingConv::X86_ThisCall:
@@ -2643,13 +2693,13 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}
-bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
auto Attr =
CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
if (!CI->isTailCall() || Attr.getValueAsString() == "true")
return false;
- CallSite CS(CI);
+ ImmutableCallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
@@ -2669,6 +2719,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If value is passed by pointer we have address passed instead of the value
// itself. No need to extend if the mask value and location share the same
@@ -2686,13 +2737,16 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// taken by a return address.
int Offset = 0;
if (CallConv == CallingConv::X86_INTR) {
- const X86Subtarget& Subtarget =
- static_cast<const X86Subtarget&>(DAG.getSubtarget());
// X86 interrupts may take one or two arguments.
// On the stack there will be no return address as in regular call.
// Offset of last argument need to be set to -4/-8 bytes.
// Where offset of the first argument out of two, should be set to 0 bytes.
Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+ if (Subtarget.is64Bit() && Ins.size() == 2) {
+ // The stack pointer needs to be realigned for 64 bit handlers with error
+ // code, so the argument offset changes by 8 bytes.
+ Offset += 8;
+ }
}
// FIXME: For now, all byval parameter objects are marked mutable. This can be
@@ -2707,30 +2761,74 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
if (CallConv == CallingConv::X86_INTR) {
MFI.setObjectOffset(FI, Offset);
}
- return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- } else {
- int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
- VA.getLocMemOffset(), isImmutable);
-
- // Set SExt or ZExt flag.
- if (VA.getLocInfo() == CCValAssign::ZExt) {
- MFI.setObjectZExt(FI, true);
- } else if (VA.getLocInfo() == CCValAssign::SExt) {
- MFI.setObjectSExt(FI, true);
+ return DAG.getFrameIndex(FI, PtrVT);
+ }
+
+ // This is an argument in memory. We might be able to perform copy elision.
+ if (Flags.isCopyElisionCandidate()) {
+ EVT ArgVT = Ins[i].ArgVT;
+ SDValue PartAddr;
+ if (Ins[i].PartOffset == 0) {
+ // If this is a one-part value or the first part of a multi-part value,
+ // create a stack object for the entire argument value type and return a
+ // load from our portion of it. This assumes that if the first part of an
+ // argument is in memory, the rest will also be in memory.
+ int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
+ /*Immutable=*/false);
+ PartAddr = DAG.getFrameIndex(FI, PtrVT);
+ return DAG.getLoad(
+ ValVT, dl, Chain, PartAddr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ } else {
+ // This is not the first piece of an argument in memory. See if there is
+ // already a fixed stack object including this offset. If so, assume it
+ // was created by the PartOffset == 0 branch above and create a load from
+ // the appropriate offset into it.
+ int64_t PartBegin = VA.getLocMemOffset();
+ int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
+ int FI = MFI.getObjectIndexBegin();
+ for (; MFI.isFixedObjectIndex(FI); ++FI) {
+ int64_t ObjBegin = MFI.getObjectOffset(FI);
+ int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
+ if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
+ break;
+ }
+ if (MFI.isFixedObjectIndex(FI)) {
+ SDValue Addr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
+ DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
+ return DAG.getLoad(
+ ValVT, dl, Chain, Addr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
+ Ins[i].PartOffset));
+ }
}
+ }
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
+ int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), isImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue Val = DAG.getLoad(
- ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
- return ExtendedInMem ?
- DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
+ // Set SExt or ZExt flag.
+ if (VA.getLocInfo() == CCValAssign::ZExt) {
+ MFI.setObjectZExt(FI, true);
+ } else if (VA.getLocInfo() == CCValAssign::SExt) {
+ MFI.setObjectSExt(FI, true);
}
+
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI.setObjectOffset(FI, Offset);
+ }
+
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val = DAG.getLoad(
+ ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ return ExtendedInMem
+ ? (VA.getValVT().isVector()
+ ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
+ : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
+ : Val;
}
// FIXME: Get this from tablegen.
@@ -2781,12 +2879,14 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
+#ifndef NDEBUG
static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
});
}
+#endif
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -2836,8 +2936,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
// The next loop assumes that the locations are in the same order of the
// input arguments.
- if (!isSortedByValueNo(ArgLocs))
- llvm_unreachable("Argument Location list must be sorted before lowering");
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -2853,7 +2953,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
"Currently the only custom case is when we split v64i1 to 2 regs");
// v64i1 values, in regcall calling convention, that are
- // compiled to 32 bit arch, are splited up into two registers.
+ // compiled to 32 bit arch, are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
@@ -2878,7 +2978,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
- else if (RegVT == MVT::i1)
+ else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
@@ -3107,8 +3207,9 @@ SDValue X86TargetLowering::LowerFormalArguments(
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
- // X86 interrupts must pop the error code if present
- FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
+ // X86 interrupts must pop the error code (and the alignment padding) if
+ // present.
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
@@ -3146,6 +3247,13 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
}
+ if (CallConv == CallingConv::X86_RegCall ||
+ Fn->hasFnAttribute("no_caller_saved_registers")) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
+ MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
+ }
+
return Chain;
}
@@ -3232,6 +3340,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+ const CallInst *CI =
+ CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
+ const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
+ bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
+ (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3333,8 +3446,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
+ NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -3348,8 +3461,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The next loop assumes that the locations are in the same order of the
// input arguments.
- if (!isSortedByValueNo(ArgLocs))
- llvm_unreachable("Argument Location list must be sorted before lowering");
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
@@ -3517,7 +3630,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
- "Expecting custome case only in regcall calling convention");
+ "Expecting custom case only in regcall calling convention");
// This means that we are in special case where one argument was
// passed through two register locations - Skip the next location
++I;
@@ -3644,7 +3757,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
+ // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
+ // set X86_INTR calling convention because it has the same CSR mask
+ // (same preserved registers).
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(
+ MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
@@ -3662,7 +3779,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Mask = RegInfo->getNoPreservedMask();
}
- Ops.push_back(DAG.getRegisterMask(Mask));
+ // Define a new register mask from the existing mask.
+ uint32_t *RegMask = nullptr;
+
+ // In some calling conventions we need to remove the used physical registers
+ // from the reg mask.
+ if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ // Allocate a new Reg Mask and copy Mask.
+ RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
+ unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+ memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
+
+ // Make sure all sub registers of the argument registers are reset
+ // in the RegMask.
+ for (auto const &RegPair : RegsToPass)
+ for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+
+ // Create the RegMask Operand according to our updated mask.
+ Ops.push_back(DAG.getRegisterMask(RegMask));
+ } else {
+ // Create the RegMask Operand according to the static mask.
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
if (InFlag.getNode())
Ops.push_back(InFlag);
@@ -3715,8 +3857,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Handle result values, copying them out of physregs into vregs that we
// return.
- return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
- Ins, dl, DAG, InVals);
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+ InVals, RegMask);
}
//===----------------------------------------------------------------------===//
@@ -3847,6 +3989,13 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
if (Offset != MFI.getObjectOffset(FI))
return false;
+ // If this is not byval, check that the argument stack object is immutable.
+ // inalloca and argument copy elision can create mutable argument stack
+ // objects. Byval objects can be mutated, but a byval call intends to pass the
+ // mutated memory.
+ if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
+ return false;
+
if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
@@ -4088,6 +4237,8 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PSHUFLW:
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
+ case X86ISD::EXTRQI:
+ case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -4132,6 +4283,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
return true;
// 'Faux' Target Shuffles.
case ISD::AND:
+ case X86ISD::ANDNP:
return true;
}
}
@@ -4448,6 +4600,11 @@ bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ return true;
+}
+
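// [Editor's note] Not part of the patch: returning true here is profitable
// because x86 folds an AND feeding a compare-with-zero into a single TEST.
// For example, '(x & 8) == 0' can lower to roughly:
//   testb $8, %dil
//   sete  %al
// so keeping the mask and the compare together costs no extra instructions.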
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (!Subtarget.hasBMI())
return false;
@@ -4460,6 +4617,26 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
return true;
}
+MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
+ MVT VT = MVT::getIntegerVT(NumBits);
+ if (isTypeLegal(VT))
+ return VT;
+
+ // PMOVMSKB can handle this.
+ if (NumBits == 128 && isTypeLegal(MVT::v16i8))
+ return MVT::v16i8;
+
+ // VPMOVMSKB can handle this.
+ if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+ return MVT::v32i8;
+
+ // TODO: Allow 64-bit type for 32-bit target.
+ // TODO: 512-bit types should be allowed, but make sure that those
+ // cases are handled in combineVectorSizedSetCCEquality().
+
+ return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
+
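// [Editor's note] Illustrative sketch, not part of the patch: a 128-bit
// equality compare via v16i8 corresponds roughly to the SSE2 intrinsics
//   __m128i eq = _mm_cmpeq_epi8(a, b);
//   bool equal = _mm_movemask_epi8(eq) == 0xFFFF;
// i.e. PCMPEQB + PMOVMSKB + one scalar compare, typically faster than
// chaining scalar 64-bit compares for memcmp-style equality tests.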
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
@@ -4555,28 +4732,30 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+ int M0 = Mask[i];
+ int M1 = Mask[i + 1];
+
// If both elements are undef, it's trivial.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+ if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
- Mask[i + 1] % 2 == 1) {
- WidenedMask[i / 2] = Mask[i + 1] / 2;
+ if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
+ WidenedMask[i / 2] = M1 / 2;
continue;
}
- if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
- WidenedMask[i / 2] = Mask[i] / 2;
+ if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
+ WidenedMask[i / 2] = M0 / 2;
continue;
}
// When zeroing, we need to spread the zeroing across both lanes to widen.
- if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
- if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
- (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+ if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
+ if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
+ (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
continue;
}
@@ -4585,9 +4764,8 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
// Finally check if the two mask values are adjacent and aligned with
// a pair.
- if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
- Mask[i] + 1 == Mask[i + 1]) {
- WidenedMask[i / 2] = Mask[i] / 2;
+ if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
+ WidenedMask[i / 2] = M0 / 2;
continue;
}
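// [Editor's note] Worked examples, not part of the patch:
//   <0, 1, 6, 7>       -> <0, 3>    (both pairs adjacent and even-aligned)
//   <undef, 3, 4, 5>   -> <1, 2>    (undef paired with odd element uses M1/2)
//   <zero, undef, 2, 3> -> <zero, 1> (zero spreads across the whole pair)
//   <1, 2, ...> fails: the pair is adjacent but not aligned to an even index.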
@@ -4608,7 +4786,7 @@ static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
SmallVectorImpl<int> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
int NumElts = Mask.size();
- ScaledMask.assign(NumElts * Scale, -1);
+ ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
@@ -4634,14 +4812,10 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
return false;
// The index should be aligned on a vecWidth-bit boundary.
- uint64_t Index =
- cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
-
+ uint64_t Index = N->getConstantOperandVal(1);
MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getScalarSizeInBits();
- bool Result = (Index * ElSize) % vecWidth == 0;
-
- return Result;
+ return (Index * ElSize) % vecWidth == 0;
}
/// Return true if the specified INSERT_SUBVECTOR
@@ -4651,15 +4825,12 @@ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
return false;
- // The index should be aligned on a vecWidth-bit boundary.
- uint64_t Index =
- cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+ // The index should be aligned on a vecWidth-bit boundary.
+ uint64_t Index = N->getConstantOperandVal(2);
MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getScalarSizeInBits();
- bool Result = (Index * ElSize) % vecWidth == 0;
-
- return Result;
+ return (Index * ElSize) % vecWidth == 0;
}
bool X86::isVINSERT128Index(SDNode *N) {
@@ -4683,13 +4854,9 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
"Illegal extract subvector for VEXTRACT");
- uint64_t Index =
- cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
-
+ uint64_t Index = N->getConstantOperandVal(1);
MVT VecVT = N->getOperand(0).getSimpleValueType();
- MVT ElVT = VecVT.getVectorElementType();
-
- unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+ unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
return Index / NumElemsPerChunk;
}
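// [Editor's note] Worked example, not part of the patch: extracting a v4i32
// subvector at index 4 from a v8i32 source with vecWidth == 128 gives
// NumElemsPerChunk = 128 / 32 = 4 and an immediate of 4 / 4 = 1, i.e. the
// upper 128-bit lane of the 256-bit source.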
@@ -4698,13 +4865,9 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
"Illegal insert subvector for VINSERT");
- uint64_t Index =
- cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
-
+ uint64_t Index = N->getConstantOperandVal(2);
MVT VecVT = N->getSimpleValueType(0);
- MVT ElVT = VecVT.getVectorElementType();
-
- unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+ unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
return Index / NumElemsPerChunk;
}
@@ -4737,9 +4900,9 @@ bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
}
-// Build a vector of constants
+// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
-// Spilt 64-bit constants in the 32-bit mode.
+// Split 64-bit constants in 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
const SDLoc &dl, bool IsMask = false) {
@@ -4770,9 +4933,10 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
return ConstsNode;
}
-static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
- assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
+ assert(Bits.size() == Undefs.getBitWidth() &&
+ "Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
@@ -4844,10 +5008,6 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
- // Extract from UNDEF is UNDEF.
- if (Vec.isUndef())
- return DAG.getUNDEF(ResultVT);
-
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
@@ -4858,8 +5018,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
+ return DAG.getBuildVector(
+ ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
@@ -4918,50 +5078,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
-
- // For insertion into the zero index (low half) of a 256-bit vector, it is
- // more efficient to generate a blend with immediate instead of an insert*128.
- // We are still creating an INSERT_SUBVECTOR below with an undef node to
- // extend the subvector to the size of the result vector. Make sure that
- // we are not recursing on that node by checking for undef here.
- if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
- !Result.isUndef()) {
- EVT ResultVT = Result.getValueType();
- SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
- SDValue Undef = DAG.getUNDEF(ResultVT);
- SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
- Vec, ZeroIndex);
-
- // The blend instruction, and therefore its mask, depend on the data type.
- MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
- if (ScalarType.isFloatingPoint()) {
- // Choose either vblendps (float) or vblendpd (double).
- unsigned ScalarSize = ScalarType.getSizeInBits();
- assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
- unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
- SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
- return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
- }
-
- const X86Subtarget &Subtarget =
- static_cast<const X86Subtarget &>(DAG.getSubtarget());
-
- // AVX2 is needed for 256-bit integer blend support.
- // Integers must be cast to 32-bit because there is only vpblendd;
- // vpblendw can't be used for this because it has a handicapped mask.
-
- // If we don't have AVX2, then cast to float. Using a wrong domain blend
- // is still more efficient than using the wrong domain vinsertf128 that
- // will be created by InsertSubVector().
- MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
-
- SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
- Result = DAG.getBitcast(CastVT, Result);
- Vec256 = DAG.getBitcast(CastVT, Vec256);
- Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
- return DAG.getBitcast(ResultVT, Vec256);
- }
-
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
@@ -4971,6 +5087,20 @@ static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
+// Return true if the instruction zeroes the unused upper part of the
+// destination and accepts a mask.
+static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86ISD::PCMPEQM:
+ case X86ISD::PCMPGTM:
+ case X86ISD::CMPM:
+ case X86ISD::CMPMU:
+ return true;
+ }
+}
+
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -5003,6 +5133,22 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// 3. Subvector should be inserted in the middle (for example v2i1
// to v16i1, index 2)
+ // If this node widens - by concatenating zeroes - the type of the result
+ // of a node with an instruction that zeroes all upper (irrelevant) bits of
+ // the output register, mark this node as legal to enable replacing them with
+ // the v8i1 version of the previous instruction during instruction selection.
+ // For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a
+ // k-reg, while zeroing all the upper remaining 60 bits of the register. If
+ // the result of such an instruction is inserted into an all-zero vector,
+ // then we can safely remove the insert_vector (in instruction selection) as
+ // the cmp instruction has already zeroed the rest of the register.
+ if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
+ (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
+ (SubVec.getOpcode() == ISD::AND &&
+ (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
+ isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
+ return Op;
+
// extend to natively supported kshift
MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
MVT WideOpVT = OpVT;
@@ -5023,7 +5169,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (Vec.isUndef()) {
if (IdxVal != 0) {
SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
+ WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
+ ShiftBits);
}
return ExtractSubVec(WideSubVec);
}
@@ -5032,9 +5179,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
return ExtractSubVec(Vec);
}
@@ -5043,8 +5190,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
@@ -5056,12 +5203,12 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
// Zero upper bits of the Vec
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+ WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
return ExtractSubVec(Vec);
}
@@ -5094,26 +5241,38 @@ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
}
/// Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
-/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
- SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
- SDValue Vec;
- if (!Subtarget.hasInt256() && NumElts == 8) {
- Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
- Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
- } else {
- Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
- }
+ SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
+static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
+ SelectionDAG &DAG) {
+ EVT InVT = In.getValueType();
+ assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
+
+ if (VT.is128BitVector() && InVT.is128BitVector())
+ return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
+ : DAG.getZeroExtendVectorInReg(In, DL, VT);
+
+ // For 256-bit vectors, we only need the lower (128-bit) input half.
+ // For 512-bit vectors, we only need the lower input half or quarter.
+ if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
+ int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
+ In = extractSubVector(In, 0, DAG, DL,
+ std::max(128, (int)VT.getSizeInBits() / Scale));
+ }
+
+ return DAG.getNode(Opc, DL, VT, In);
+}
+
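// [Editor's note] Worked example, not part of the patch: sign-extending to
// v8i32 (256 bits) from a v32i8 source gives Scale = 32 / 8 = 4, so the input
// is first narrowed to its lower max(128, 256 / 4) = 128 bits (v16i8); the
// resulting VSEXT then consumes only the low 8 bytes of that half.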
/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
bool Unary) {
@@ -5199,9 +5358,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
- SmallBitVector &UndefElts,
- SmallVectorImpl<APInt> &EltBits) {
- assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+ APInt &UndefElts,
+ SmallVectorImpl<APInt> &EltBits,
+ bool AllowWholeUndefs = true,
+ bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
@@ -5211,174 +5371,173 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
- // Extract all the undef/constant element data and pack into single bitsets.
- APInt UndefBits(SizeInBits, 0);
- APInt MaskBits(SizeInBits, 0);
+ // Bitcast a source array of element bits to the target size.
+ auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
+ unsigned NumSrcElts = UndefSrcElts.getBitWidth();
+ unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
+ assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
+ "Constant bit sizes don't match");
+
+ // Don't split if we don't allow undef bits.
+ bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
+ if (UndefSrcElts.getBoolValue() && !AllowUndefs)
+ return false;
+
+ // If we're already the right size, don't bother bitcasting.
+ if (NumSrcElts == NumElts) {
+ UndefElts = UndefSrcElts;
+ EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
+ return true;
+ }
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ APInt UndefBits(SizeInBits, 0);
+ APInt MaskBits(SizeInBits, 0);
- // Split the undef/constant single bitset data into the target elements.
- auto SplitBitData = [&]() {
- UndefElts = SmallBitVector(NumElts, false);
+ for (unsigned i = 0; i != NumSrcElts; ++i) {
+ unsigned BitOffset = i * SrcEltSizeInBits;
+ if (UndefSrcElts[i])
+ UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
+ MaskBits.insertBits(SrcEltBits[i], BitOffset);
+ }
+
+ // Split the undef/constant single bitset data into the target elements.
+ UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
- APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
- UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+ unsigned BitOffset = i * EltSizeInBits;
+ APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
- // Only treat an element as UNDEF if all bits are UNDEF, otherwise
- // treat it as zero.
+ // Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
- UndefElts[i] = true;
+ if (!AllowWholeUndefs)
+ return false;
+ UndefElts.setBit(i);
continue;
}
- APInt Bits = MaskBits.lshr(i * EltSizeInBits);
- Bits = Bits.zextOrTrunc(EltSizeInBits);
+ // If only some bits are UNDEF then treat them as zero (or bail if not
+ // supported).
+ if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
+ return false;
+
+ APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
EltBits[i] = Bits.getZExtValue();
}
return true;
};
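// [Editor's note] Worked example, not part of the patch: recasting a v2i64
// source {0x0000000300000001, undef} to 32-bit elements yields, in
// little-endian bit order, EltBits = {0x1, 0x3, ?, ?} with UndefElts = 0b1100
// (both halves of the undef i64 become whole-undef i32 elements).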
- auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
- APInt &Undefs) {
+ // Collect constant bits and insert into mask/undef bit masks.
+ auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
+ unsigned UndefBitIndex) {
if (!Cst)
return false;
- unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
if (isa<UndefValue>(Cst)) {
- Mask = APInt::getNullValue(SizeInBits);
- Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
+ Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
- Mask = CInt->getValue().zextOrTrunc(SizeInBits);
- Undefs = APInt::getNullValue(SizeInBits);
+ Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
- Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
- Undefs = APInt::getNullValue(SizeInBits);
+ Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
};
+ // Extract constant bits from build vector.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ const SDValue &Src = Op.getOperand(i);
+ if (Src.isUndef()) {
+ UndefSrcElts.setBit(i);
+ continue;
+ }
+ auto *Cst = cast<ConstantSDNode>(Src);
+ SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+ }
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
return false;
- unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
- for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
- APInt Bits, Undefs;
- if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
+ unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSrcElts = CstTy->getVectorNumElements();
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
+ UndefSrcElts, i))
return false;
- MaskBits |= Bits.shl(i * CstEltSizeInBits);
- UndefBits |= Undefs.shl(i * CstEltSizeInBits);
- }
- return SplitBitData();
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
- EltSizeInBits <= Op.getScalarValueSizeInBits()) {
+ EltSizeInBits <= VT.getScalarSizeInBits()) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
- APInt Bits, Undefs;
- if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
- unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
- unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
- for (unsigned i = 0; i != NumBroadcastElts; ++i) {
- MaskBits |= Bits.shl(i * NumBroadcastBits);
- UndefBits |= Undefs.shl(i * NumBroadcastBits);
- }
- return SplitBitData();
+ unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
+ if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
+ if (UndefSrcElts[0])
+ UndefSrcElts.setBits(0, NumSrcElts);
+ SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
+ // Extract a rematerialized scalar constant insertion.
+ if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
+ Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits;
+ auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
+ SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
+ SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
return false;
}
-// TODO: Merge more of this with getTargetConstantBitsFromNode.
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask) {
- MaskNode = peekThroughBitcasts(MaskNode);
-
- MVT VT = MaskNode.getSimpleValueType();
- assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
- unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
-
- // Split an APInt element into MaskEltSizeInBits sized pieces and
- // insert into the shuffle mask.
- auto SplitElementToMask = [&](APInt Element) {
- // Note that this is x86 and so always little endian: the low byte is
- // the first byte of the mask.
- int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
- for (int i = 0; i < Split; ++i) {
- APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
- Element = Element.lshr(MaskEltSizeInBits);
- RawMask.push_back(RawElt.getZExtValue());
- }
- };
-
- if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
- if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
- return false;
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
- const APInt &MaskElement = CN->getAPIntValue();
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
- RawMask.push_back(RawElt.getZExtValue());
- }
- }
- return false;
- }
-
- if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
- MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
- SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
- if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
- RawMask.push_back(CN->getZExtValue());
- RawMask.append(NumMaskElts - 1, 0);
- return true;
- }
-
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
- unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
- SplitElementToMask(CN->getAPIntValue());
- RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
- return true;
- }
- }
- return false;
- }
-
- if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
- return false;
-
- // We can always decode if the buildvector is all zero constants,
- // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
- if (all_of(MaskNode->ops(), X86::isZeroNode)) {
- RawMask.append(NumMaskElts, 0);
- return true;
- }
-
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+
+ // Extract the raw target constant bits.
+ // FIXME: We currently don't support UNDEF bits or mask entries.
+ if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
+ EltBits, /* AllowWholeUndefs */ false,
+ /* AllowPartialUndefs */ false))
return false;
- for (SDValue Op : MaskNode->ops()) {
- if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
- SplitElementToMask(CN->getAPIntValue());
- else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
- SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
- else
- return false;
- }
+ // Insert the extracted elements into the mask.
+ for (APInt Elt : EltBits)
+ RawMask.push_back(Elt.getZExtValue());
return true;
}
@@ -5405,6 +5564,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::BLENDI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
ImmN = N->getOperand(N->getNumOperands()-1);
@@ -5416,6 +5576,24 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
+ case X86ISD::EXTRQI:
+ if (isa<ConstantSDNode>(N->getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ int BitLen = N->getConstantOperandVal(1);
+ int BitIdx = N->getConstantOperandVal(2);
+ DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
+ IsUnary = true;
+ }
+ break;
+ case X86ISD::INSERTQI:
+ if (isa<ConstantSDNode>(N->getOperand(2)) &&
+ isa<ConstantSDNode>(N->getOperand(3))) {
+ int BitLen = N->getConstantOperandVal(2);
+ int BitIdx = N->getConstantOperandVal(3);
+ DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ }
+ break;
case X86ISD::UNPCKH:
DecodeUNPCKHMask(VT, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
@@ -5473,8 +5651,18 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
- // We only decode broadcasts of same-sized vectors at the moment.
- if (N->getOperand(0).getValueType() == VT) {
+ SDValue N0 = N->getOperand(0);
+ // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+ // add the pre-extracted value to the Ops vector.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == VT &&
+ N0.getConstantOperandVal(1) == 0)
+ Ops.push_back(N0.getOperand(0));
+
+ // We only decode broadcasts of same-sized vectors, unless the broadcast
+ // came from an extract from the original width. If we found one, we
+ // pushed it onto the Ops vector above.
+ if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(VT, Mask);
IsUnary = true;
break;
@@ -5669,6 +5857,19 @@ static bool setTargetShuffleZeroElements(SDValue N,
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
+ assert((VT.getSizeInBits() % Mask.size()) == 0 &&
+ "Illegal split of shuffle value type");
+ unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
+
+ // Extract known constant input data.
+ APInt UndefSrcElts[2];
+ SmallVector<APInt, 32> SrcEltBits[2];
+ bool IsSrcConstant[2] = {
+ getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
+ SrcEltBits[0], true, false),
+ getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
+ SrcEltBits[1], true, false)};
+
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
@@ -5677,6 +5878,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
// Determine shuffle input and normalize the mask.
+ unsigned SrcIdx = M / Size;
SDValue V = M < Size ? V1 : V2;
M %= Size;
@@ -5686,39 +5888,27 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
}
- // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
- if (V.getOpcode() != ISD::BUILD_VECTOR)
- continue;
-
- // If the BUILD_VECTOR has fewer elements then the (larger) source
- // element must be UNDEF/ZERO.
- // TODO: Is it worth testing the individual bits of a constant?
- if ((Size % V.getNumOperands()) == 0) {
- int Scale = Size / V->getNumOperands();
- SDValue Op = V.getOperand(M / Scale);
- if (Op.isUndef())
+ // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
+ // TODO: We currently only set UNDEF for integer types - floats use the same
+ // registers as vectors and many of the scalar folded loads rely on the
+ // SCALAR_TO_VECTOR pattern.
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Size % V.getValueType().getVectorNumElements()) == 0) {
+ int Scale = Size / V.getValueType().getVectorNumElements();
+ int Idx = M / Scale;
+ if (Idx != 0 && !VT.isFloatingPoint())
Mask[i] = SM_SentinelUndef;
- else if (X86::isZeroNode(Op))
+ else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
Mask[i] = SM_SentinelZero;
continue;
}
- // If the BUILD_VECTOR has more elements then all the (smaller) source
- // elements must be all UNDEF or all ZERO.
- if ((V.getNumOperands() % Size) == 0) {
- int Scale = V->getNumOperands() / Size;
- bool AllUndef = true;
- bool AllZero = true;
- for (int j = 0; j < Scale; ++j) {
- SDValue Op = V.getOperand((M * Scale) + j);
- AllUndef &= Op.isUndef();
- AllZero &= X86::isZeroNode(Op);
- }
- if (AllUndef)
+ // Attempt to extract from the source's constant bits.
+ if (IsSrcConstant[SrcIdx]) {
+ if (UndefSrcElts[SrcIdx][M])
Mask[i] = SM_SentinelUndef;
- else if (AllZero)
+ else if (SrcEltBits[SrcIdx][M] == 0)
Mask[i] = SM_SentinelZero;
- continue;
}
}
@@ -5731,7 +5921,8 @@ static bool setTargetShuffleZeroElements(SDValue N,
// The decoded shuffle mask may contain a different number of elements than the
// destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
- SmallVectorImpl<SDValue> &Ops) {
+ SmallVectorImpl<SDValue> &Ops,
+ SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
@@ -5744,11 +5935,16 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned Opcode = N.getOpcode();
switch (Opcode) {
- case ISD::AND: {
+ case ISD::AND:
+ case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
- SmallBitVector UndefElts;
+ APInt UndefElts;
SmallVector<APInt, 32> EltBits;
- if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ bool IsAndN = (X86ISD::ANDNP == Opcode);
+ uint64_t ZeroMask = IsAndN ? 255 : 0;
+ if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
@@ -5758,9 +5954,93 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
uint64_t ByteBits = EltBits[i].getZExtValue();
if (ByteBits != 0 && ByteBits != 255)
return false;
- Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
+ Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
+ }
+ Ops.push_back(IsAndN ? N1 : N0);
+ return true;
+ }
+ case ISD::SCALAR_TO_VECTOR: {
+ // Match against a scalar_to_vector of an extract from a vector,
+ // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
+ SDValue N0 = N.getOperand(0);
+ SDValue SrcExtract;
+
+ if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ N0.getOperand(0).getValueType() == VT) {
+ SrcExtract = N0;
+ } else if (N0.getOpcode() == ISD::AssertZext &&
+ N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
+ cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
+ SrcExtract = N0.getOperand(0);
+ assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
+ } else if (N0.getOpcode() == ISD::AssertZext &&
+ N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
+ cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
+ SrcExtract = N0.getOperand(0);
+ assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
+ }
+
+ if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
+ return false;
+
+ SDValue SrcVec = SrcExtract.getOperand(0);
+ EVT SrcVT = SrcVec.getValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
+
+ unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
+ if (NumSrcElts <= SrcIdx)
+ return false;
+
+ Ops.push_back(SrcVec);
+ Mask.push_back(SrcIdx);
+ Mask.append(NumZeros, SM_SentinelZero);
+ Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
+ return true;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue InVec = N.getOperand(0);
+ SDValue InScl = N.getOperand(1);
+ uint64_t InIdx = N.getConstantOperandVal(2);
+ assert(InIdx < NumElts && "Illegal insertion index");
+
+ // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
+ if (X86::isZeroNode(InScl)) {
+ Ops.push_back(InVec);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
+ return true;
}
+
+ // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
+ // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
+ unsigned ExOp =
+ (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
+ if (InScl.getOpcode() != ISD::AssertZext ||
+ InScl.getOperand(0).getOpcode() != ExOp)
+ return false;
+
+ SDValue ExVec = InScl.getOperand(0).getOperand(0);
+ uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
+ assert(ExIdx < NumElts && "Illegal extraction index");
+ Ops.push_back(InVec);
+ Ops.push_back(ExVec);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
+ return true;
+ }
+ case X86ISD::PACKSS: {
+ // If we know input saturation won't happen, we can treat this
+ // as a truncation shuffle.
+ if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
+ DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
+ return false;
+
Ops.push_back(N.getOperand(0));
+ Ops.push_back(N.getOperand(1));
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i * 2);
return true;
}
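// [Editor's note] Not part of the patch: when every i16 lane of both inputs
// has at least 9 sign bits, PACKSSWB cannot saturate, so packing
// concat(Op0, Op1) viewed as 32 bytes is exactly the truncating shuffle
// <0, 2, 4, ..., 30>, i.e. the low byte of each 16-bit element.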
case X86ISD::VSHLI:
@@ -5795,6 +6075,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
}
return true;
}
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VZEXT: {
// TODO - add support for VPMOVZX with smaller input vector types.
SDValue Src = N.getOperand(0);
@@ -5810,36 +6091,39 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
}
+/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
+static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask) {
+ int MaskWidth = Mask.size();
+ SmallVector<SDValue, 16> UsedInputs;
+ for (int i = 0, e = Inputs.size(); i < e; ++i) {
+ int lo = UsedInputs.size() * MaskWidth;
+ int hi = lo + MaskWidth;
+ if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ UsedInputs.push_back(Inputs[i]);
+ continue;
+ }
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
+ }
+ Inputs = UsedInputs;
+}
+
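// [Editor's note] Worked example, not part of the patch: with Inputs = {A, B}
// and Mask = <4, 5, 6, 7> over width-4 inputs, A is never referenced, so it
// is dropped and the mask is rebased to <0, 1, 2, 3> over Inputs = {B}.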
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
-/// Op0/Op1 inputs accordingly.
+/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
-static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
- SmallVectorImpl<int> &Mask) {
- SmallVector<SDValue, 2> Ops;
- if (!setTargetShuffleZeroElements(Op, Mask, Ops))
- if (!getFauxShuffleMask(Op, Mask, Ops))
+static bool resolveTargetShuffleInputs(SDValue Op,
+ SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ SelectionDAG &DAG) {
+ if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
+ if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
return false;
- int NumElts = Mask.size();
- bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
- return 0 <= Idx && Idx < NumElts;
- });
- bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
-
- Op0 = Op0InUse ? Ops[0] : SDValue();
- Op1 = Op1InUse ? Ops[1] : SDValue();
-
- // We're only using Op1 - commute the mask and inputs.
- if (!Op0InUse && Op1InUse) {
- for (int &M : Mask)
- if (NumElts <= M)
- M -= NumElts;
- Op0 = Op1;
- Op1 = SDValue();
- }
-
+ resolveTargetShuffleInputsAndMask(Inputs, Mask);
return true;
}
@@ -5914,11 +6198,10 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
- unsigned NumNonZero, unsigned NumZero,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (NumNonZero > 8)
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (NumNonZero > 8 && !Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
@@ -5928,18 +6211,26 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41()) {
for (unsigned i = 0; i < 16; ++i) {
- bool isNonZero = (NonZeros & (1 << i)) != 0;
- if (isNonZero) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (IsNonZero) {
+ // If the build vector contains zeros or our first insertion is not the
+ // first index, then insert into a zero vector to break any register
+ // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
- if (NumZero)
- V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
- else
- V = DAG.getUNDEF(MVT::v16i8);
First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v16i8, V);
+ continue;
+ }
}
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
- MVT::v16i8, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i, dl));
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
@@ -5958,24 +6249,35 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
}
if ((i & 1) != 0) {
+ // FIXME: Investigate extending to i32 instead of just i16.
+ // FIXME: Investigate combining the first 4 bytes as an i32 instead.
SDValue ThisElt, LastElt;
- bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
if (LastIsNonZero) {
- LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
- MVT::i16, Op.getOperand(i-1));
+ LastElt =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
}
if (ThisIsNonZero) {
ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
- ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
- ThisElt, DAG.getConstant(8, dl, MVT::i8));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
+ DAG.getConstant(8, dl, MVT::i8));
if (LastIsNonZero)
ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
} else
ThisElt = LastElt;
- if (ThisElt.getNode())
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i/2, dl));
+ if (ThisElt) {
+ if (1 == i) {
+ V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
+ : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ } else {
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i / 2, dl));
+ }
+ }
}
}
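// [Editor's note] Worked example, not part of the patch: in the non-SSE4.1
// path above, bytes b0 and b1 combine into the i16 value
//   (zext(b1) << 8) | zext(b0)
// so b0 lands in the low byte (matching the little-endian lane order of
// v16i8 viewed as v8i16), and the pair is inserted at i16 index i / 2.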
@@ -5986,27 +6288,34 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (NumNonZero > 4)
+ const X86Subtarget &Subtarget) {
+ if (NumNonZero > 4 && !Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
SDValue V;
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
- bool isNonZero = (NonZeros & (1 << i)) != 0;
- if (isNonZero) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (IsNonZero) {
+ // If the build vector contains zeros or our first insertion is not the
+ // first index, then insert into a zero vector to break any register
+ // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
- if (NumZero)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
- else
- V = DAG.getUNDEF(MVT::v8i16);
First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
+ }
}
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
- MVT::v8i16, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i, dl));
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
@@ -6015,8 +6324,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
+ const X86Subtarget &Subtarget) {
// Find all zeroable elements.
std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
@@ -6064,7 +6372,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
Elt = Op->getOperand(EltIdx);
// By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
- EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+ EltMaskIdx = Elt.getConstantOperandVal(1);
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
break;
Mask[EltIdx] = EltIdx;
@@ -6095,8 +6403,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
- CanFold = SrcVector == V1 &&
- cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
+ CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
}
if (!CanFold)
@@ -6212,7 +6519,8 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
- SDLoc &DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
@@ -6288,16 +6596,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
-
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain =
- DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
- }
-
+ DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
return NewLd;
};
@@ -6317,6 +6616,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
+ // Don't create 256-bit non-temporal aligned loads without AVX2 as these
+ // will lower to regular temporal loads and use the cache.
+ if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
+ VT.is256BitVector() && !Subtarget.hasInt256())
+ return SDValue();
+
if (IsConsecutiveLoad)
return CreateLoad(VT, LDBase);
@@ -6356,19 +6661,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
false/*WriteMem*/);
-
- // Make sure the newly-created LOAD is in the same position as LDBase in
- // terms of dependency. We create a TokenFactor for LDBase and ResNode,
- // and update uses of LDBase's output chain to use the TokenFactor.
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain =
- DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
- SDValue(ResNode.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(ResNode.getNode(), 1));
- }
-
+ DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
@@ -6376,22 +6669,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
return SDValue();
}
-static Constant *getConstantVector(MVT VT, APInt SplatValue,
+static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
unsigned NumElm = SplatBitSize / ScalarSize;
SmallVector<Constant *, 32> ConstantVec;
for (unsigned i = 0; i < NumElm; i++) {
- APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+ APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
- assert((ScalarSize == 32 || ScalarSize == 64) &&
- "Unsupported floating point scalar size");
- if (ScalarSize == 32)
- Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
- else
- Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+ if (ScalarSize == 32) {
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
+ } else {
+ assert(ScalarSize == 64 && "Unsupported floating point scalar size");
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
+ }
} else
Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
ConstantVec.push_back(Const);
@@ -6409,18 +6702,16 @@ static bool isUseOfShuffle(SDNode *N) {
return false;
}
-/// Attempt to use the vbroadcast instruction to generate a splat value for the
-/// following cases:
-/// 1. A splat BUILD_VECTOR which uses:
-/// a. A single scalar load, or a constant.
-/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
-/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
-/// a scalar load, or a constant.
+/// Attempt to use the vbroadcast instruction to generate a splat value
+/// from a splat BUILD_VECTOR which uses:
+/// a. A single scalar load, or a constant.
+/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
-static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
@@ -6479,11 +6770,13 @@ static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget
// AVX has support for 32 and 64 bit broadcast for floats only.
// No 64-bit integer in a 32-bit subtarget.
MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
- Constant *C = SplatBitSize == 32
- ? ConstantFP::get(Type::getFloatTy(*Ctx),
- SplatValue.bitsToFloat())
- : ConstantFP::get(Type::getDoubleTy(*Ctx),
- SplatValue.bitsToDouble());
+ // Lower the splat via APFloat directly, to avoid any conversion.
+ Constant *C =
+ SplatBitSize == 32
+ ? ConstantFP::get(*Ctx,
+ APFloat(APFloat::IEEEsingle(), SplatValue))
+ : ConstantFP::get(*Ctx,
+ APFloat(APFloat::IEEEdouble(), SplatValue));
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
@@ -6664,6 +6957,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
// Quit if non-constant index.
if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
@@ -6694,11 +6988,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
- for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
- unsigned Idx = InsertIndices[i];
+
+ for (unsigned Idx : InsertIndices)
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
DAG.getIntPtrConstant(Idx, DL));
- }
return NV;
}
@@ -6711,7 +7004,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (!In.isUndef())
- Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
}
SDLoc dl(Op);
MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
@@ -6754,7 +7047,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
else {
- Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
HasConstElts = true;
}
if (SplatIdx < 0)
@@ -6765,9 +7058,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
if (IsSplat)
- return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
- DAG.getConstant(1, dl, VT),
- DAG.getConstant(0, dl, VT));
+ return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
+ DAG.getConstant(1, dl, VT),
+ DAG.getConstant(0, dl, VT));
// insert elements one by one
SDValue DstVec;
@@ -7347,7 +7640,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
(VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
- return getOnesVector(VT, Subtarget, DAG, DL);
+ return getOnesVector(VT, DAG, DL);
}
return SDValue();
@@ -7373,7 +7666,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
- if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
+ if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
@@ -7418,7 +7711,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// a constant pool load than it is to do a movd + shuffle.
if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
(!IsAllConstants || Idx == 0)) {
- if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
MVT VecVT = MVT::v4i32;
@@ -7523,7 +7816,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
- if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
+ if (SDValue LD =
+ EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
@@ -7561,17 +7855,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
- DAG, Subtarget, *this))
+ DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
- DAG, Subtarget, *this))
+ DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
- if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
@@ -7647,24 +7941,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
// Next, we iteratively mix elements, e.g. for v4f32:
- // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
- // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
- // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
- unsigned EltStride = NumElems >> 1;
- while (EltStride != 0) {
- for (unsigned i = 0; i < EltStride; ++i) {
- // If Ops[i+EltStride] is undef and this is the first round of mixing,
- // then it is safe to just drop this shuffle: V[i] is already in the
- // right place, the one element (since it's the first round) being
- // inserted as undef can be dropped. This isn't safe for successive
- // rounds because they will permute elements within both vectors.
- if (Ops[i+EltStride].isUndef() &&
- EltStride == NumElems/2)
- continue;
-
- Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
- }
- EltStride >>= 1;
+ // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
+ // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
+ // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
+ for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
+ // Generate scaled UNPCKL shuffle mask.
+ SmallVector<int, 16> Mask;
+ for(unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(i);
+ for (unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(NumElems+i);
+ Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
+
+ for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
+ Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
return Ops[0];
}
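The rewritten loop replaces the stride-based unpcklps chain with explicit scaled unpack masks. A standalone simulation of the index bookkeeping for the v4f32 case - plain C++ over element labels, with -1 standing in for SM_SentinelUndef; this models the mask arithmetic, not the DAG code itself:

    #include <cstdio>
    #include <vector>

    // A shuffle of (A, B) with mask M picks M[j] < N from A, M[j] >= N from
    // B, and -1 marks a don't-care lane.
    int main() {
      const int NumElems = 4;
      // Ops[i] starts as a vector whose lane 0 holds element i
      // (scalar_to_vector), all other lanes undef.
      std::vector<std::vector<int>> Ops(NumElems);
      for (int i = 0; i != NumElems; ++i)
        Ops[i] = {i, -1, -1, -1};

      for (int Scale = 1; Scale < NumElems; Scale *= 2) {
        std::vector<int> Mask;
        for (int i = 0; i != Scale; ++i) Mask.push_back(i);            // lo of A
        for (int i = 0; i != Scale; ++i) Mask.push_back(NumElems + i); // lo of B
        Mask.resize(NumElems, -1);                                     // undef tail

        for (int i = 0, e = NumElems / (2 * Scale); i != e; ++i) {
          std::vector<int> R(NumElems, -1);
          for (int j = 0; j != NumElems; ++j)
            if (Mask[j] >= 0)
              R[j] = Mask[j] < NumElems ? Ops[2 * i][Mask[j]]
                                        : Ops[2 * i + 1][Mask[j] - NumElems];
          Ops[i] = R;
        }
      }

      // Prints "0 1 2 3": step 1 forms the <1,0> and <3,2> pairs, step 2
      // interleaves the pairs, matching the updated comment above.
      for (int v : Ops[0]) printf("%d ", v);
      printf("\n");
    }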
@@ -7699,6 +7989,60 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
+// Return true if all the operands of the given CONCAT_VECTORS node are zeros
+// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
+static bool isExpandWithZeros(const SDValue &Op) {
+ assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
+ "Expand with zeros only possible in CONCAT_VECTORS nodes!");
+
+ for (unsigned i = 1; i < Op.getNumOperands(); i++)
+ if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
+ return false;
+
+ return true;
+}
+
+// Returns the original node if the given node is a type promotion (by
+// concatenating i1 zeros) of the result of a node that already zeros all
+// upper bits of a k-register, and SDValue() otherwise.
+static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
+ unsigned Opc = Op.getOpcode();
+
+ assert(Opc == ISD::CONCAT_VECTORS &&
+ Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected node to check for type promotion!");
+
+  // As long as we are concatenating zeros to the upper part of a previous
+  // node's result, climb up the tree until a node with a different opcode is
+  // encountered.
+ while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
+ if (Opc == ISD::INSERT_SUBVECTOR) {
+ if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
+ Op.getConstantOperandVal(2) == 0)
+ Op = Op.getOperand(1);
+ else
+ return SDValue();
+ } else { // Opc == ISD::CONCAT_VECTORS
+ if (isExpandWithZeros(Op))
+ Op = Op.getOperand(0);
+ else
+ return SDValue();
+ }
+ Opc = Op.getOpcode();
+ }
+
+ // Check if the first inserted node zeroes the upper bits, or an 'and' result
+ // of a node that zeros the upper bits (its masked version).
+ if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
+ (Op.getOpcode() == ISD::AND &&
+ (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
+ isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
+ return Op;
+ }
+
+ return SDValue();
+}
+
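The motivation: AVX-512 mask-producing instructions already clear the unused upper bits of the destination k-register, so padding a vXi1 value with i1 zeros should cost nothing. A hedged illustration in standard Intel intrinsics (assumes AVX512F plus AVX512VL; nothing here is from the patch itself):

    #include <immintrin.h>

    // A 128-bit compare writes a 4-bit mask whose upper k-register bits are
    // already zero, so widening it to a 16-bit mask is free and should not
    // emit a zeroing instruction. The hunk below teaches the DAG to
    // recognize this pattern and skip the redundant zeroing.
    __mmask16 widen_mask(__m128i a, __m128i b) {
      __mmask8 m4 = _mm_cmpeq_epi32_mask(a, b); // only bits [3:0] can be set
      return (__mmask16)m4;                     // zero-extension is a no-op
    }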
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
@@ -7709,6 +8053,17 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(isPowerOf2_32(NumOfOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
+  // If this node promotes - by concatenating zeroes - the type of the result
+  // of a node whose instruction zeroes all upper (irrelevant) bits of the
+  // output register, mark it as legal and catch the pattern in instruction
+  // selection to avoid emitting extra instructions (for zeroing upper bits).
+ if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
+ SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
+ SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
+ ZeroC);
+ }
+
SDValue Undef = DAG.getUNDEF(ResVT);
if (NumOfOperands > 2) {
// Specialize the cases when all, or all but one, of the operands are undef.
@@ -7767,7 +8122,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
if (V1.isUndef())
- V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
if (IsZeroV1)
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
@@ -7849,7 +8204,7 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
- int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
@@ -7956,7 +8311,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
-}
+ }
return true;
}
@@ -7986,6 +8341,41 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
return true;
}
+// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
+// mask.
+static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
+ const APInt &Zeroable) {
+ int NumElts = Mask.size();
+ assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
+
+ SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
+ TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
+ }
+ return TargetMask;
+}
+
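A plain-integer sketch of what createTargetShuffleMask computes; the -1/-2 sentinels mirror SM_SentinelUndef/SM_SentinelZero, and the names are illustrative:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Fold a zeroable bitmask into the shuffle mask so zeroed lanes become
    // an explicit sentinel, letting later matchers treat "comes from a zero"
    // like any other mask value.
    enum { SentinelUndef = -1, SentinelZero = -2 };

    std::vector<int> mergeZeroable(const std::vector<int> &Mask,
                                   uint64_t Zeroable) {
      std::vector<int> TargetMask(Mask.size(), SentinelUndef);
      for (size_t i = 0; i != Mask.size(); ++i) {
        if (Mask[i] == SentinelUndef)
          continue;
        TargetMask[i] = (Zeroable >> i) & 1 ? SentinelZero : Mask[i];
      }
      return TargetMask;
    }

    int main() {
      // Lane 2 is known zeroable: {0, 5, 2, -1} becomes {0, 5, Zero, Undef}.
      std::vector<int> TM = mergeZeroable({0, 5, 2, -1}, 0b0100);
      assert(TM[0] == 0 && TM[1] == 5 && TM[2] == SentinelZero &&
             TM[3] == SentinelUndef);
    }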
+// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
+// instructions.
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return false;
+
+ SmallVector<int, 8> Unpcklwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
+ /* Unary = */ false);
+ SmallVector<int, 8> Unpckhwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
+ /* Unary = */ false);
+ bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
+ isTargetShuffleEquivalent(Mask, Unpckhwd));
+ return IsUnpackwdMask;
+}
+
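For reference, a standalone sketch of the interleaving masks this helper compares against. It covers only the single-128-bit-lane, two-source case that applies to v8i16; the real createUnpackShuffleMask also handles wider per-lane and unary forms:

    #include <cstdio>
    #include <vector>

    // Element i of source A is index i, of source B is NumElts + i; UNPCKL
    // interleaves the low halves, UNPCKH the high halves.
    std::vector<int> unpackMask(int NumElts, bool Lo) {
      std::vector<int> Mask;
      int Base = Lo ? 0 : NumElts / 2;
      for (int i = 0; i != NumElts / 2; ++i) {
        Mask.push_back(Base + i);           // from A
        Mask.push_back(NumElts + Base + i); // from B
      }
      return Mask;
    }

    int main() {
      // Prints "0 8 1 9 2 10 3 11" and "4 12 5 13 6 14 7 15" - the two
      // word-unpack patterns a v8i32/v8f32 mask is tested against above.
      for (bool Lo : {true, false}) {
        for (int M : unpackMask(8, Lo)) printf("%d ", M);
        printf("\n");
      }
    }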
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -8009,7 +8399,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
return Imm;
}
-static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
@@ -8022,9 +8412,9 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
-static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
- SDValue V1, SDValue V2) {
- SmallBitVector Zeroable(Mask.size(), false);
+static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2) {
+ APInt Zeroable(Mask.size(), 0);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
@@ -8039,7 +8429,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
int M = Mask[i];
// Handle the easy cases.
if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
- Zeroable[i] = true;
+ Zeroable.setBit(i);
continue;
}
@@ -8057,17 +8447,19 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
int Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef() || X86::isZeroNode(Op))
- Zeroable[i] = true;
+ Zeroable.setBit(i);
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = Cst->getAPIntValue();
- Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
- Zeroable[i] = (Val == 0);
+ if (Val == 0)
+ Zeroable.setBit(i);
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt Val = Cst->getValueAPF().bitcastToAPInt();
- Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
- Zeroable[i] = (Val == 0);
+ if (Val == 0)
+ Zeroable.setBit(i);
}
continue;
}
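The constant-element case in this hunk slices a wider build-vector constant into shuffle-lane-sized pieces and tests each slice for zero. The same arithmetic with plain 64-bit integers; the APInt version uses lshrInPlace and getLoBits so it is not limited to 64-bit constants:

    #include <cassert>
    #include <cstdint>

    // A shuffle lane of width EltBits that reads slice (M % Scale) of a
    // wider constant is zeroable iff that slice of the constant is zero.
    bool sliceIsZero(uint64_t WideConst, unsigned EltBits, unsigned SliceIdx) {
      uint64_t Slice = WideConst >> (SliceIdx * EltBits);
      if (EltBits < 64)
        Slice &= (1ULL << EltBits) - 1;
      return Slice == 0;
    }

    int main() {
      // A v2i64 constant element 0x00000001'00000000 viewed as two i32
      // lanes: lane 0 (low 32 bits) is zero, lane 1 is not.
      assert(sliceIsZero(0x0000000100000000ULL, 32, 0));
      assert(!sliceIsZero(0x0000000100000000ULL, 32, 1));
    }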
@@ -8081,7 +8473,8 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue Op = V.getOperand((M * Scale) + j);
AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
}
- Zeroable[i] = AllZeroable;
+ if (AllZeroable)
+ Zeroable.setBit(i);
continue;
}
}
@@ -8096,19 +8489,20 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
//
// The function looks for a sub-mask in which the nonzero elements are in
// increasing order. If such a sub-mask exists, the function returns true.
-static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
- ArrayRef<int> Mask,const EVT &VectorType,
+static bool isNonZeroElementsInOrder(const APInt &Zeroable,
+ ArrayRef<int> Mask, const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
- for (int i = 0, e = Zeroable.size(); i < e; i++) {
+ for (int i = 0, e = Mask.size(); i < e; i++) {
// Checks that the mask's zero elements are built from only zeros.
- if (Mask[i] == -1)
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] < 0)
return false;
if (Zeroable[i])
continue;
// Find the lowest non zero element
- if (NextElement == -1) {
+ if (NextElement < 0) {
NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
IsZeroSideLeft = NextElement != 0;
}
@@ -8124,7 +8518,7 @@ static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -8179,19 +8573,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
-// Function convertBitVectorToUnsigned - The function gets SmallBitVector
-// as argument and convert him to unsigned.
-// The output of the function is not(zeroable)
-static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
- unsigned convertBit = 0;
- for (int i = 0, e = Zeroable.size(); i < e; i++)
- convertBit |= !(Zeroable[i]) << i;
- return convertBit;
-}
-
// X86 has a dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
ArrayRef<int> Mask, SDValue &V1,
SDValue &V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8199,7 +8583,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
return SDValue();
- unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+ unsigned VEXPANDMask = (~Zeroable).getZExtValue();
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
@@ -8210,9 +8594,94 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
- return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
- DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
- ZeroVector);
+ return DAG.getSelect(DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
+}
+
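The deleted convertBitVectorToUnsiged helper is now a one-liner because APInt supports complement and value extraction directly. Modeled with plain integers:

    #include <cassert>
    #include <cstdint>

    int main() {
      // The expand-load mask is simply "not(zeroable)" truncated to the
      // vector width: lanes that are not known zero receive source elements.
      const unsigned NumElts = 8;
      uint64_t Zeroable = 0b10100101; // lanes that end up zero
      uint64_t VEXPANDMask = ~Zeroable & ((1ULL << NumElts) - 1);
      assert(VEXPANDMask == 0b01011010);
    }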
+static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &UnpackOpcode, bool IsUnary,
+ ArrayRef<int> TargetMask, SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+
+ bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
+ for (int i = 0; i != NumElts; i += 2) {
+ int M1 = TargetMask[i + 0];
+ int M2 = TargetMask[i + 1];
+ Undef1 &= (SM_SentinelUndef == M1);
+ Undef2 &= (SM_SentinelUndef == M2);
+ Zero1 &= isUndefOrZero(M1);
+ Zero2 &= isUndefOrZero(M2);
+ }
+ assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
+ "Zeroable shuffle detected");
+
+ // Attempt to match the target mask against the unpack lo/hi mask patterns.
+ SmallVector<int, 64> Unpckl, Unpckh;
+ createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
+ if (IsUnary && (Zero1 || Zero2)) {
+ // Don't bother if we can blend instead.
+ if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
+ isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
+ return false;
+
+ bool MatchLo = true, MatchHi = true;
+ for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
+ int M = TargetMask[i];
+
+ // Ignore if the input is known to be zero or the index is undef.
+ if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
+ (M == SM_SentinelUndef))
+ continue;
+
+ MatchLo &= (M == Unpckl[i]);
+ MatchHi &= (M == Unpckh[i]);
+ }
+
+ if (MatchLo || MatchHi) {
+ UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ return true;
+ }
+ }
+
+ // If a binary shuffle, commute and try again.
+ if (!IsUnary) {
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ std::swap(V1, V2);
+ return true;
+ }
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ std::swap(V1, V2);
+ return true;
+ }
+ }
+
+ return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
@@ -8248,13 +8717,12 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() && "Floating point types are not supported");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -8286,10 +8754,8 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
- int NumEltBits = EltVT.getSizeInBits();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
- EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
@@ -8307,51 +8773,81 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
-/// \brief Try to emit a blend instruction for a shuffle.
-///
-/// This doesn't do any checks for the availability of instructions for blending
-/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
-/// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Original,
- const SmallBitVector &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
- SmallVector<int, 8> Mask(Original.begin(), Original.end());
- bool ForceV1Zero = false, ForceV2Zero = false;
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
+static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
+ MutableArrayRef<int> TargetMask,
+ bool &ForceV1Zero, bool &ForceV2Zero,
+ uint64_t &BlendMask) {
+ bool V1IsZeroOrUndef =
+ V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZeroOrUndef =
+ V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
+
+ BlendMask = 0;
+ ForceV1Zero = false, ForceV2Zero = false;
+ assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
// TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
- unsigned BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- int M = Mask[i];
- if (M < 0)
+ for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
+ int M = TargetMask[i];
+ if (M == SM_SentinelUndef)
continue;
if (M == i)
continue;
if (M == i + Size) {
- BlendMask |= 1u << i;
+ BlendMask |= 1ull << i;
continue;
}
- if (Zeroable[i]) {
- if (V1IsZero) {
+ if (M == SM_SentinelZero) {
+ if (V1IsZeroOrUndef) {
ForceV1Zero = true;
- Mask[i] = i;
+ TargetMask[i] = i;
continue;
}
- if (V2IsZero) {
+ if (V2IsZeroOrUndef) {
ForceV2Zero = true;
- BlendMask |= 1u << i;
- Mask[i] = i + Size;
+ BlendMask |= 1ull << i;
+ TargetMask[i] = i + Size;
continue;
}
}
- return SDValue(); // Shuffled input!
+ return false;
}
+ return true;
+}
+
+uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
+ uint64_t ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1ull << i))
+ ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
+ return ScaledMask;
+}
+
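The lambda from lowerVectorShuffleAsBlend is hoisted into this named helper, and widened from unsigned to uint64_t so masks for 32- and 64-element vectors do not overflow (note the matching 1u-to-1ull changes elsewhere in this patch). A standalone copy with a usage check:

    #include <cassert>
    #include <cstdint>

    // Widen a per-element blend mask when each element is re-interpreted as
    // Scale smaller elements (e.g. one v4i64 lane becomes two v8i32 lanes
    // for VPBLENDD).
    uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
      uint64_t ScaledMask = 0;
      for (int i = 0; i != Size; ++i)
        if (BlendMask & (1ull << i))
          ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
      return ScaledMask;
    }

    int main() {
      // v4i64 blend 0b0101 (take elements 0 and 2 from V2) scaled to v8i32
      // dwords: each set bit becomes a pair, giving 0b00110011.
      assert(scaleBlendMask(0b0101, 4, 2) == 0b00110011);
    }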
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
+
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
+ BlendMask))
+ return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
@@ -8359,15 +8855,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
- auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
- unsigned ScaledMask = 0;
- for (int i = 0; i != Size; ++i)
- if (BlendMask & (1u << i))
- for (int j = 0; j != Scale; ++j)
- ScaledMask |= 1u << (i * Scale + j);
- return ScaledMask;
- };
-
switch (VT.SimpleTy) {
case MVT::v2f64:
case MVT::v4f32:
@@ -8387,7 +8874,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (Subtarget.hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
@@ -8400,7 +8887,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
int Scale = 8 / VT.getVectorNumElements();
- BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = DAG.getBitcast(MVT::v8i16, V2);
return DAG.getBitcast(VT,
@@ -8417,7 +8904,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
- BlendMask |= 1u << i;
+ BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
}
@@ -8428,6 +8915,13 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
+ if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+ }
+
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked =
lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
@@ -8462,10 +8956,21 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
- DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
+ VT,
+ DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
+ V1, V2));
+ }
+ case MVT::v16f32:
+ case MVT::v8f64:
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8: {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
-
default:
llvm_unreachable("Not a supported integer vector type!");
}
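The two new cases route blends through getVectorMaskingNode, i.e. a k-register select instead of VPBLENDVB. Roughly what that produces for the 256-bit byte case, sketched in intrinsics (assumes AVX512BW plus AVX512VL and the matching compiler flags; the bit convention follows the code above, where a set bit means "take from V2"):

    #include <immintrin.h>

    // Bit i of blendMask set -> take byte i from v2, otherwise from v1.
    // Mirrors getVectorMaskingNode(V2, MaskNode, V1, ...).
    __m256i blendBytes(__m256i v1, __m256i v2, __mmask32 blendMask) {
      return _mm256_mask_blend_epi8(blendMask, v1, v2);
    }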
@@ -8503,7 +9008,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
-/// \brief Generic routine to decompose a shuffle and blend into indepndent
+/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
@@ -8757,7 +9262,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits,
ArrayRef<int> Mask, int MaskOffset,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
@@ -8819,7 +9324,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -8852,132 +9357,145 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getBitcast(VT, V);
}
-/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
-static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
- SelectionDAG &DAG) {
+// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+// Remainder of lower half result is zero and upper half is all undef.
+static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
- assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+ assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefInRange(Mask, HalfSize, HalfSize))
- return SDValue();
+ return false;
- // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
- // Remainder of lower half result is zero and upper half is all undef.
- auto LowerAsEXTRQ = [&]() {
- // Determine the extraction length from the part of the
- // lower half that isn't zeroable.
- int Len = HalfSize;
- for (; Len > 0; --Len)
- if (!Zeroable[Len - 1])
- break;
- assert(Len > 0 && "Zeroable shuffle mask");
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len > 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
- // Attempt to match first Len sequential elements from the lower half.
- SDValue Src;
- int Idx = -1;
- for (int i = 0; i != Len; ++i) {
- int M = Mask[i];
- if (M < 0)
- continue;
- SDValue &V = (M < Size ? V1 : V2);
- M = M % Size;
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
- // The extracted elements must start at a valid index and all mask
- // elements must be in the lower half.
- if (i > M || M >= HalfSize)
- return SDValue();
+ // The extracted elements must start at a valid index and all mask
+ // elements must be in the lower half.
+ if (i > M || M >= HalfSize)
+ return false;
- if (Idx < 0 || (Src == V && Idx == (M - i))) {
- Src = V;
- Idx = M - i;
- continue;
- }
- return SDValue();
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
}
+ return false;
+ }
- if (Idx < 0)
- return SDValue();
+ if (!Src || Idx < 0)
+ return false;
- assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
- int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
- int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
- return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
- };
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ V1 = Src;
+ return true;
+}
+
+// INSERTQ: Extract lowest Len elements from lower half of second source and
+// insert over first source, starting at Idx.
+// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx) {
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
- if (SDValue ExtrQ = LowerAsEXTRQ())
- return ExtrQ;
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return false;
+
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
- // INSERTQ: Extract lowest Len elements from lower half of second source and
- // insert over first source, starting at Idx.
- // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
- auto LowerAsInsertQ = [&]() {
- for (int Idx = 0; Idx != HalfSize; ++Idx) {
- SDValue Base;
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
- // Attempt to match first source from mask before insertion point.
- if (isUndefInRange(Mask, 0, Idx)) {
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+ SDValue Insert;
+ int Len = Hi - Idx;
+
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
+
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
- } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
- } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
Base = V2;
} else {
continue;
}
- // Extend the extraction length looking to match both the insertion of
- // the second source and the remaining elements of the first.
- for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
- SDValue Insert;
- int Len = Hi - Idx;
-
- // Match insertion.
- if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
- Insert = V1;
- } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
- Insert = V2;
- } else {
- continue;
- }
-
- // Match the remaining elements of the lower half.
- if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
- /* EMPTY */
- } else if ((!Base || (Base == V1)) &&
- isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
- Base = V1;
- } else if ((!Base || (Base == V2)) &&
- isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
- Size + Hi)) {
- Base = V2;
- } else {
- continue;
- }
-
- // We may not have a base (first source) - this can safely be undefined.
- if (!Base)
- Base = DAG.getUNDEF(VT);
-
- int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
- int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
- return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
- }
+ BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ V1 = Base;
+ V2 = Insert;
+ return true;
}
+ }
- return SDValue();
- };
+ return false;
+}
- if (SDValue InsertQ = LowerAsInsertQ())
- return InsertQ;
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SelectionDAG &DAG) {
+ uint64_t BitLen, BitIdx;
+ if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+
+ if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
+ V2 ? V2 : DAG.getUNDEF(VT),
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
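The refactored matchers return bit-granular immediates rather than building the nodes themselves. A small check of the BitLen/BitIdx arithmetic, including the deliberate 6-bit wrap (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      // EXTRQ/INSERTQ immediates are expressed in bits: Len elements
      // starting at element Idx become 6-bit BitLen/BitIdx fields. Masking
      // with 0x3f also makes a full 64-bit length wrap to 0, which is how
      // the instruction encodes "all 64 bits".
      const unsigned ScalarSizeInBits = 16; // e.g. v8i16
      unsigned Len = 2, Idx = 1;            // extract elements [1, 3)
      uint64_t BitLen = (Len * ScalarSizeInBits) & 0x3f;
      uint64_t BitIdx = (Idx * ScalarSizeInBits) & 0x3f;
      assert(BitLen == 32 && BitIdx == 16);
      assert(((4 * ScalarSizeInBits) & 0x3f) == 0); // Len == 4 -> 64 -> 0
    }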
@@ -8987,7 +9505,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
-/// begin and can start from an offseted element index in the input; to
+/// begin and can start from an offset element index in the input; to
/// avoid excess shuffling the offset must either being in the bottom lane
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
@@ -9027,21 +9545,14 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget.hasSSE41()) {
- // Not worth offseting 128-bit vectors if scale == 2, a pattern using
+ // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
-
- // For 256-bit vectors, we only need the lower (128-bit) input half.
- // For 512-bit vectors, we only need the lower input half or quarter.
- if (VT.getSizeInBits() > 128)
- InputV = extractSubVector(InputV, 0, DAG, DL,
- std::max(128, (int)VT.getSizeInBits() / Scale));
-
- InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
+ InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -9158,7 +9669,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
@@ -9314,7 +9825,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
@@ -9469,7 +9980,6 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
@@ -9582,17 +10092,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
-
- // Make sure the newly-created LOAD is in the same position as Ld in
- // terms of dependency. We create a TokenFactor for Ld and V,
- // and update uses of Ld's output chain to use the TokenFactor.
- if (Ld->hasAnyUseOfValue(1)) {
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(Ld, 1), SDValue(V.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
- SDValue(V.getNode(), 1));
- }
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
@@ -9612,7 +10112,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
if (((BroadcastIdx * EltSize) % 128) != 0)
return SDValue();
- MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+ // The shuffle input might have been a bitcast we looked through; look at
+ // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
+ // later bitcast it to BroadcastVT.
+ MVT SrcVT = V.getSimpleValueType();
+ assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ "Unexpected vector element size");
+ assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
+ "Unexpected vector size");
+
+ MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
DAG.getIntPtrConstant(BroadcastIdx, DL));
}
@@ -9642,6 +10151,12 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
+ // We only support broadcasting from 128-bit vectors to minimize the
+ // number of patterns we need to deal with in isel. So extract down to
+ // 128-bits.
+ if (SrcVT.getSizeInBits() > 128)
+ V = extract128BitVector(V, 0, DAG, DL);
+
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
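Roughly the shape of code this produces once isel matches the broadcast pattern: narrow to 128 bits, then broadcast. Sketched with AVX2 intrinsics for the integer dword case (an illustrative assumption, not taken from the patch):

    #include <immintrin.h>

    // Mirrors the extract128BitVector call above: the broadcast patterns
    // isel matches take a 128-bit source, so a wider input is narrowed
    // first, then VPBROADCASTD splats lane 0.
    __m256i broadcastLane0(__m256i v) {
      __m128i lo = _mm256_castsi256_si128(v); // take the low 128 bits
      return _mm256_broadcastd_epi32(lo);     // splat dword 0 to all lanes
    }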
@@ -9653,7 +10168,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
@@ -9742,7 +10257,7 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
@@ -9877,7 +10392,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -9959,7 +10474,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10178,7 +10693,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10261,7 +10776,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10353,7 +10868,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
- // up the inputs, bypassing domain shift penalties that we would encur if we
+ // up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
@@ -10384,18 +10899,16 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
- assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
+ assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
SmallVector<int, 4> LoInputs;
- std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
- [](int M) { return M >= 0; });
+ copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
- std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
- [](int M) { return M >= 0; });
+ copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL =
@@ -10530,9 +11043,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
- V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
- MVT::v8i16, V,
- getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+ V = DAG.getNode(
+ FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
+ MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
@@ -10574,7 +11088,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
- else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+ if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
@@ -10830,7 +11344,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
+ const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
bool &V2InUse) {
SDValue V1Mask[16];
SDValue V2Mask[16];
@@ -10891,7 +11405,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11075,7 +11589,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11132,14 +11646,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
- std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
- [](int M) { return M >= 0 && M < 8; });
+ copy_if(Mask, std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
- std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
- [](int M) { return M >= 8; });
+ copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
@@ -11193,7 +11706,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
- "Conflicting entrties in the original shuffle!");
+ "Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
@@ -11365,7 +11878,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
@@ -11621,7 +12134,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<int, 4> WidenedMask;
@@ -11647,18 +12160,22 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
- // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
+ // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0, DL));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
+ // this will likely become vinsertf128 which can't fold a 256-bit memop.
+ if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
}
}
@@ -12091,7 +12608,7 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
unsigned &ShuffleImm,
ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
- assert(VT.getScalarType() == MVT::f64 &&
+ assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
@@ -12127,6 +12644,9 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
+  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
+ "Unexpected data type for VSHUFPD");
+
unsigned Immediate = 0;
if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
@@ -12153,7 +12673,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12250,7 +12770,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12338,7 +12858,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12414,6 +12934,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
+  // For non-AVX512, if the mask is of 16-bit elements within a lane, try to
+  // split, since after the split we get more efficient code using vpunpcklwd
+  // and vpunpckhwd instructions rather than vblend.
+ if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+ if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG))
+ return V;
+
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
@@ -12429,7 +12957,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12445,6 +12973,15 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
+  // For non-AVX512, if the mask is of 16-bit elements within a lane, try to
+  // split, since after the split we get more efficient code than vblend by
+  // using vpunpcklwd and vpunpckhwd instructions.
+ if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
+ !Subtarget.hasAVX512())
+ if (SDValue V =
+ lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return V;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
@@ -12533,7 +13070,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12619,7 +13156,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12692,7 +13229,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
@@ -12844,7 +13381,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12891,12 +13428,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12925,6 +13466,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return Unpck;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
@@ -12938,7 +13483,7 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12994,12 +13539,16 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13062,12 +13611,15 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13109,12 +13661,16 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13159,6 +13715,10 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
@@ -13170,7 +13730,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
@@ -13251,7 +13811,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (ISD::isBuildVectorAllZeros(V1.getNode()))
V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V1.getNode()))
- V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ V1 = getOnesVector(ExtVT, DAG, DL);
else
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
@@ -13260,7 +13820,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
else if (ISD::isBuildVectorAllZeros(V2.getNode()))
V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V2.getNode()))
- V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ V2 = getOnesVector(ExtVT, DAG, DL);
else
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
@@ -13392,8 +13952,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- if (Zeroable.all())
+ APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
// Try to collapse shuffles into using a vector type with fewer elements but
@@ -13474,6 +14034,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
+ // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+ // with patterns on the mask registers on AVX-512.
+ if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+ return Op;
+
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
@@ -13483,10 +14048,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget.hasSSE41())
return SDValue();
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
+ // into an i1 condition so that we can use the mask-based 512-bit blend
+ // instructions.
+ if (VT.getSizeInBits() == 512) {
+ SDValue Cond = Op.getOperand(0);
+ // The vNi1 condition case should be handled above as it can be trivially
+ // lowered.
+ assert(Cond.getValueType().getScalarSizeInBits() ==
+ VT.getScalarSizeInBits() &&
+ "Should have a size-matched integer condition!");
+ // Build a mask by testing the condition against itself (tests for zero).
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+ // Now return a new VSELECT using the mask.
+ return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+ }
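
The TESTM step above is worth a concrete reading: vptestm sets mask bit i exactly when Cond[i] & Cond[i] is non-zero, i.e. when the condition lane is non-zero. A minimal scalar sketch of that per-lane semantics (an illustrative model, not the DAG code):

    #include <cassert>
    #include <cstdint>

    // Per-lane model of X86ISD::TESTM: the mask bit is set iff
    // (CondLane & CondLane) != 0, i.e. iff the lane is non-zero.
    static bool testmLane(uint32_t CondLane) {
      return (CondLane & CondLane) != 0;
    }

    int main() {
      assert(!testmLane(0));           // zero lane -> mask bit is 0
      assert(testmLane(1));            // any non-zero lane -> mask bit is 1
      assert(testmLane(0xFFFFFFFFu));
    }
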
+
// Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
// a null value.
- switch (Op.getSimpleValueType().SimpleTy) {
+ switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
@@ -13564,15 +14149,18 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
SDValue Idx = Op.getOperand(1);
MVT EltVT = Op.getSimpleValueType();
- assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
- // extend vector to VR512
+ // extend vector to VR512/128
if (!isa<ConstantSDNode>(Idx)) {
- MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
- SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
+ // than extending to 128/256-bit.
+ unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
ExtVT.getVectorElementType(), Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
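
As a sanity check on the extend/extract/truncate sequence just built, here is a scalar model (assuming lanes sign-extend to 0 or -1, as ISD::SIGN_EXTEND of an i1 lane does): the mask bit survives the round trip.

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int Bit = 0; Bit <= 1; ++Bit) {
        int64_t Ext = Bit ? -1 : 0; // SIGN_EXTEND of an i1 lane
        bool Trunc = Ext & 1;       // TRUNCATE back to i1
        assert(Trunc == (Bit != 0));
      }
    }
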
@@ -13590,12 +14178,12 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
}
unsigned MaxSift = VecVT.getVectorNumElements() - 1;
if (MaxSift - IdxVal)
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
- return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
- DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
+ DAG.getIntPtrConstant(0, dl));
}
SDValue
@@ -13606,28 +14194,40 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
- if (Op.getSimpleValueType() == MVT::i1)
+ if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG);
if (!isa<ConstantSDNode>(Idx)) {
- if (VecVT.is512BitVector() ||
- (VecVT.is256BitVector() && Subtarget.hasInt256() &&
- VecVT.getScalarSizeInBits() == 32)) {
-
- MVT MaskEltVT =
- MVT::getIntegerVT(VecVT.getScalarSizeInBits());
- MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
- MaskEltVT.getSizeInBits());
+ // It's more profitable to go through memory (1 cycle throughput)
+ // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
+ // The IACA tool was used to get the performance estimate
+ // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
+ //
+ // example : extractelement <16 x i8> %a, i32 %i
+ //
+ // Block Throughput: 3.00 Cycles
+ // Throughput Bottleneck: Port5
+ //
+ // | Num Of | Ports pressure in cycles | |
+ // | Uops | 0 - DV | 5 | 6 | 7 | |
+ // ---------------------------------------------
+ // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
+ // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
+ // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
+ // Total Num Of Uops: 4
+ //
+ //
+ // Block Throughput: 1.00 Cycles
+ // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
+ //
+ // | | Ports pressure in cycles | |
+ // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
+ // ---------------------------------------------------------
+ // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+ // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
+ // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
+ // Total Num Of Uops: 4
- Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
- getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
- DAG.getConstant(0, dl, PtrVT));
- SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
- DAG.getConstant(0, dl, PtrVT));
- }
return SDValue();
}
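
The "go through memory" strategy measured in the IACA comment above can be pictured with a scalar sketch: spill the vector to a stack slot and do one indexed byte load, instead of materializing the index in a register and shuffling. This is only an illustration of the idea; alignment and the actual frame layout are handled by the backend.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint8_t extractVariable(const uint8_t (&Vec)[16], unsigned Idx) {
      uint8_t Spill[16];
      std::memcpy(Spill, Vec, sizeof(Spill)); // vmovaps [rsp-0x18], xmm0
      return Spill[Idx & 15];                 // mov al, byte ptr [rsp+idx]
    }

    int main() {
      uint8_t V[16];
      for (unsigned i = 0; i != 16; ++i) V[i] = (uint8_t)(i * 7);
      for (unsigned i = 0; i != 16; ++i)
        assert(extractVariable(V, i) == (uint8_t)(i * 7));
    }
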
@@ -13675,7 +14275,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
- // TODO: handle v16i8.
+ // TODO: We only extract a single element from v16i8; we can probably afford
+ // to be more aggressive here before using the default approach of spilling to
+ // the stack.
+ if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
+ // Extract either the lowest i32 or any i16, and extract the sub-byte.
+ int DWordIdx = IdxVal / 4;
+ if (DWordIdx == 0) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec),
+ DAG.getIntPtrConstant(DWordIdx, dl));
+ int ShiftVal = (IdxVal % 4) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ int WordIdx = IdxVal / 2;
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ DAG.getBitcast(MVT::v8i16, Vec),
+ DAG.getIntPtrConstant(WordIdx, dl));
+ int ShiftVal = (IdxVal % 2) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i16));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
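
The constant-index path above can also be checked in scalar form. A sketch, assuming little-endian lanes as on x86 (and, unlike the lowering, using the containing i32 lane for every index rather than only for the low dword):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static uint8_t extractByte(const uint8_t (&V)[16], int IdxVal) {
      int DWordIdx = IdxVal / 4;
      uint32_t DWord;
      std::memcpy(&DWord, V + DWordIdx * 4, sizeof(DWord)); // EXTRACT_VECTOR_ELT
      int ShiftVal = (IdxVal % 4) * 8;
      return (uint8_t)(DWord >> ShiftVal);                  // SRL + TRUNCATE
    }

    int main() {
      uint8_t V[16];
      for (int i = 0; i != 16; ++i) V[i] = (uint8_t)(0x10 + i);
      for (int i = 0; i != 16; ++i) assert(extractByte(V, i) == 0x10 + i);
    }
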
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
@@ -13734,31 +14360,35 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if(Vec.isUndef()) {
if (IdxVal)
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return EltInVec;
}
- // Insertion of one bit into first or last position
- // can be done with two SHIFTs + OR.
+ // Insertion of one bit into first position
if (IdxVal == 0 ) {
- // EltInVec already at correct index and other bits are 0.
+ // Clean top bits of vector.
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
+ EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
// Clean the first bit in source vector.
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
+ // Insertion of one bit into last position
if (IdxVal == NumElems -1) {
// Move the bit to the last position inside the vector.
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Clean the last bit in the source vector.
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
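
Both special cases above use the same shift discipline: the KSHIFT pairs clear the stray bits of the element and the target position of the vector before the final OR. A minimal scalar sketch for the IdxVal == 0 case on an 8-bit mask (a model only; KSHIFT operates on k-registers):

    #include <cassert>
    #include <cstdint>

    static uint8_t insertBit0(uint8_t Vec, uint8_t Elt) {
      Elt = (uint8_t)((uint8_t)(Elt << 7) >> 7); // KSHIFTL/KSHIFTR by NumElems-1
      Vec = (uint8_t)((Vec >> 1) << 1);          // KSHIFTR/KSHIFTL by 1
      return (uint8_t)(Vec | Elt);
    }

    int main() {
      assert(insertBit0(0xB5, 0) == 0xB4); // clears bit 0
      assert(insertBit0(0xB4, 1) == 0xB5); // sets bit 0
    }
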
@@ -13790,17 +14420,20 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
auto *N2C = cast<ConstantSDNode>(N2);
unsigned IdxVal = N2C->getZExtValue();
- // If we are clearing out a element, we do this more efficiently with a
- // blend shuffle than a costly integer insertion.
- // TODO: would other rematerializable values (e.g. allbits) benefit as well?
- // TODO: pre-SSE41 targets will tend to use bit masking - this could still
- // be beneficial if we are inserting several zeros and can combine the masks.
- if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
- SmallVector<int, 8> ClearMask;
+ bool IsZeroElt = X86::isZeroNode(N1);
+ bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
+
+ // If we are inserting an element, see if we can do this more efficiently with
+ // a blend shuffle with a rematerializable vector than with a costly integer
+ // insertion.
+ if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
+ 16 <= EltVT.getSizeInBits()) {
+ SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
- ClearMask.push_back(i == IdxVal ? i + NumElts : i);
- SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+ : DAG.getConstant(-1, dl, VT);
+ return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
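
The BlendMask built above selects lane IdxVal from the rematerializable constant vector (shuffle indices NumElts..2*NumElts-1 name the second input) and everything else from N0. A scalar sketch of the two-input shuffle semantics under that indexing convention:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint16_t N0[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      uint16_t Cst[8] = {0};        // getZeroVector; all-ones for IsAllOnesElt
      unsigned IdxVal = 3;
      uint16_t Res[8];
      for (unsigned i = 0; i != 8; ++i) {
        unsigned M = (i == IdxVal) ? i + 8 : i;  // BlendMask
        Res[i] = M < 8 ? N0[M] : Cst[M - 8];     // two-input shuffle
      }
      for (unsigned i = 0; i != 8; ++i)
        assert(Res[i] == (i == IdxVal ? 0 : N0[i]));
    }
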
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
@@ -13837,25 +14470,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
- if (Subtarget.hasSSE41()) {
- if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
- unsigned Opc;
- if (VT == MVT::v8i16) {
- Opc = X86ISD::PINSRW;
- } else {
- assert(VT == MVT::v16i8);
- Opc = X86ISD::PINSRB;
- }
-
- // Transform it so it match pinsr{b,w} which expects a GR32 as its second
- // argument.
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
- return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument. SSE41 is required for pinsrb.
+ if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ unsigned Opc;
+ if (VT == MVT::v8i16) {
+ assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
+ Opc = X86ISD::PINSRW;
+ } else {
+ assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
+ assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
+ Opc = X86ISD::PINSRB;
}
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ }
+
+ if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
@@ -13885,36 +14520,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
}
- if (EltVT == MVT::i32 || EltVT == MVT::i64) {
- // PINSR* works with constant index.
+ // PINSR* works with constant index.
+ if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
- }
}
- if (EltVT == MVT::i8)
- return SDValue();
-
- if (EltVT.getSizeInBits() == 16) {
- // Transform it so it match pinsrw which expects a 16-bit value in a GR32
- // as its second argument.
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
- return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
- }
return SDValue();
}
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
+ // It's always cheaper to replace a xor+movd with xorps, and it simplifies
+ // further combines.
+ if (X86::isZeroNode(Op.getOperand(0)))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
- unsigned SizeFactor = OpVT.getSizeInBits()/128;
+ unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
@@ -13923,9 +14551,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+
+ // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
+ if (OpVT == MVT::v4i32)
+ return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- assert(OpVT.is128BitVector() && "Expected an SSE type!");
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
@@ -13943,24 +14575,33 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
MVT ResVT = Op.getSimpleValueType();
+ // When v1i1 is legal, scalarization of a vselect with a vXi1 Cond
+ // would result in: v1i1 = extract_subvector(vXi1, idx).
+ // Lower these into extract_vector_elt, which is already selectable.
+ if (ResVT == MVT::v1i1) {
+ assert(Subtarget.hasAVX512() &&
+ "Boolean EXTRACT_SUBVECTOR requires AVX512");
+
+ MVT EltVT = ResVT.getVectorElementType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT LegalVT =
+ (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
+ }
+
assert((In.getSimpleValueType().is256BitVector() ||
In.getSimpleValueType().is512BitVector()) &&
"Can only extract from 256-bit or 512-bit vectors");
- if (ResVT.is128BitVector())
- return extract128BitVector(In, IdxVal, DAG, dl);
- if (ResVT.is256BitVector())
- return extract256BitVector(In, IdxVal, DAG, dl);
-
- llvm_unreachable("Unimplemented!");
-}
+ // If the input is a buildvector, just emit a smaller one.
+ unsigned ElemsPerChunk = ResVT.getVectorNumElements();
+ if (In.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getBuildVector(
+ ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
-static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
- for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
- if (llvm::all_of(ValidUsers,
- [&I](SDValue V) { return V.getNode() != *I; }))
- return false;
- return true;
+ // Everything else is legal.
+ return Op;
}
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
@@ -13968,83 +14609,9 @@ static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
-
- SDLoc dl(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue SubVec = Op.getOperand(1);
- SDValue Idx = Op.getOperand(2);
-
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- MVT OpVT = Op.getSimpleValueType();
- MVT SubVecVT = SubVec.getSimpleValueType();
-
- if (OpVT.getVectorElementType() == MVT::i1)
- return insert1BitVector(Op, DAG, Subtarget);
-
- assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
- "Can only insert into 256-bit or 512-bit vectors");
-
- // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
- // load:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr + 16), Elts/2)
- // --> load32 addr
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr + 32), Elts/2)
- // --> load64 addr
- // or a 16-byte or 32-byte broadcast:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr), Elts/2)
- // --> X86SubVBroadcast(load16 addr)
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr), Elts/2)
- // --> X86SubVBroadcast(load32 addr)
- if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
- Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
- auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
- if (Idx2 && Idx2->getZExtValue() == 0) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
- bool Fast;
- unsigned Alignment = FirstLd->getAlignment();
- unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
- OpVT, AS, Alignment, &Fast) && Fast) {
- SDValue Ops[] = {SubVec2, SubVec};
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
- return Ld;
- }
- }
- // If lower/upper loads are the same and the only users of the load, then
- // lower to a VBROADCASTF128/VBROADCASTI128/etc.
- if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
- if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
- areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
- }
- }
- // If this is subv_broadcast insert into both halves, use a larger
- // subv_broadcast.
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
- SubVec.getOperand(0));
- }
- }
- }
+ assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
- if (SubVecVT.is128BitVector())
- return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
- if (SubVecVT.is256BitVector())
- return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
- llvm_unreachable("Unimplemented!");
+ return insert1BitVector(Op, DAG, Subtarget);
}
// Returns the appropriate wrapper opcode for a global reference.
@@ -14062,7 +14629,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
-// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
@@ -14417,7 +14984,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
@@ -14438,7 +15005,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget.isTargetWindowsItanium() ||
Subtarget.isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
- // Need to generate someting similar to:
+ // Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
@@ -15040,8 +15607,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
- Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
@@ -15313,7 +15879,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
SDValue Zero =
DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
- SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+ SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
if (VT == ExtVT)
return SelectedVal;
return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
@@ -15489,32 +16055,21 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// word to byte only under BWI
if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
return DAG.getNode(X86ISD::VTRUNC, DL, VT,
- DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+ getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
- // Truncate with PACKSS if we are truncating a vector comparison result.
- // TODO: We should be able to support other operations as long as we
- // we are saturating+packing zero/all bits only.
- auto IsPackableComparison = [](SDValue V) {
- unsigned Opcode = V.getOpcode();
- return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
- Opcode == X86ISD::CMPP);
- };
-
- if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
- all_of(In->ops(), IsPackableComparison))) {
+ // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
+ if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
return V;
- }
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
- In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
- ShufMask);
+ In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
@@ -15530,30 +16085,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
- // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+ // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
- SmallVector<SDValue,32> pshufbMask;
- for (unsigned i = 0; i < 2; ++i) {
- pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
- for (unsigned j = 0; j < 8; ++j)
- pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
- }
- SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
- In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+ // The PSHUFB mask:
+ static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 16, 17, 20, 21, 24, 25, 28, 29,
+ -1, -1, -1, -1, -1, -1, -1, -1 };
+ In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
- static const int ShufMask[] = {0, 2, -1, -1};
- In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
- ShufMask);
+ static const int ShufMask2[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
@@ -15572,9 +16117,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
- SDValue Undef = DAG.getUNDEF(MVT::v16i8);
- OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
@@ -15598,17 +16142,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// Prepare truncation shuffle mask
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
- SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
- DAG.getUNDEF(NVT), MaskVec);
+ In = DAG.getBitcast(NVT, In);
+ SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
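
The even-lane shuffle built here is the whole trick: after bitcasting to the element type of half the width, MaskVec[i] = i * 2 picks up the low half of each wide element on a little-endian target. A scalar model (assumption: per-lane integer truncation):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t In[4] = {0x11112222u, 0x33334444u, 0x55556666u, 0x77778888u};
      uint16_t NVT[8];
      std::memcpy(NVT, In, sizeof(In)); // the DAG.getBitcast(NVT, In) step
      uint16_t Out[4];
      for (int i = 0; i != 4; ++i)
        Out[i] = NVT[i * 2];            // shuffle mask {0, 2, 4, 6}
      for (int i = 0; i != 4; ++i)
        assert(Out[i] == (uint16_t)In[i]); // equals ISD::TRUNCATE per lane
    }
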
-SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
-
MVT VT = Op.getSimpleValueType();
if (VT.isVector()) {
@@ -15616,8 +16157,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
SDValue Src = Op.getOperand(0);
SDLoc dl(Op);
if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
- return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
- dl, VT,
+ return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
}
@@ -15701,7 +16241,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
unsigned EltBits = EltVT.getSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
APInt MaskElt =
- IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
+ IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
const fltSemantics &Sem =
EltVT == MVT::f64 ? APFloat::IEEEdouble() :
(IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
@@ -15764,9 +16304,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
// The mask constants are automatically splatted for vector types.
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue SignMask = DAG.getConstantFP(
- APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+ APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
SDValue MagMask = DAG.getConstantFP(
- APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+ APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
// First, clear all bits but the sign bit from the second operand (sign).
if (IsFakeVector)
@@ -15891,7 +16431,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
- // If more than one full vectors are evaluated, OR them first before PTEST.
+ // If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is only
// 1 node left, i.e. the final OR'd value of all vectors.
@@ -15900,8 +16440,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
- VecIns.back(), VecIns.back());
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief return true if \c Op has a use that doesn't just read flags.
@@ -15970,11 +16509,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
- case ISD::SHL: {
- const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
- if (BinNode->Flags.hasNoSignedWrap())
+ case ISD::SHL:
+ if (Op.getNode()->getFlags().hasNoSignedWrap())
break;
- }
+ LLVM_FALLTHROUGH;
default:
NeedOF = true;
break;
@@ -16366,7 +16904,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
}
/// If we have at least two divisions that use the same divisor, convert to
-/// multplication by a reciprocal. This may need to be adjusted for a given
+/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
@@ -16432,9 +16970,9 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
unsigned BitWidth = Op0.getValueSizeInBits();
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
- APInt Zeros, Ones;
- DAG.computeKnownBits(Op0, Zeros, Ones);
- if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+ KnownBits Known;
+ DAG.computeKnownBits(Op0, Known);
+ if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
LHS = Op1;
@@ -16682,7 +17220,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
- ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
@@ -16709,18 +17247,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
// available.
SDValue Cmp;
- unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
+ unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
if (SSECC == 8) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
- if (SetCCOpcode == ISD::SETUEQ) {
+ if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
static_cast<unsigned>(ISD::OR);
} else {
- assert(SetCCOpcode == ISD::SETONE);
+ assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
@@ -16767,7 +17305,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// 2. The original operand type has been promoted to a 256-bit vector.
//
// Note that condition 2. only applies for AVX targets.
- SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
+ SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
return DAG.getZExtOrTrunc(NewOp, dl, VT);
}
@@ -16807,7 +17345,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
- switch (SetCCOpcode) {
+ switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
@@ -16822,60 +17360,55 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
// Are we comparing unsigned or signed integers?
- unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
- ? X86ISD::VPCOMU : X86ISD::VPCOM;
+ unsigned Opc =
+ ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CmpMode, dl, MVT::i8));
}
- // We are handling one of the integer comparisons here. Since SSE only has
+ // We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
- unsigned Opc;
- bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
- bool Subus = false;
-
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETNE: Invert = true;
- case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
- case ISD::SETLT: Swap = true;
- case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
- case ISD::SETGE: Swap = true;
- case ISD::SETLE: Opc = X86ISD::PCMPGT;
- Invert = true; break;
- case ISD::SETULT: Swap = true;
- case ISD::SETUGT: Opc = X86ISD::PCMPGT;
- FlipSigns = true; break;
- case ISD::SETUGE: Swap = true;
- case ISD::SETULE: Opc = X86ISD::PCMPGT;
- FlipSigns = true; Invert = true; break;
- }
+ unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
+ : X86ISD::PCMPGT;
+ bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
+ Cond == ISD::SETGE || Cond == ISD::SETUGE;
+ bool Invert = Cond == ISD::SETNE ||
+ (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+
+ // If both operands are known non-negative, then an unsigned compare is the
+ // same as a signed compare and there's no need to flip signbits.
+ // TODO: We could check for more general simplifications here since we're
+ // computing known bits.
+ bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
+ !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
// Special case: Use min/max operations for SETULE/SETUGE
MVT VET = VT.getVectorElementType();
- bool hasMinMax =
- (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
- || (Subtarget.hasSSE2() && (VET == MVT::i8));
-
- if (hasMinMax) {
- switch (SetCCOpcode) {
+ bool HasMinMax =
+ (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
+ (Subtarget.hasSSE2() && (VET == MVT::i8));
+ bool MinMax = false;
+ if (HasMinMax) {
+ switch (Cond) {
default: break;
case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
}
- if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
+ if (MinMax)
+ Swap = Invert = FlipSigns = false;
}
- bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
- if (!MinMax && hasSubus) {
+ bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ bool Subus = false;
+ if (!MinMax && HasSubus) {
// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
// Op0 u<= Op1:
// t = psubus Op0, Op1
// pcmpeq t, <0..0>
- switch (SetCCOpcode) {
+ switch (Cond) {
default: break;
case ISD::SETULT: {
// If the comparison is against a constant we can turn this into a
@@ -16977,10 +17510,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// bits of the inputs before performing those operations.
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
- SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
+ SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
VT);
- Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
- Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
}
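
The FlipSigns XORs above rest on a classic identity: on two's complement lanes, x u< y holds iff (x ^ SignMask) s< (y ^ SignMask), which is what lets PCMPGT (a signed compare) implement the unsigned predicates. A scalar check of the identity (a sketch; the int32_t casts assume the usual x86 wrap-around conversion):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t SignMask = 0x80000000u; // APInt::getSignMask(32)
      uint32_t Tests[] = {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu};
      for (uint32_t X : Tests)
        for (uint32_t Y : Tests) {
          bool U = X < Y;
          bool S = (int32_t)(X ^ SignMask) < (int32_t)(Y ^ SignMask);
          assert(U == S); // signed compare after flipping the sign bits
        }
    }
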
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
@@ -17005,8 +17538,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
- assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
- && "SetCC type must be 8-bit or 1-bit integer");
+ assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
@@ -17070,19 +17602,24 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SetCC;
}
-SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
- assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
- assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+ // Recreate the carry if needed.
+ EVT CarryVT = Carry.getValueType();
+ APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
+ Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
+ Carry, DAG.getConstant(NegOne, DL, CarryVT));
+
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
- SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
+ SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
if (Op.getSimpleValueType() == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
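
The carry recreation above is worth spelling out: the materialized carry C (0 or 1) is added to all-ones, which overflows, and therefore sets CF for the following SBB, exactly when C != 0. A scalar sketch of that flag behavior:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint8_t C = 0; C <= 1; ++C) {
        uint8_t Sum = (uint8_t)(C + 0xFF); // ADD Carry, -1
        bool CarryOut = Sum < C;           // unsigned wrap == carry-out (CF)
        assert(CarryOut == (C != 0));
      }
    }
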
@@ -17140,7 +17677,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SSECC != 8) {
if (Subtarget.hasAVX512()) {
- SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
DL, VT, Cmp, Op1, Op2);
@@ -17176,7 +17713,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
- SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+ SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
@@ -17188,9 +17725,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
- if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
- Subtarget.hasAVX512())
- return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
+ if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
+ SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
+ }
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
@@ -17204,9 +17742,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
- Op1Scalar.getValueType(),
- Cond, Op1Scalar, Op2Scalar);
+ SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
+ Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
@@ -17221,8 +17758,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
- Cond, Op1, Op2);
+ SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
}
@@ -17241,33 +17777,33 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+ // (select (and (x, 0x1) == 0), y, (z ^ y)) -> (-(and (x, 0x1)) & z) ^ y
+ // (select (and (x, 0x1) == 0), y, (z | y)) -> (-(and (x, 0x1)) & z) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
-
- unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+ unsigned CondCode =
+ cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
-
SDValue CmpOp0 = Cmp.getOperand(0);
+
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
- (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
- SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
- DAG.getConstant(0, DL,
- CmpOp0.getValueType()),
- CmpOp0);
- SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- SDValue(Neg.getNode(), 1));
- return Res;
- }
+ (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
@@ -17283,6 +17819,43 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
+ } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
+ Cmp.getOperand(0).getOpcode() == ISD::AND &&
+ isOneConstant(Cmp.getOperand(0).getOperand(1))) {
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ SDValue Src1, Src2;
+ // True if Op2 is an XOR or OR operator and one of its operands
+ // is equal to Op1:
+ // ( a , a op b) || ( b , a op b)
+ auto isOrXorPattern = [&]() {
+ if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
+ (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
+ Src1 =
+ Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
+ Src2 = Op1;
+ return true;
+ }
+ return false;
+ };
+
+ if (isOrXorPattern()) {
+ SDValue Neg;
+ unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
+ // We need a mask of all zeros or ones with the same size as the other
+ // operands.
+ if (CmpSz > VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
+ else if (CmpSz < VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
+ DAG.getConstant(1, DL, VT));
+ else
+ Neg = CmpOp0;
+ SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ Neg); // -(and (x, 0x1))
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
+ return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
+ }
}
}
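
The new pattern's correctness reduces to a small branchless identity: with c = x & 1, select(c == 0, y, z op y) equals (-(c) & z) op y for op in {^, |}, since -(c) is either all zeros or all ones. A scalar check (sketch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Y = 0xDEADBEEFu, Z = 0x12345678u;
      for (uint32_t X = 0; X != 4; ++X) {
        uint32_t C = X & 1;
        uint32_t Mask = 0u - C;  // -(and (x, 0x1)): all zeros or all ones
        assert(((Mask & Z) ^ Y) == ((C == 0) ? Y : (Z ^ Y)));
        assert(((Mask & Z) | Y) == ((C == 0) ? Y : (Z | Y)));
      }
    }
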
@@ -17423,17 +17996,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
// SKX processor
if ((InVTElt == MVT::i1) &&
- (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
- VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
-
- ((Subtarget.hasBWI() && VT.is512BitVector() &&
- VTElt.getSizeInBits() <= 16)) ||
+ (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
- ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
- VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+ ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
- ((Subtarget.hasDQI() && VT.is512BitVector() &&
- VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
unsigned NumElts = VT.getVectorNumElements();
@@ -17441,8 +18007,8 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
if (VT.is512BitVector() && InVTElt != MVT::i1 &&
(NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
- return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
- return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
+ return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
}
if (InVTElt != MVT::i1)
@@ -17454,12 +18020,12 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
SDValue V;
if (Subtarget.hasDQI()) {
- V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+ V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
assert(!VT.is512BitVector() && "Unexpected vector type");
} else {
- SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+ SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
- V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
if (ExtVT == VT)
return V;
}
@@ -17506,11 +18072,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
- // SSE41 targets can use the pmovsx* instructions directly.
- unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
- X86ISD::VSEXT : X86ISD::VZEXT;
- if (Subtarget.hasSSE41())
+ // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
+ // so they are legal and shouldn't occur here. The AVX2/AVX512 pmovsx*
+ // instructions still need to be handled here for 256/512-bit results.
+ if (Subtarget.hasInt256()) {
+ assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
+ unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+ X86ISD::VSEXT : X86ISD::VZEXT;
return DAG.getNode(ExtOpc, dl, VT, In);
+ }
// We should only get here for sign extend.
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
@@ -17595,8 +18165,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
- OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
- OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
+ OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
+ OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
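
Splitting the sign-extension like this is safe because the operation is purely per-lane: extending each half and concatenating gives the same lanes as extending the whole vector. A scalar model of the OpLo/OpHi split:

    #include <cassert>
    #include <cstdint>

    int main() {
      int16_t In[8] = {-3, -2, -1, 0, 1, 2, 3, -32768};
      int32_t Out[8];
      for (int i = 0; i != 4; ++i) Out[i]     = In[i];     // OpLo, lanes 0..3
      for (int i = 0; i != 4; ++i) Out[4 + i] = In[4 + i]; // OpHi, lanes 4..7
      for (int i = 0; i != 8; ++i) assert(Out[i] == (int32_t)In[i]);
    }
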
@@ -17674,7 +18244,8 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
MVT VT = Op.getValueType().getSimpleVT();
unsigned NumElts = VT.getVectorNumElements();
- if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+ if ((Subtarget.hasBWI() && NumElts >= 32) ||
+ (Subtarget.hasDQI() && NumElts < 16) ||
NumElts == 16) {
// Load and extend - everything is legal
if (NumElts < 8) {
@@ -17703,7 +18274,7 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
if (NumElts <= 8) {
// A subset, assume that we have only AVX-512F
- unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+ unsigned NumBitsToLoad = 8;
MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
Ld->getBasePtr(),
@@ -17911,7 +18482,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
if (Ext == ISD::SEXTLOAD) {
// If we have SSE4.1, we can directly emit a VSEXT node.
if (Subtarget.hasSSE41()) {
- SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Sext;
}
@@ -18243,8 +18814,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
+ bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
- SplitStack;
+ SplitStack || EmitStackProbe;
SDLoc dl(Op);
// Get the inputs.
@@ -18256,7 +18828,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
@@ -18469,6 +19041,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
+ // Bitcast the source vector to the output type; this is mainly necessary for
+ // vXi8/vXi64 shifts.
+ if (VT != SrcOp.getSimpleValueType())
+ SrcOp = DAG.getBitcast(VT, SrcOp);
+
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
@@ -18485,9 +19062,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
- // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
- if (VT == SrcOp.getSimpleValueType() &&
- ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+ // vector of Constants or UNDEFs.
+ if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
ConstantSDNode *ND;
@@ -18578,11 +19154,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
ShAmt = ShAmt.getOperand(0);
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
- ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
- ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else {
SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
@@ -18692,8 +19268,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
-/// The mask is coming as MVT::i8 and it should be truncated
-/// to MVT::i1 while lowering masking intrinsics.
+/// The mask comes in as MVT::i8 and should be transformed
+/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
@@ -18701,19 +19277,19 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (isAllOnesConstant(Mask))
- return Op;
+
+ if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
+ if (MaskConst->getZExtValue() & 0x1)
+ return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- // The mask should be of type MVT::i1
- SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+ SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
Op.getOpcode() == X86ISD::FSETCCM_RND)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
- if (Op.getOpcode() == X86ISD::VFPCLASS ||
- Op.getOpcode() == X86ISD::VFPCLASSS)
+ if (Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
if (PreservedSrc.isUndef())
@@ -18762,7 +19338,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
@@ -18853,6 +19429,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, VT, Src1, Src2, Rnd),
+ Mask, passThru, Subtarget, DAG);
+ }
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
@@ -19142,10 +19726,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
- SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
+ DAG.getIntPtrConstant(0, dl));
}
case CMP_MASK:
case CMP_MASK_CC: {
@@ -19205,18 +19790,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
}
//default rounding mode
if(!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
DAG.getTargetConstant(0, dl,
MVT::i1),
Subtarget, DAG);
-
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
+ DAG.getIntPtrConstant(0, dl));
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -19264,13 +19849,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
- FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8));
+ FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8));
else
- FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8), Sae);
- // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
- return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
+ FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
+ DAG.getIntPtrConstant(0, dl));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
@@ -19306,6 +19891,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
+ case MASK_BINOP: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
+ return DAG.getBitcast(VT, Res);
+ }
case FIXUPIMMS:
case FIXUPIMMS_MASKZ:
case FIXUPIMM:
@@ -19347,12 +19941,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
- case CONVERT_MASK_TO_VEC: {
- SDValue Mask = Op.getOperand(1);
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
- }
case BRCST_SUBVEC_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue Passthru = Op.getOperand(2);
@@ -19478,6 +20066,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ case Intrinsic::x86_avx512_knot_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
+ SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
+ case Intrinsic::x86_avx512_kandn_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ // Invert LHS for the not.
+ LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
+ DAG.getConstant(1, dl, MVT::v16i1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
+ case Intrinsic::x86_avx512_kxnor_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+ // Invert result for the not.
+ Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
+ DAG.getConstant(1, dl, MVT::v16i1));
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
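Since DAG.getConstant(1, dl, MVT::v16i1) splats the low bit across all sixteen lanes, the XORs above are plain bitwise NOTs of the 16-bit mask. The three lowerings are therefore equivalent to these integer identities (a sketch with hypothetical names):

#include <cstdint>

// Integer equivalents of the v16i1 lowerings for KNOT/KANDN/KXNOR.
static uint16_t knot16(uint16_t A) { return (uint16_t)~A; }
static uint16_t kandn16(uint16_t A, uint16_t B) { return (uint16_t)(~A & B); }
static uint16_t kxnor16(uint16_t A, uint16_t B) { return (uint16_t)~(A ^ B); }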
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
@@ -19569,7 +20184,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
@@ -19603,12 +20218,40 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
}
}
+static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ EVT MaskVT = Mask.getValueType();
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, dl);
+}
+
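Element-wise, the gather machine node built above behaves like the loop below; zeroing Src merely breaks the output register's dependency when the pass-through value can never be observed. A simplified model with the scale folded into the indexing (hypothetical names):

#include <cstddef>
#include <cstdint>

// Active lanes load from memory; inactive lanes keep the pass-through source.
static void gatherModel(uint32_t *Dst, const uint32_t *Src,
                        const uint32_t *Mem, const int32_t *Index,
                        const bool *Mask, size_t NumElts) {
  for (size_t i = 0; i != NumElts; ++i)
    Dst[i] = Mask[i] ? Mem[Index[i]] : Src[i];
}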
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
- auto *C = cast<ConstantSDNode>(ScaleOp);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
@@ -19617,7 +20260,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- if (Src.isUndef())
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -19630,7 +20276,10 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
- auto *C = cast<ConstantSDNode>(ScaleOp);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -19649,14 +20298,16 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
- auto *C = cast<ConstantSDNode>(ScaleOp);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- //SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
@@ -19884,16 +20535,17 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
+ const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
- if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
+ switch (IntNo) {
+ case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
- if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
+ case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
- if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
- IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
- IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
- IntNo == llvm::Intrinsic::x86_flags_write_u64) {
+ case llvm::Intrinsic::x86_flags_read_u32:
+ case llvm::Intrinsic::x86_flags_read_u64:
+ case llvm::Intrinsic::x86_flags_write_u32:
+ case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
@@ -19902,6 +20554,20 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
// during ExpandISelPseudos in EmitInstrWithCustomInserter.
return SDValue();
}
+ case Intrinsic::x86_lwpins32:
+ case Intrinsic::x86_lwpins64: {
+ SDLoc dl(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ SDValue LwpIns =
+ DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
+ Op->getOperand(3), Op->getOperand(4));
+ SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
+ SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ LwpIns.getValue(1));
+ }
+ }
return SDValue();
}
@@ -19928,6 +20594,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
+ case GATHER_AVX2: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain, Subtarget);
+ }
case GATHER: {
    // gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
@@ -19953,8 +20629,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
- assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
- unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
+ assert((HintVal == 2 || HintVal == 3) &&
+ "Wrong prefetch hint in intrinsic: should be 2 or 3");
+ unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
@@ -19994,8 +20671,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
}
// ADC/ADCX/SBB
case ADX: {
- SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
- SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
+ SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
DAG.getConstant(-1, dl, MVT::i8));
SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
@@ -20368,7 +21045,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
- const AttributeSet &Attrs = Func->getAttributes();
+ const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
@@ -20503,54 +21180,62 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
+// Split a unary integer op into 2 half-sized ops.
+static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+
+ // Extract the Lo/Hi vectors
+ SDLoc dl(Op);
+ SDValue Src = Op.getOperand(0);
+ SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
+ SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
+
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
+}
+
+// Decompose 256-bit ops into smaller 128-bit ops.
+static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return LowerVectorIntUnary(Op, DAG);
+}
+
+// Decompose 512-bit ops into smaller 256-bit ops.
+static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().is512BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 512-bit vector integer operation");
+ return LowerVectorIntUnary(Op, DAG);
+}
+
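As a concrete instance: a v32i8 unary op on a target without 256-bit integer support becomes two v16i8 ops whose results are concatenated. The split is exact for any lane-wise operation, which is all these helpers rely on; the same idea on plain arrays (a sketch):

#include <cstddef>
#include <cstdint>

// Apply a lane-wise unary op to the Lo and Hi halves, then "concatenate"
// by writing the halves back-to-back into the destination.
template <typename UnaryOp>
static void splitUnary(uint8_t *Dst, const uint8_t *Src, size_t NumElts,
                       UnaryOp Op) {
  for (size_t i = 0; i != NumElts / 2; ++i) // Lo half
    Dst[i] = Op(Src[i]);
  for (size_t i = NumElts / 2; i != NumElts; ++i) // Hi half
    Dst[i] = Op(Src[i]);
}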
/// \brief Lower a vector CTLZ using a natively supported vector CTLZ instruction.
//
-// 1. i32/i64 128/256-bit vector (native support require VLX) are expended
-// to 512-bit vector.
-// 2. i8/i16 vector implemented using dword LZCNT vector instruction
-// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
-// split the vector, perform operation on it's Lo a Hi part and
-// concatenate the results.
-static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+// i8/i16 vectors are implemented using the dword LZCNT vector instruction
+// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
+// split the vector, perform the operation on its Lo and Hi parts, and
+// concatenate the results.
+static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
- if (EltVT == MVT::i64 || EltVT == MVT::i32) {
- // Extend to 512 bit vector.
- assert((VT.is256BitVector() || VT.is128BitVector()) &&
- "Unsupported value type for operation");
-
- MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
- SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
- DAG.getUNDEF(NewVT),
- Op.getOperand(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
-
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
- DAG.getIntPtrConstant(0, dl));
- }
-
assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
"Unsupported element type");
- if (16 < NumElems) {
- // Split vector, it's Lo and Hi parts will be handled in next iteration.
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
- MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
- Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
- }
  // Split the vector; its Lo and Hi parts will be handled in the next iteration.
+ if (16 < NumElems)
+ return LowerVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
-
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation");
@@ -20637,23 +21322,17 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- SDValue Op0 = Op.getOperand(0);
- if (Subtarget.hasAVX512())
- return LowerVectorCTLZ_AVX512(Op, DAG);
+ if (Subtarget.hasCDI())
+ return LowerVectorCTLZ_AVX512CDI(Op, DAG);
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget.hasInt256()) {
- unsigned NumElems = VT.getVectorNumElements();
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntUnary(Op, DAG);
- // Extract each 128-bit vector, perform ctlz and concat the result.
- SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
- SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
- DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
- DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
- }
+ // Decompose 512-bit ops into smaller 256-bit ops.
+ if (VT.is512BitVector() && !Subtarget.hasBWI())
+ return Lower512IntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
@@ -20802,9 +21481,10 @@ static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
-static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
@@ -20812,14 +21492,11 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
return Lower256IntArith(Op, DAG);
}
-static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1));
+static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return Lower256IntArith(Op, DAG);
+ return Lower256IntUnary(Op, DAG);
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
@@ -20834,7 +21511,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- if (VT == MVT::i1)
+ if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into smaller 128-bit ops.
@@ -20874,8 +21551,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Extract the lo parts and sign extend to i16
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
- BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+ ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
+ BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -20894,8 +21571,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
- BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+ AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
+ BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -21056,8 +21733,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
}
- SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
- SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+ SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
+ SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
DAG.getConstant(8, dl, MVT::v16i16));
@@ -21073,8 +21750,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
- BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+ ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
+ BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -21093,8 +21770,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
- BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+ AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
+ BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -21148,8 +21825,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
}
@@ -21157,11 +21834,15 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(getLibcallCallingConv(LC),
- static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
- Callee, std::move(Args))
- .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(
+ getLibcallCallingConv(LC),
+ static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args))
+ .setInRegister()
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
@@ -21269,15 +21950,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
if (VT.getScalarSizeInBits() < 16)
return false;
- if (VT.is512BitVector() &&
+ if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
- bool LShift = VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.hasInt256());
+ bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256());
- bool AShift = LShift && (Subtarget.hasVLX() ||
- (VT != MVT::v2i64 && VT != MVT::v4i64));
+ bool AShift = LShift && (Subtarget.hasAVX512() ||
+ (VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
@@ -21301,7 +21982,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
- if (VT.is512BitVector() || Subtarget.hasVLX())
+ if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
@@ -21324,6 +22005,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
+ // ashr(R, 63) === cmp_slt(R, 0)
+ if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
+ assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
+ "Unsupported PCMPGT op");
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT,
+ getZeroVector(VT, Subtarget, DAG, dl), R);
+ }
+
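The PCMPGT rewrite rests on the identity that an arithmetic shift right by 63 yields all-ones exactly for negative lanes, which is also the lane value a signed 0 > R compare produces. In scalar form (a sketch):

#include <cstdint>

// ashr(R, 63) is -1 (all ones) iff R < 0, else 0 -- the same value
// PCMPGT(zero, R) writes into the lane.
static int64_t ashr63(int64_t R) { return (0 > R) ? -1 : 0; }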
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
@@ -21360,8 +22049,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
- if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
- Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
+ if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
+ (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
+ Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 ||
@@ -21422,10 +22112,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
(VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
(Subtarget.hasAVX512() && VT == MVT::v8i64))) {
+ // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
+ unsigned SubVectorScale = 1;
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ SubVectorScale =
+ Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
+ Amt = Amt.getOperand(0);
+ }
+
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
@@ -21442,7 +22141,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
- VT.getVectorNumElements();
+ (SubVectorScale * VT.getVectorNumElements());
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
@@ -21610,11 +22309,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
// i64 vector arithmetic shift can be emulated with the transform:
- // M = lshr(SIGN_BIT, Amt)
+ // M = lshr(SIGN_MASK, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Op.getOpcode() == ISD::SRA) {
- SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
+ SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);
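The transform in the comment can be checked on scalars: shift the sign mask and the value logically, XOR, then subtract, and the result matches a true arithmetic shift for any amount in [0, 63]. A scalar model (a sketch):

#include <cstdint>

// M = lshr(SIGN_MASK, Amt); ashr(R, Amt) == sub(xor(lshr(R, Amt), M), M).
static uint64_t emulatedAshr(uint64_t R, unsigned Amt) { // Amt in [0, 63]
  uint64_t M = (1ULL << 63) >> Amt;
  uint64_t L = R >> Amt; // logical shift
  return (L ^ M) - M;    // sign-extends from bit (63 - Amt)
}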
@@ -21816,23 +22515,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
- return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+ return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
@@ -21954,15 +22651,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
- return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+ return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
- return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+ return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
@@ -22017,10 +22713,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+ if (Subtarget.hasAVX512()) {
+ // Attempt to rotate by immediate.
+ APInt UndefElts;
+ SmallVector<APInt, 16> EltBits;
+ if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
+ if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
+ return EltBits[0] == V;
+ })) {
+ unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+ uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
+ return DAG.getNode(Op, DL, VT, R,
+ DAG.getConstant(RotateAmt, DL, MVT::i8));
+ }
+ }
+
+ // Else, fall-back on VPROLV/VPRORV.
+ return Op;
+ }
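The immediate path reduces the splatted amount modulo the element width before picking VROTLI/VROTRI, matching the usual rotate definition. A scalar model of rotate-left by immediate (a sketch):

#include <cstdint>

// R is reduced mod 32 just as the lowering reduces EltBits[0] mod
// EltSizeInBits; the (32 - R) % 32 keeps the right shift in range for R == 0.
static uint32_t rotl32(uint32_t X, unsigned R) {
  R %= 32;
  return (X << R) | (X >> ((32u - R) % 32u));
}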
assert(VT.isVector() && "Custom lowering only for vector rotates!");
assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
- assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
+ assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right.
@@ -22035,7 +22752,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
- assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
+ assert(RotateAmt < EltSizeInBits && "Rotation out of range");
return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
@@ -22062,10 +22779,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// A subtract of one will be selected as a INC. Note that INC doesn't
// set CF, so we can't do this for UADDO.
if (isOneConstant(RHS)) {
- BaseOp = X86ISD::INC;
- Cond = X86::COND_O;
- break;
- }
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
break;
@@ -22077,10 +22794,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// A subtract of one will be selected as a DEC. Note that DEC doesn't
// set CF, so we can't do this for USUBO.
if (isOneConstant(RHS)) {
- BaseOp = X86ISD::DEC;
- Cond = X86::COND_O;
- break;
- }
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
break;
@@ -22146,7 +22863,7 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
- auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
+ auto PTy = cast<PointerType>(LI->getPointerOperandType());
return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
@@ -22202,7 +22919,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- auto SynchScope = AI->getSynchScope();
+ auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
@@ -22224,7 +22941,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
- if (SynchScope == SingleThread)
+ if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
@@ -22243,7 +22960,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
AI->getType()->getPrimitiveSizeInBits());
- Loaded->setAtomic(Order, SynchScope);
+ Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
@@ -22254,13 +22971,13 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
- SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
- FenceScope == CrossThread) {
+ FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
@@ -22470,7 +23187,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
// index into a in-register pre-computed pop count table. We then split up the
// input vector in two new ones: (1) a vector with only the shifted-right
// higher nibbles for each byte and (2) a vector with the lower nibbles (and
- // masked out higher ones) for each byte. PSHUB is used separately with both
+ // masked out higher ones) for each byte. PSHUFB is used separately with both
// to index the in-register table. Next, both are added and the result is a
// i8 vector where each element contains the pop count for input byte.
//
@@ -22588,35 +23305,33 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
+ // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
+ if (Subtarget.hasVPOPCNTDQ()) {
+ if (VT == MVT::v8i16) {
+ Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
+ Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
+ }
+ if (VT == MVT::v16i8 || VT == MVT::v16i16) {
+ Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
+ Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
+ }
+ }
+
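The widening is sound because zero-extension adds only zero bits, so the population count is unchanged and the truncation back to the narrow element type is exact. A scalar check of the identity (a sketch; uses the GCC/Clang popcount builtin):

#include <cstdint>

// popcount(zext(x)) == popcount(x), and the count of a 16-bit value always
// fits in 16 bits, so the truncation loses nothing.
static uint16_t ctpop16ViaWide(uint16_t X) {
  return (uint16_t)__builtin_popcountll((uint64_t)X);
}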
if (!Subtarget.hasSSSE3()) {
// We can't use the fast LUT approach, so fall back on vectorized bitmath.
assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
}
- if (VT.is256BitVector() && !Subtarget.hasInt256()) {
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract each 128-bit vector, compute pop count and concat the result.
- SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
- SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
- LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
- LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
- }
-
- if (VT.is512BitVector() && !Subtarget.hasBWI()) {
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract each 256-bit vector, compute pop count and concat the result.
- SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
- SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntUnary(Op, DAG);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
- LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
- LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
- }
+ // Decompose 512-bit ops into smaller 256-bit ops.
+ if (VT.is512BitVector() && !Subtarget.hasBWI())
+ return Lower512IntUnary(Op, DAG);
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
@@ -22643,20 +23358,12 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
DAG.getIntPtrConstant(0, DL));
}
- MVT SVT = VT.getVectorElementType();
int NumElts = VT.getVectorNumElements();
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector()) {
- SDValue Lo = extract128BitVector(In, 0, DAG, DL);
- SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
-
- MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
- DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
- DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
- }
+ if (VT.is256BitVector())
+ return Lower256IntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
@@ -22697,14 +23404,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
"Only byte vector BITREVERSE supported");
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
- if (VT.is256BitVector() && !Subtarget.hasInt256()) {
- MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
- SDValue Lo = extract128BitVector(In, 0, DAG, DL);
- SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
- Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
- Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
- }
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
@@ -22824,30 +23525,33 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
return Op;
}
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getNode()->getSimpleValueType(0);
+static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
+ SDNode *N = Op.getNode();
+ MVT VT = N->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDLoc DL(N);
- unsigned Opc;
- bool ExtraOp = false;
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Invalid code");
- case ISD::ADDC: Opc = X86ISD::ADD; break;
- case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
- case ISD::SUBC: Opc = X86ISD::SUB; break;
- case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
- }
+ // Set the carry flag.
+ SDValue Carry = Op.getOperand(2);
+ EVT CarryVT = Carry.getValueType();
+ APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
+ Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
+ Carry, DAG.getConstant(NegOne, DL, CarryVT));
+
+ unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
+ SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
+ Op.getOperand(1), Carry.getValue(1));
+
+ SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
+ if (N->getValueType(1) == MVT::i1)
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
- if (!ExtraOp)
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
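The node sequence implements the usual wide-addition step: add the operands and the incoming carry, and report the outgoing carry. A portable scalar model of ISD::ADDCARRY (a sketch; SUBCARRY is the symmetric borrow form):

#include <cstdint>
#include <utility>

// {Sum, CarryOut} = A + B + CarryIn, detecting the carry without a wider type.
static std::pair<uint64_t, bool> addCarry(uint64_t A, uint64_t B,
                                          bool CarryIn) {
  uint64_t S = A + B;
  bool C1 = S < A;         // carry out of A + B
  uint64_t R = S + CarryIn;
  bool C2 = R < S;         // carry out of adding CarryIn
  return {R, C1 || C2};
}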
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
@@ -22867,8 +23571,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
bool isF64 = ArgVT == MVT::f64;
@@ -22880,13 +23584,13 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
- Type *RetTy = isF64
- ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
- : (Type*)VectorType::get(ArgTy, 4);
+ Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
+ : (Type *)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -22923,8 +23627,6 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
- EVT EltVT = NVT.getVectorElementType();
-
SDLoc dl(InOp);
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
@@ -22942,6 +23644,8 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
for (unsigned i = 0; i < InNumElts; ++i)
Ops.push_back(InOp.getOperand(i));
+ EVT EltVT = InOp.getOperand(0).getValueType();
+
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
@@ -23086,7 +23790,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+ "We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
@@ -23142,7 +23846,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+ "We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
@@ -23202,7 +23906,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
- // The pass-thru value
+ // The pass-through value
MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
Src0 = ExtendToType(Src0, NewVT, DAG);
@@ -23216,6 +23920,57 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue RetOps[] = {Exract, NewGather.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
+ if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
+ // There is a special case when the return type v2i32 is illegal and the
+ // type legalizer extended it to v2i64. Without this conversion we end up
+ // with VPGATHERQQ (reading q-words from memory) instead of VPGATHERQD.
+ // To avoid this situation, we build an X86-specific gather node with
+ // index v2i64 and value type v4i32.
+ assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
+ "Unexpected type in masked gather");
+ Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
+ DAG.getBitcast(MVT::v4i32, Src0),
+ DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
+ // The mask should match the destination type. Extending mask with zeroes
+ // is not necessary since instruction itself reads only two values from
+ // memory.
+ Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
+ NewGather.getValue(0), DAG);
+ SDValue RetOps[] = { Sext, NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
+ // This transformation is for optimization only.
+ // The type legalizer extended the mask and index to 4-element vectors to
+ // match the requirements of the common gather node - the index and value
+ // vectors must have the same width. The X86 gather node tolerates the
+ // width mismatch, which lets a more optimal instruction be selected in
+ // the end.
+ assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
+ "Unexpected type in masked gather");
+ if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
+ Index.getOpcode() == ISD::CONCAT_VECTORS &&
+ Index.getOperand(1).isUndef()) {
+ Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
+ Index = Index.getOperand(0);
+ } else
+ return Op;
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+ }
return Op;
}
@@ -23284,7 +24039,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
- case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
@@ -23303,7 +24058,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
case ISD::FABS:
@@ -23311,7 +24066,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
- case ISD::SETCCE: return LowerSETCCE(Op, DAG);
+ case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
@@ -23344,7 +24099,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::UMUL_LOHI:
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
- case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
+ case ISD::ROTL:
+ case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
@@ -23356,16 +24112,15 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
- case ISD::ADD: return LowerADD(Op, DAG);
- case ISD::SUB: return LowerSUB(Op, DAG);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
+ case ISD::ADD:
+ case ISD::SUB: return LowerADD_SUB(Op, DAG);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
+ case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
@@ -23768,7 +24523,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
- case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
@@ -23779,16 +24533,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
- case X86ISD::ABS: return "X86ISD::ABS";
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMAXS: return "X86ISD::FMAXS";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMINS: return "X86ISD::FMINS";
case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
+ case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
+ case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
@@ -23827,7 +24584,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
- case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
@@ -23876,6 +24632,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
+ case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
+ case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
@@ -23976,9 +24734,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
@@ -24012,6 +24774,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
+ case X86ISD::LWPINS: return "X86ISD::LWPINS";
+ case X86ISD::MGATHER: return "X86ISD::MGATHER";
}
return nullptr;
}
@@ -24236,16 +25000,22 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
// xbegin sinkMBB
//
// mainMBB:
- // eax = -1
+ // s0 = -1
+ //
+ // fallBB:
+ // eax = # XABORT_DEF
+ // s1 = eax
//
// sinkMBB:
- // v = eax
+ // v = phi(s0/mainBB, s1/fallBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
+ MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
@@ -24253,25 +25023,40 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned fallDstReg = MRI.createVirtualRegister(RC);
+
// thisMBB:
- // xbegin sinkMBB
+ // xbegin fallMBB
// # fallthrough to mainMBB
- // # abortion to sinkMBB
- BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+ // # on abort, branch to fallMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
- thisMBB->addSuccessor(sinkMBB);
+ thisMBB->addSuccessor(fallMBB);
// mainMBB:
- // EAX = -1
- BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ // mainDstReg := -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
+ BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
- // sinkMBB:
- // EAX is live into the sinkMBB
- sinkMBB->addLiveIn(X86::EAX);
- BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
- MI.getOperand(0).getReg())
+ // fallMBB:
+ // ; pseudo instruction modeling the hardware's definition of EAX by XABORT
+ // EAX := XABORT_DEF
+ // fallDstReg := EAX
+ BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
+ BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
+ fallMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
@@ -24302,7 +25087,7 @@ static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
- MIB.addOperand(Op);
+ MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -24338,7 +25123,7 @@ static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
- MIB.addOperand(Op);
+ MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -24398,7 +25183,7 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
unsigned ValOps = X86::AddrNumOperands;
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
@@ -24413,6 +25198,26 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
return BB;
}
+static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ // Address into RAX/EAX
+ unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
+ unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.add(MI->getOperand(i));
+
+ // The instruction doesn't actually take any operands though.
+ BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
@@ -24536,12 +25341,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, UseFPOffset ? 4 : 0)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
@@ -24561,12 +25366,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Read the reg_save_area address.
unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 16)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 16)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
@@ -24588,13 +25393,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, UseFPOffset ? 4 : 0)
- .addOperand(Segment)
- .addReg(NextOffsetReg)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .addReg(NextOffsetReg)
+ .setMemRefs(MMOBegin, MMOEnd);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
@@ -24608,12 +25413,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the overflow_area address into a register.
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 8)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
@@ -24644,13 +25449,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 8)
- .addOperand(Segment)
- .addReg(NextAddrReg)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .addReg(NextAddrReg)
+ .setMemRefs(MMOBegin, MMOEnd);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
@@ -24867,7 +25672,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
//
// (CMOV (CMOV F, T, cc1), T, cc2)
//
- // to two successives branches. For that, we look for another CMOV as the
+ // to two successive branches. For that, we look for another CMOV as the
// following instruction.
//
// Without this, we would add a PHI between the two jumps, which ends up
@@ -25123,12 +25928,12 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
// instruction using the same address operands.
if (Operand.isReg())
Operand.setIsKill(false);
- MIB.addOperand(Operand);
+ MIB.add(Operand);
}
MachineInstr *FOpMI = MIB;
MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
@@ -25331,7 +26136,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
- BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
@@ -25414,6 +26219,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
@@ -25430,7 +26236,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
- assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
+ (void)TRI;
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned restoreDstReg = MRI.createVirtualRegister(RC);
@@ -25508,7 +26315,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
- MIB.addOperand(MI.getOperand(MemOpndSlot + i));
+ MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
@@ -25591,7 +26398,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
// Reload FP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload IP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
@@ -25599,7 +26406,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), LabelOffset);
else
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP
@@ -25608,7 +26415,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
else
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump
@@ -25625,7 +26432,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
@@ -25644,8 +26451,6 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
- /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
-
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
.addReg(X86::RIP)
@@ -25655,7 +26460,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
.addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
- .addReg(0) /* XII->getGlobalBaseReg(MF) */
+ .addReg(0) /* TII->getGlobalBaseReg(MF) */
.addImm(1)
.addReg(0)
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
@@ -25677,7 +26482,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MFI.getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
@@ -25749,9 +26554,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
- const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
- const X86RegisterInfo &RI = XII->getRegisterInfo();
-
+ const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
@@ -25799,8 +26602,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
- const MCPhysReg *SavedRegs =
- Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+ const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
@@ -26033,6 +26835,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
case X86::MONITORX:
return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
+
+ // Cache line zero
+ case X86::CLZERO:
+ return emitClzero(&MI, BB, Subtarget);
+
// PKU feature
case X86::WRPKRU:
return emitWRPKRU(MI, BB, Subtarget);
@@ -26068,6 +26875,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
+ case TargetOpcode::PATCHABLE_EVENT_CALL:
+ // Do nothing here, handle in xray instrumentation pass.
+ return BB;
+
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
@@ -26135,12 +26946,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
//===----------------------------------------------------------------------===//
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
@@ -26148,7 +26960,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
- KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
+ Known.resetAll();
switch (Opc) {
default: break;
case X86ISD::ADD:
@@ -26167,44 +26979,101 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ Known.Zero.setBitsFrom(1);
break;
case X86ISD::MOVMSK: {
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+ Known.Zero.setBitsFrom(NumLoBits);
+ break;
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: {
+ if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
+ Known.setAllZero();
+ break;
+ }
+
+ DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
+ unsigned ShAmt = ShiftImm->getZExtValue();
+ if (Opc == X86ISD::VSHLI) {
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+ // Low bits are known zero.
+ Known.Zero.setLowBits(ShAmt);
+ } else {
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+ // High bits are known zero.
+ Known.Zero.setHighBits(ShAmt);
+ }
+ }
break;
}
case X86ISD::VZEXT: {
SDValue N0 = Op.getOperand(0);
- unsigned NumElts = Op.getValueType().getVectorNumElements();
- unsigned InNumElts = N0.getValueType().getVectorNumElements();
- unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
-
- KnownZero = KnownOne = APInt(InBitWidth, 0);
- APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
- DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
- KnownOne = KnownOne.zext(BitWidth);
- KnownZero = KnownZero.zext(BitWidth);
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ EVT SrcVT = N0.getValueType();
+ unsigned InNumElts = SrcVT.getVectorNumElements();
+ unsigned InBitWidth = SrcVT.getScalarSizeInBits();
+ assert(InNumElts >= NumElts && "Illegal VZEXT input");
+
+ Known = KnownBits(InBitWidth);
+ APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
+ DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
+ Known = Known.zext(BitWidth);
+ Known.Zero.setBitsFrom(InBitWidth);
break;
}
}
}
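The new VSHLI/VSRLI handling mirrors generic known-bits propagation for
constant shifts: known bits move with the shift and the vacated positions
become known zero. A minimal standalone sketch of the rule on 64-bit scalar
masks (illustrative names, not the llvm::KnownBits API; callers must guard
ShAmt < 64, matching the uge(BitWidth) check above):

    #include <cstdint>

    struct KnownMask { uint64_t Zero, One; }; // bit set => known 0 / known 1

    static KnownMask shiftKnown(KnownMask In, unsigned ShAmt, bool IsShl) {
      KnownMask Out;
      if (IsShl) {
        Out.Zero = (In.Zero << ShAmt) | ((1ULL << ShAmt) - 1); // low bits zero
        Out.One  = In.One << ShAmt;
      } else { // logical shift right
        Out.Zero = (In.Zero >> ShAmt) | ~(~0ULL >> ShAmt);     // high bits zero
        Out.One  = In.One >> ShAmt;
      }
      return Out;
    }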
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
- // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
- if (Op.getOpcode() == X86ISD::SETCC_CARRY)
- return Op.getScalarValueSizeInBits();
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned VTBits = Op.getScalarValueSizeInBits();
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case X86ISD::SETCC_CARRY:
+ // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+ return VTBits;
- if (Op.getOpcode() == X86ISD::VSEXT) {
- EVT VT = Op.getValueType();
- EVT SrcVT = Op.getOperand(0).getValueType();
- unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+ case X86ISD::VSEXT: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ Tmp += VTBits - Src.getScalarValueSizeInBits();
return Tmp;
}
+ case X86ISD::VSHLI: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ if (ShiftVal.uge(VTBits))
+ return VTBits; // Shifted all bits out --> zero.
+ if (ShiftVal.uge(Tmp))
+ return 1; // Shifted all sign bits out --> unknown.
+ return Tmp - ShiftVal.getZExtValue();
+ }
+
+ case X86ISD::VSRAI: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ ShiftVal += Tmp;
+ return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
+ }
+
+ case X86ISD::PCMPGT:
+ case X86ISD::PCMPEQ:
+ case X86ISD::CMPP:
+ case X86ISD::VPCOM:
+ case X86ISD::VPCOMU:
+ // Vector compares return zero/all-bits result values.
+ return VTBits;
+ }
+
// Fallback case.
return 1;
}
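The shift cases above reduce to simple arithmetic on the sign-bit count: a
left shift consumes sign bits, an arithmetic right shift adds copies of the
sign bit, clamped to the element width. A standalone sketch (plain C++
mirroring the VSHLI/VSRAI cases, not LLVM code):

    #include <algorithm>

    // Source is known to have Tmp sign bits in a VTBits-wide element.
    static unsigned signBitsAfterShl(unsigned Tmp, unsigned ShAmt,
                                     unsigned VTBits) {
      if (ShAmt >= VTBits) return VTBits; // shifted all bits out --> zero
      if (ShAmt >= Tmp)    return 1;      // shifted all sign bits out
      return Tmp - ShAmt;
    }
    static unsigned signBitsAfterSra(unsigned Tmp, unsigned ShAmt,
                                     unsigned VTBits) {
      return std::min(VTBits, Tmp + ShAmt); // sra replicates the sign bit
    }
    // e.g. 5 sign bits in an i16: 2 remain after shl 3, 8 after sra 3.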
@@ -26228,24 +27097,17 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
- // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
- if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
- isUndefOrEqual(Mask[0], 0) &&
- isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
- }
-
- // Match against a VZEXT instruction.
- // TODO: Add 256/512-bit vector support.
- if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+ // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+ // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool Match = true;
@@ -26255,19 +27117,32 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (Match) {
- SrcVT = MaskVT;
+ unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
+ SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
+ if (SrcVT != MaskVT)
+ V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
- Shuffle = X86ISD::VZEXT;
+ Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
+ : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
return true;
}
}
}
+ // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+ if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+ isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+
// Check if we have SSE3, which will let us use MOVDDUP etc. These
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
- if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+ if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
@@ -26285,7 +27160,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- if (MaskVT.is256BitVector() && FloatDomain) {
+ if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
@@ -26304,7 +27179,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- if (MaskVT.is512BitVector() && FloatDomain) {
+ if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
@@ -26343,47 +27218,78 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
+ const APInt &Zeroable,
+ bool AllowFloatDomain,
+ bool AllowIntDomain,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
+ unsigned InputSizeInBits = MaskVT.getSizeInBits();
+ unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
+ MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
- bool ContainsZeros = false;
- SmallBitVector Zeroable(NumMaskElts, false);
- for (unsigned i = 0; i != NumMaskElts; ++i) {
- int M = Mask[i];
- Zeroable[i] = isUndefOrZero(M);
- ContainsZeros |= (M == SM_SentinelZero);
- }
+ bool ContainsZeros =
+ llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
- // Attempt to match against byte/bit shifts.
- // FIXME: Add 512-bit support.
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
- MaskVT.getScalarSizeInBits(), Mask,
- 0, Zeroable, Subtarget);
- if (0 < ShiftAmt) {
- PermuteImm = (unsigned)ShiftAmt;
+ // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
+ if (!ContainsZeros && MaskScalarSizeInBits == 64) {
+ // Check for lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+ // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+ if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
+ PermuteImm = getV4X86ShuffleImm(Mask);
+ return true;
+ }
+ if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
+ PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+ return true;
+ }
+ }
+ } else if (AllowFloatDomain && Subtarget.hasAVX()) {
+ // VPERMILPD can permute with a non-repeating shuffle.
+ Shuffle = X86ISD::VPERMILPI;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+ PermuteImm = 0;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+ PermuteImm |= (M & 1) << i;
+ }
return true;
}
}
- // Ensure we don't contain any zero elements.
- if (ContainsZeros)
- return false;
-
- assert(llvm::all_of(Mask, [&](int M) {
- return SM_SentinelUndef <= M && M < (int)NumMaskElts;
- }) && "Expected unary shuffle");
-
- unsigned InputSizeInBits = MaskVT.getSizeInBits();
- unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
- MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+ // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
+ // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
+ // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+ if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
+ !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+ // Narrow the repeated mask to create 32-bit element permutes.
+ SmallVector<int, 4> WordMask = RepeatedMask;
+ if (MaskScalarSizeInBits == 64)
+ scaleShuffleMask(2, RepeatedMask, WordMask);
+
+ Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
+ ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
+ PermuteImm = getV4X86ShuffleImm(WordMask);
+ return true;
+ }
+ }
- // Handle PSHUFLW/PSHUFHW repeated patterns.
- if (MaskScalarSizeInBits == 16) {
+ // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(Mask.data() + 0, 4);
@@ -26411,110 +27317,59 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
-
- return false;
}
- return false;
}
- // We only support permutation of 32/64 bit elements after this.
- if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
- return false;
-
- // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
- // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
- if (FloatDomain && !Subtarget.hasAVX())
- return false;
-
- // Pre-AVX2 we must use float shuffles on 256-bit vectors.
- if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
- FloatDomain = true;
-
- // Check for lane crossing permutes.
- if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
- // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
- if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
- Shuffle = X86ISD::VPERMI;
- ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
- PermuteImm = getV4X86ShuffleImm(Mask);
+ // Attempt to match against byte/bit shifts.
+ // FIXME: Add 512-bit support.
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+ MaskScalarSizeInBits, Mask,
+ 0, Zeroable, Subtarget);
+ if (0 < ShiftAmt) {
+ PermuteImm = (unsigned)ShiftAmt;
return true;
}
- if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
- SmallVector<int, 4> RepeatedMask;
- if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
- Shuffle = X86ISD::VPERMI;
- ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
- PermuteImm = getV4X86ShuffleImm(RepeatedMask);
- return true;
- }
- }
- return false;
- }
-
- // VPERMILPD can permute with a non-repeating shuffle.
- if (FloatDomain && MaskScalarSizeInBits == 64) {
- Shuffle = X86ISD::VPERMILPI;
- ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
- PermuteImm = 0;
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
- PermuteImm |= (M & 1) << i;
- }
- return true;
}
- // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
- SmallVector<int, 4> RepeatedMask;
- if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
- return false;
-
- // Narrow the repeated mask for 32-bit element permutes.
- SmallVector<int, 4> WordMask = RepeatedMask;
- if (MaskScalarSizeInBits == 64)
- scaleShuffleMask(2, RepeatedMask, WordMask);
-
- Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
- ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
- ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
- PermuteImm = getV4X86ShuffleImm(WordMask);
- return true;
+ return false;
}
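The VPERMILPD path above packs one selector bit per 64-bit element: element i
picks the low (bit 0) or high (bit 1) double of its own 128-bit pair. A
standalone sketch of that encoding (hypothetical helper; mask conventions as
in the code, with negative entries standing in for SM_SentinelUndef):

    #include <vector>

    static unsigned buildVPermilPDImm(const std::vector<int> &Mask) {
      unsigned Imm = 0;
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        if (Mask[i] >= 0)            // undef elements contribute nothing
          Imm |= (unsigned)(Mask[i] & 1) << i;
      return Imm;
    }
    // e.g. the v4f64 mask {1,0,3,2} (swap within each lane) encodes as 0b0101.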
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain, SDValue &V1, SDValue &V2,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
- if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVLHPS;
ShuffleVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
- (FloatDomain || !Subtarget.hasSSE41())) {
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
ShuffleVT = MaskVT;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
- (FloatDomain || !Subtarget.hasSSE41())) {
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
ShuffleVT = MaskVT;
return true;
@@ -26527,57 +27382,12 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
- MVT LegalVT = MaskVT;
- if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
- LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
-
- SmallVector<int, 64> Unpckl, Unpckh;
- if (IsUnary) {
- createUnpackShuffleMask(MaskVT, Unpckl, true, true);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- V2 = V1;
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- createUnpackShuffleMask(MaskVT, Unpckh, false, true);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- V2 = V1;
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
- } else {
- createUnpackShuffleMask(MaskVT, Unpckl, true, false);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- createUnpackShuffleMask(MaskVT, Unpckh, false, false);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
-
- ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- std::swap(V1, V2);
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- std::swap(V1, V2);
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
+ if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
+ DAG, Subtarget)) {
+ ShuffleVT = MaskVT;
+ if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
+ ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+ return true;
}
}
@@ -26585,17 +27395,20 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
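For reference, the masks matchVectorShuffleWithUNPCK is matching against
interleave the per-lane halves of the two inputs. A standalone sketch of the
reference mask (illustrative helper; not the createUnpackShuffleMask
signature):

    #include <vector>

    // UNPCKL/UNPCKH reference mask: NumElts elements split across Lanes
    // 128-bit lanes, interleaving V1 (indices < NumElts) with V2.
    static std::vector<int> unpackMask(unsigned NumElts, unsigned Lanes,
                                       bool Lo) {
      std::vector<int> Mask;
      unsigned PerLane = NumElts / Lanes;
      for (unsigned L = 0; L != Lanes; ++L)
        for (unsigned i = 0; i != PerLane / 2; ++i) {
          unsigned Base = L * PerLane + (Lo ? 0 : PerLane / 2) + i;
          Mask.push_back(Base);           // element from V1
          Mask.push_back(Base + NumElts); // matching element from V2
        }
      return Mask;
    }
    // e.g. unpackMask(4, 1, /*Lo=*/true) == {0, 4, 1, 5} -- v4i32 UNPCKL.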
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
- SDValue &V1, SDValue &V2,
- SDLoc &DL, SelectionDAG &DAG,
+ const APInt &Zeroable,
+ bool AllowFloatDomain,
+ bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
+ unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Attempt to match against PALIGNR byte rotate.
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
@@ -26606,77 +27419,69 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to combine to X86ISD::BLENDI.
- if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
- (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
- // Determine a type compatible with X86ISD::BLENDI.
- // TODO - add 16i16 support (requires lane duplication).
- MVT BlendVT = MaskVT;
- if (Subtarget.hasAVX2()) {
- if (BlendVT == MVT::v4i64)
- BlendVT = MVT::v8i32;
- else if (BlendVT == MVT::v2i64)
- BlendVT = MVT::v4i32;
- } else {
- if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
- BlendVT = MVT::v8i16;
- else if (BlendVT == MVT::v4i64)
- BlendVT = MVT::v4f64;
- else if (BlendVT == MVT::v8i32)
- BlendVT = MVT::v8f32;
- }
-
- unsigned BlendSize = BlendVT.getVectorNumElements();
- unsigned MaskRatio = BlendSize / NumMaskElts;
-
- // Can we blend with zero?
- if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
- /*Low*/ 0) &&
- NumMaskElts <= BlendVT.getVectorNumElements()) {
- PermuteImm = 0;
- for (unsigned i = 0; i != BlendSize; ++i)
- if (Mask[i / MaskRatio] < 0)
- PermuteImm |= 1u << i;
-
- V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
- Shuffle = X86ISD::BLENDI;
- ShuffleVT = BlendVT;
- return true;
- }
-
- // Attempt to match as a binary blend.
- if (NumMaskElts <= BlendVT.getVectorNumElements()) {
- bool MatchBlend = true;
- for (int i = 0; i != (int)NumMaskElts; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- else if (M == SM_SentinelZero)
- MatchBlend = false;
- else if ((M != i) && (M != (i + (int)NumMaskElts)))
- MatchBlend = false;
- }
+ if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+ (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
+ (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
+ if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
+ BlendMask)) {
+ if (MaskVT == MVT::v16i16) {
+ // We can only use v16i16 PBLENDW if the lanes are repeated.
+ SmallVector<int, 8> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
+ RepeatedMask)) {
+ assert(RepeatedMask.size() == 8 &&
+ "Repeated mask size doesn't match!");
+ PermuteImm = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 8)
+ PermuteImm |= 1 << i;
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ } else {
+ // Determine a type compatible with X86ISD::BLENDI.
+ ShuffleVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v8i32;
+ else if (ShuffleVT == MVT::v2i64)
+ ShuffleVT = MVT::v4i32;
+ } else {
+ if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+ ShuffleVT = MVT::v8i16;
+ else if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v4f64;
+ else if (ShuffleVT == MVT::v8i32)
+ ShuffleVT = MVT::v8f32;
+ }
- if (MatchBlend) {
- PermuteImm = 0;
- for (unsigned i = 0; i != BlendSize; ++i)
- if ((int)NumMaskElts <= Mask[i / MaskRatio])
- PermuteImm |= 1u << i;
+ if (!ShuffleVT.isFloatingPoint()) {
+ int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+ BlendMask =
+ scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+ ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+ }
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
- ShuffleVT = BlendVT;
return true;
}
}
}
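When the blend is re-typed to narrower integer elements above,
scaleVectorShuffleBlendMask widens the immediate by replicating each selector
bit Scale times. A standalone sketch of that semantics (not the LLVM helper
itself):

    #include <cstdint>

    static uint64_t scaleBlendMask(uint64_t BlendMask, unsigned NumElts,
                                   unsigned Scale) {
      uint64_t Scaled = 0;
      for (unsigned i = 0; i != NumElts; ++i)
        if (BlendMask & (1ULL << i))
          Scaled |= ((1ULL << Scale) - 1) << (i * Scale);
      return Scaled;
    }
    // e.g. the v2i64 blend mask 0b10 scaled by 2 is the v4i32 mask 0b1100.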
// Attempt to combine to INSERTPS.
- if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
- SmallBitVector Zeroable(4, false);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (Mask[i] < 0)
- Zeroable[i] = true;
-
- if (Zeroable.any() &&
+ if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ MaskVT.is128BitVector()) {
+ if (Zeroable.getBoolValue() &&
matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
@@ -26685,22 +27490,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to combine to SHUFPD.
- if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
- (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
- (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
+ if (AllowFloatDomain && EltSizeInBits == 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
- ShuffleVT = MaskVT;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
}
}
// Attempt to combine to SHUFPS.
- if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
- (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+ if (AllowFloatDomain && EltSizeInBits == 32 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+ // Match each half of the repeated mask to determine whether it just
+ // references one of the vectors, is zeroable, or is entirely undef.
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
@@ -26732,7 +27541,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
- ShuffleVT = MaskVT;
+ ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
@@ -26764,7 +27573,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
- SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
+ SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
+ : peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
@@ -26853,6 +27663,18 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
+ // Which shuffle domains are permitted?
+ // Permit domain crossing at higher combine depths.
+ bool AllowFloatDomain = FloatDomain || (Depth > 3);
+ bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
+ (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
+
+ // Determine zeroable mask elements.
+ APInt Zeroable(NumMaskElts, 0);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (isUndefOrZero(Mask[i]))
+ Zeroable.setBit(i);
+
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
// directly if we don't shuffle the lower element and we shuffle the upper
@@ -26869,8 +27691,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT)) {
+ if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+ V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26884,8 +27707,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
- Shuffle, ShuffleVT, PermuteImm)) {
+ if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, Subtarget, Shuffle,
+ ShuffleVT, PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26901,8 +27725,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
- Shuffle, ShuffleVT, UnaryShuffle)) {
+ if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+ V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
+ UnaryShuffle)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26918,8 +27743,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
- DAG, Subtarget, Shuffle, ShuffleVT,
+ if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, V1, V2, DL, DAG,
+ Subtarget, Shuffle, ShuffleVT,
PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
@@ -26937,6 +27763,45 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
+ // Typically from here on, we need an integer version of MaskVT.
+ MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
+
+ // Annoyingly, SSE4A instructions don't map into the above match helpers.
+ if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
+ uint64_t BitLen, BitIdx;
+ if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
+ Zeroable)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
+ return false; // Nothing to do!
+ V1 = DAG.getBitcast(IntMaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
+ return false; // Nothing to do!
+ V1 = DAG.getBitcast(IntMaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(IntMaskVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+ }
+
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
@@ -26957,9 +27822,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -26988,9 +27851,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -27015,9 +27876,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
V1 = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(V1.getNode());
@@ -27039,12 +27898,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
- SmallBitVector UndefElts(NumMaskElts, false);
+ APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
@@ -27076,8 +27935,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
VPermIdx.push_back(Idx);
}
- MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
- SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+ SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -27100,8 +27958,6 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
- MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
- MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
@@ -27121,7 +27977,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
DCI.AddToWorklist(V2.getNode());
- SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
+ SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPerm2MaskOp.getNode());
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
@@ -27228,8 +28084,8 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
- SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
- SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+ SmallVector<APInt, 16> UndefEltsOps(NumOps);
+ SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
@@ -27245,18 +28101,18 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
return false;
// Shuffle the constant bits according to the mask.
- SmallBitVector UndefElts(NumMaskElts, false);
- SmallBitVector ZeroElts(NumMaskElts, false);
- SmallBitVector ConstantElts(NumMaskElts, false);
+ APInt UndefElts(NumMaskElts, 0);
+ APInt ZeroElts(NumMaskElts, 0);
+ APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
} else if (M == SM_SentinelZero) {
- ZeroElts[i] = true;
+ ZeroElts.setBit(i);
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
@@ -27266,21 +28122,21 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
- ZeroElts[i] = true;
+ ZeroElts.setBit(i);
continue;
}
- ConstantElts[i] = true;
+ ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
- assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+ assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Create the constant data.
MVT MaskSVT;
@@ -27330,6 +28186,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask,
+ ArrayRef<const SDNode*> SrcNodes,
int Depth, bool HasVariableMask,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -27353,13 +28210,17 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
- SDValue Input0, Input1;
- SmallVector<int, 16> OpMask;
- if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
+ SmallVector<int, 64> OpMask;
+ SmallVector<SDValue, 2> OpInputs;
+ if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
return false;
+ assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
+ SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
+ SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
+
// Add the inputs to the Ops list, avoiding duplicates.
- SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+ SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
int InputIdx0 = -1, InputIdx1 = -1;
for (int i = 0, e = Ops.size(); i < e; ++i) {
@@ -27385,52 +28246,70 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
- int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
- int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
- int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
- assert(((RootRatio == 1 && OpRatio == 1) ||
- (RootRatio == 1) != (OpRatio == 1)) &&
+
+ // This function can be performance-critical, so we rely on the power-of-2
+ // knowledge that we have about the mask sizes to replace div/rem ops with
+ // bit-masks and shifts.
+ assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
+ unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
+
+ unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
+ unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
+ unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
+ assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
- SmallVector<int, 16> Mask;
- Mask.reserve(MaskWidth);
+ assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
+ unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
+
+ SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
// root mask to get us all the way to the root value arrangement. The reason
// for this order is that we are recursing up the operation chain.
- for (int i = 0; i < MaskWidth; ++i) {
- int RootIdx = i / RootRatio;
+ for (unsigned i = 0; i < MaskWidth; ++i) {
+ unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
- Mask.push_back(RootMask[RootIdx]);
+ Mask[i] = RootMask[RootIdx];
continue;
}
- int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+ unsigned RootMaskedIdx =
+ RootRatio == 1
+ ? RootMask[RootIdx]
+ : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
- Mask.push_back(RootMaskedIdx);
+ Mask[i] = RootMaskedIdx;
continue;
}
- RootMaskedIdx %= MaskWidth;
-
- int OpIdx = RootMaskedIdx / OpRatio;
+ RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
+ unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, it doesn't matter which ones we
// are using.
- Mask.push_back(OpMask[OpIdx]);
+ Mask[i] = OpMask[OpIdx];
continue;
}
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
- int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
- OpMaskedIdx %= MaskWidth;
+ unsigned OpMaskedIdx =
+ OpRatio == 1
+ ? OpMask[OpIdx]
+ : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
+ OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
if (OpMask[OpIdx] < (int)OpMask.size()) {
assert(0 <= InputIdx0 && "Unknown target shuffle input");
OpMaskedIdx += InputIdx0 * MaskWidth;
@@ -27439,7 +28318,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
OpMaskedIdx += InputIdx1 * MaskWidth;
}
- Mask.push_back(OpMaskedIdx);
+ Mask[i] = OpMaskedIdx;
}
// Handle the all undef/zero cases early.
@@ -27457,28 +28336,25 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
}
// Remove unused shuffle source ops.
- SmallVector<SDValue, 8> UsedOps;
- for (int i = 0, e = Ops.size(); i < e; ++i) {
- int lo = UsedOps.size() * MaskWidth;
- int hi = lo + MaskWidth;
- if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
- UsedOps.push_back(Ops[i]);
- continue;
- }
- for (int &M : Mask)
- if (lo <= M)
- M -= MaskWidth;
- }
- assert(!UsedOps.empty() && "Shuffle with no inputs detected");
- Ops = UsedOps;
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+ assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
- // See if we can recurse into each shuffle source op (if it's a target shuffle).
+ // Update the list of shuffle nodes that have been combined so far.
+ SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
+ SrcNodes.end());
+ CombinedNodes.push_back(Op.getNode());
+
+ // See if we can recurse into each shuffle source op (if it's a target
+ // shuffle). The source op should only be combined if it either has a
+ // single use (i.e. current Op) or all its users have already been combined.
for (int i = 0, e = Ops.size(); i < e; ++i)
- if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
- if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
- HasVariableMask, DAG, DCI, Subtarget))
+ if (Ops[i].getNode()->hasOneUse() ||
+ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+ if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
+ Depth + 1, HasVariableMask, DAG, DCI,
+ Subtarget))
return true;
// Attempt to constant fold all of the constant source ops.
@@ -27495,7 +28371,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- SmallVector<int, 16> WidenedMask;
+ SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
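The merge loop earlier in this function composes the root mask with the op's
mask; once the power-of-2 ratio bookkeeping is stripped away, the core rule
is plain index composition. A minimal sketch of the equal-width, single-input
case (negative sentinel entries propagate unchanged, as in the loop above):

    #include <vector>

    static std::vector<int> composeMasks(const std::vector<int> &RootMask,
                                         const std::vector<int> &OpMask) {
      // Op's shuffle is applied first, then the root mask indexes into it.
      std::vector<int> Mask(RootMask.size());
      for (size_t i = 0; i != RootMask.size(); ++i)
        Mask[i] = RootMask[i] < 0 ? RootMask[i] : OpMask[RootMask[i]];
      return Mask;
    }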
@@ -27561,8 +28437,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
- SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
@@ -27842,19 +28717,20 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
- bool isFloat = VT.isFloatingPoint();
SDValue V0 = peekThroughBitcasts(N->getOperand(0));
SDValue V1 = peekThroughBitcasts(N->getOperand(1));
- bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
- bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
- assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+ if (isZero0 && isZero1)
+ return SDValue();
// We often lower to MOVSD/MOVSS from integer as well as native float
// types; remove unnecessary domain-crossing bitcasts if we can to make it
// easier to combine shuffles later on. We've already accounted for the
// domain switching cost when we decided to lower with it.
+ bool isFloat = VT.isFloatingPoint();
+ bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+ bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
@@ -28025,7 +28901,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
break;
case X86ISD::PSHUFD:
- if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+ if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
break;
@@ -28173,12 +29049,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
-
- // Don't create instructions with illegal types after legalize types has run.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
- return SDValue();
-
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
if (TLI.isTypeLegal(VT))
@@ -28249,11 +29120,19 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
SmallVector<SDValue, 16> Elts;
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
- Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ Elts.push_back(Elt);
+ continue;
+ }
+ Elts.clear();
+ break;
+ }
- if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
- return LD;
+ if (Elts.size() == VT.getVectorNumElements())
+ if (SDValue LD =
+ EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
+ return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
@@ -28276,7 +29155,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// a particular chain.
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
@@ -28303,18 +29182,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EVT OriginalVT = InVec.getValueType();
- if (InVec.getOpcode() == ISD::BITCAST) {
- // Don't duplicate a load with other uses.
- if (!InVec.hasOneUse())
- return SDValue();
- EVT BCVT = InVec.getOperand(0).getValueType();
- if (!BCVT.isVector() ||
- BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
- return SDValue();
- InVec = InVec.getOperand(0);
- }
+  // Peek through bitcasts; don't duplicate a load with other uses.
+ InVec = peekThroughOneUseBitcasts(InVec);
EVT CurrentVT = InVec.getValueType();
+ if (!CurrentVT.isVector() ||
+ CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+ return SDValue();
if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
@@ -28389,23 +29263,151 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
+// Try to match patterns such as
+// (i16 bitcast (v16i1 x))
+// ->
+// (i16 movmsk (v16i8 sext (v16i1 x)))
+// before the illegal vector is scalarized on subtargets that don't have legal
+// vxi1 types.
+static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
+ const X86Subtarget &Subtarget) {
+ EVT VT = BitCast.getValueType();
+ SDValue N0 = BitCast.getOperand(0);
+ EVT VecVT = N0->getValueType(0);
+
+ if (!VT.isScalarInteger() || !VecVT.isSimple())
+ return SDValue();
+
+ // With AVX512 vxi1 types are legal and we prefer using k-regs.
+ // MOVMSK is supported in SSE2 or later.
+ if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
+ return SDValue();
+
+ // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
+ // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
+ // v8i16 and v16i16.
+ // For these two cases, we can shuffle the upper element bytes to a
+ // consecutive sequence at the start of the vector and treat the results as
+  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
+ // for v16i16 this is not the case, because the shuffle is expensive, so we
+ // avoid sign-extending to this type entirely.
+ // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
+ // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
+ MVT SExtVT;
+ MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ switch (VecVT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v2i1:
+ SExtVT = MVT::v2i64;
+ FPCastVT = MVT::v2f64;
+ break;
+ case MVT::v4i1:
+ SExtVT = MVT::v4i32;
+ FPCastVT = MVT::v4f32;
+ // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
+ // sign-extend to a 256-bit operation to avoid truncation.
+ if (N0->getOpcode() == ISD::SETCC &&
+ N0->getOperand(0)->getValueType(0).is256BitVector() &&
+ Subtarget.hasInt256()) {
+ SExtVT = MVT::v4i64;
+ FPCastVT = MVT::v4f64;
+ }
+ break;
+ case MVT::v8i1:
+ SExtVT = MVT::v8i16;
+ // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
+ // sign-extend to a 256-bit operation to match the compare.
+ // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
+ // 256-bit because the shuffle is cheaper than sign extending the result of
+ // the compare.
+ if (N0->getOpcode() == ISD::SETCC &&
+ N0->getOperand(0)->getValueType(0).is256BitVector() &&
+ Subtarget.hasInt256()) {
+ SExtVT = MVT::v8i32;
+ FPCastVT = MVT::v8f32;
+ }
+ break;
+ case MVT::v16i1:
+ SExtVT = MVT::v16i8;
+ // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
+ // it is not profitable to sign-extend to 256-bit because this will
+ // require an extra cross-lane shuffle which is more expensive than
+ // truncating the result of the compare to 128-bits.
+ break;
+ case MVT::v32i1:
+ // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
+ if (!Subtarget.hasInt256())
+ return SDValue();
+ SExtVT = MVT::v32i8;
+ break;
+  }
+
+ SDLoc DL(BitCast);
+ SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
+ if (SExtVT == MVT::v8i16) {
+ V = DAG.getBitcast(MVT::v16i8, V);
+ V = DAG.getVectorShuffle(
+ MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
+ {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
+ } else
+ assert(SExtVT.getScalarType() != MVT::i16 &&
+ "Vectors of i16 must be shuffled");
+ if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ V = DAG.getBitcast(FPCastVT, V);
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+ return DAG.getZExtOrTrunc(V, DL, VT);
+}
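
A minimal scalar model of the rewrite above, assuming only MOVMSK's documented
behavior of gathering one sign bit per element; the helper name movmsk_b is
illustrative, not an LLVM API:

    // Model: (i16 bitcast (v16i1 x)) becomes MOVMSK of (v16i8 sext x),
    // because sign-extending each i1 puts the bit into every byte's sign bit.
    #include <cassert>
    #include <cstdint>

    static uint16_t movmsk_b(const int8_t v[16]) {
      uint16_t Mask = 0;
      for (int i = 0; i < 16; ++i)
        Mask |= (uint16_t)(((uint8_t)v[i]) >> 7) << i; // sign bit of byte i
      return Mask;
    }

    int main() {
      bool Bits[16] = {};
      Bits[0] = Bits[3] = Bits[15] = true;
      int8_t Sext[16];
      for (int i = 0; i < 16; ++i)
        Sext[i] = Bits[i] ? -1 : 0;                    // v16i8 sext of v16i1
      assert(movmsk_b(Sext) == ((1u << 0) | (1u << 3) | (1u << 15)));
      return 0;
    }
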
+
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ // Try to match patterns such as
+ // (i16 bitcast (v16i1 x))
+ // ->
+  // (i16 movmsk (v16i8 sext (v16i1 x)))
+ // before the setcc result is scalarized on subtargets that don't have legal
+ // vxi1 types.
+ if (DCI.isBeforeLegalize())
+ if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
+ return V;
-  // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
-  // special and don't usually play with other vector types, it's better to
-  // handle them early to be sure we emit efficient code by avoiding
-  // store-load conversions.
+  // Since MMX types are special and don't usually play with other vector types,
+  // it's better to handle them early to be sure we emit efficient code by
+  // avoiding store-load conversions.
+  // Detect bitcasts from i32 to the x86mmx low word.
if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
- N0.getValueType() == MVT::v2i32 &&
- isNullConstant(N0.getOperand(1))) {
+ SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0->getOperand(0);
if (N00.getValueType() == MVT::i32)
return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
}
+  // Detect bitcasts of element or subvector extractions to x86mmx.
+ if (VT == MVT::x86mmx &&
+ (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0->getOperand(0);
+ if (N00.getValueType().is128BitVector())
+ return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
+ DAG.getBitcast(MVT::v2i64, N00));
+ }
+
+ // Detect bitcasts from FP_TO_SINT to x86mmx.
+ if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
+ N0.getOpcode() == ISD::FP_TO_SINT) {
+ SDLoc DL(N0);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+ DAG.getBitcast(MVT::v2i64, Res));
+ }
+
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
@@ -28511,12 +29513,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOpcode() != ISD::SETCC)
return false;
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT)
+ if (CC != ISD::SETGT && CC != ISD::SETLT)
return false;
SDValue SelectOp1 = Select->getOperand(1);
SDValue SelectOp2 = Select->getOperand(2);
+ // The following instructions assume SelectOp1 is the subtraction operand
+ // and SelectOp2 is the negation operand.
+ // In the case of SETLT this is the other way around.
+ if (CC == ISD::SETLT)
+ std::swap(SelectOp1, SelectOp2);
+
// The second operand of the select should be the negation of the first
// operand, which is implemented as 0 - SelectOp1.
if (!(SelectOp2.getOpcode() == ISD::SUB &&
@@ -28529,8 +29537,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOperand(0) != SelectOp1)
return false;
- // The second operand of the comparison can be either -1 or 0.
- if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+  // In the SETLT case, the second operand of the comparison can be either 1 or 0.
+ APInt SplatVal;
+ if ((CC == ISD::SETLT) &&
+ !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal,
+ /*AllowShrink*/false) &&
+ SplatVal.isOneValue()) ||
+ (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+ return false;
+
+  // In the SETGT case, the second operand of the comparison can be either -1 or 0.
+ if ((CC == ISD::SETGT) &&
+ !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
return false;
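
The operand swap above works because both compare forms select the same
absolute value; a scalar sanity check of that equivalence (a sketch, not the
vector code):

    #include <cassert>

    // setgt(x, -1) ? x : -x  and  setlt(x, 1) ? -x : x  agree everywhere.
    static int absSetGT(int X) { return X > -1 ? X : 0 - X; }
    static int absSetLT(int X) { return X < 1 ? 0 - X : X; }

    int main() {
      for (int X = -100; X <= 100; ++X)
        assert(absSetGT(X) == absSetLT(X));
      return 0;
    }
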
@@ -28576,17 +29594,92 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
+// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+static SDValue combineHorizontalPredicateResult(SDNode *Extract,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Bail without SSE2 or with AVX512VL (which uses predicate registers).
+ if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ unsigned BitWidth = ExtractVT.getSizeInBits();
+ if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+ ExtractVT != MVT::i8)
+ return SDValue();
+
+ // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+ for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
+ SDValue Match = matchBinOpReduction(Extract, Op);
+ if (!Match)
+ continue;
+
+ // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
+ // which we can't support here for now.
+ if (Match.getScalarValueSizeInBits() != BitWidth)
+ continue;
+
+    // We require AVX2 for PMOVMSKB for v16i16/v32i8.
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 &&
+ ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+ return SDValue();
+
+ // Don't bother performing this for 2-element vectors.
+ if (Match.getValueType().getVectorNumElements() <= 2)
+ return SDValue();
+
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ APInt CompareBits;
+ ISD::CondCode CondCode;
+ if (Op == ISD::OR) {
+ // any_of -> MOVMSK != 0
+ CompareBits = APInt::getNullValue(32);
+ CondCode = ISD::CondCode::SETNE;
+ } else {
+ // all_of -> MOVMSK == ((1 << NumElts) - 1)
+ CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CondCode = ISD::CondCode::SETEQ;
+ }
+
+ // Perform the select as i32/i64 and then truncate to avoid partial register
+ // stalls.
+ unsigned ResWidth = std::max(BitWidth, 32u);
+ EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
+ SDLoc DL(Extract);
+ SDValue Zero = DAG.getConstant(0, DL, ResVT);
+ SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
+ SDValue Res = DAG.getBitcast(MaskVT, Match);
+ Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+ Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+ Ones, Zero, CondCode);
+ return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ }
+
+ return SDValue();
+}
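
What the MOVMSK rewrite computes, in scalar form: once every lane is known to
be all-zeros or all-ones, an OR reduction ("any_of") and an AND reduction
("all_of") are both answerable from the gathered sign-bit mask alone. A sketch
under that all-bits assumption:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned NumElts = 4;
      uint32_t Lanes[NumElts] = {0, ~0u, 0, ~0u}; // a v4i32 compare result
      unsigned Msk = 0;
      for (unsigned i = 0; i != NumElts; ++i)
        Msk |= (Lanes[i] >> 31) << i;             // MOVMSKPS model
      bool AnyOf = Msk != 0;                      // OR reduction
      bool AllOf = Msk == (1u << NumElts) - 1;    // AND reduction
      assert(AnyOf && !AllOf);
      return 0;
    }
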
+
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
if (!Subtarget.hasSSE2())
return SDValue();
- // Verify the type we're extracting from is appropriate
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
+  // Verify the type we're extracting from is a vector of integers wider than i16.
EVT VT = Extract->getOperand(0).getValueType();
- if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+ if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
return SDValue();
unsigned RegSize = 128;
@@ -28595,15 +29688,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
else if (Subtarget.hasAVX2())
RegSize = 256;
- // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
- if (VT.getSizeInBits() / 4 > RegSize)
+ if (RegSize / VT.getVectorNumElements() < 8)
return SDValue();
// Match shuffle + add pyramid.
SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+  // The operand is expected to be zero extended from i8
+  // (verified in detectZextAbsDiff).
+  // To produce an i64 or wider result, an additional any/zero/sign
+  // extend is expected.
+  // A zero extend from 32 bits has no mathematical effect on the result,
+  // and here the sign extend behaves like a zero extend
+  // (the bit being extended is known to be zero),
+  // so it is correct to look through the sign/zero extend instruction.
+ if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
+ Root = Root.getOperand(0);
+
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
if (!Root || (Root.getOpcode() != ISD::VSELECT))
@@ -28614,7 +29720,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
- // Create the SAD instruction
+ // Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
@@ -28636,13 +29742,103 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
}
}
- // Return the lowest i32.
- MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+ MVT Type = Extract->getSimpleValueType(0);
+ unsigned TypeSizeInBits = Type.getSizeInBits();
+ // Return the lowest TypeSizeInBits bits.
+ MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
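
A scalar reference for one PSADBW lane, i.e. what the matched shuffle+add
pyramid reduces to under the zero-extend-from-i8 assumption checked in
detectZextAbsDiff (sketch only):

    #include <cassert>
    #include <cstdint>

    // Sum of absolute differences of eight unsigned bytes.
    static uint16_t psadbw8(const uint8_t *A, const uint8_t *B) {
      uint16_t Sum = 0;
      for (int i = 0; i < 8; ++i)
        Sum += A[i] > B[i] ? A[i] - B[i] : B[i] - A[i];
      return Sum;
    }

    int main() {
      uint8_t A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      uint8_t B[8] = {8, 7, 6, 5, 4, 3, 2, 1};
      assert(psadbw8(A, B) == 7 + 5 + 3 + 1 + 1 + 3 + 5 + 7);
      return 0;
    }
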
+// Attempt to peek through a target shuffle and extract the scalar from the
+// source.
+static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+
+ // Don't attempt this for boolean mask vectors or unknown extraction indices.
+ if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ // Resolve the target shuffle inputs and mask.
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
+ return SDValue();
+
+ // Attempt to narrow/widen the shuffle mask to the correct size.
+ if (Mask.size() != NumSrcElts) {
+ if ((NumSrcElts % Mask.size()) == 0) {
+ SmallVector<int, 16> ScaledMask;
+ int Scale = NumSrcElts / Mask.size();
+ scaleShuffleMask(Scale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
+ } else if ((Mask.size() % NumSrcElts) == 0) {
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > NumSrcElts &&
+ canWidenShuffleElements(Mask, WidenedMask))
+ Mask = std::move(WidenedMask);
+ // TODO - investigate support for wider shuffle masks with known upper
+ // undef/zero elements for implicit zero-extension.
+ }
+ }
+
+ // Check if narrowing/widening failed.
+ if (Mask.size() != NumSrcElts)
+ return SDValue();
+
+ int SrcIdx = Mask[N->getConstantOperandVal(1)];
+ SDLoc dl(N);
+
+ // If the shuffle source element is undef/zero then we can just accept it.
+ if (SrcIdx == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+
+ if (SrcIdx == SM_SentinelZero)
+ return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
+ : DAG.getConstant(0, dl, VT);
+
+ SDValue SrcOp = Ops[SrcIdx / Mask.size()];
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
+ SrcIdx = SrcIdx % Mask.size();
+
+ // We can only extract other elements from 128-bit vectors and in certain
+ // circumstances, depending on SSE-level.
+ // TODO: Investigate using extract_subvector for larger vectors.
+ // TODO: Investigate float/double extraction if it will be just stored.
+ if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
+ ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
+ assert(SrcSVT == VT && "Unexpected extraction type");
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ }
+
+ if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+ (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
+ "Unexpected extraction type");
+ unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
+ DAG.getValueType(SrcSVT));
+ return DAG.getZExtOrTrunc(Assert, dl, VT);
+ }
+
+ return SDValue();
+}
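
The mask rescaling step above, modeled on its own: scaling a mask up splits
each entry into consecutive sub-element indices while keeping sentinels
(negative entries) intact. The helper below is a hypothetical stand-in for
scaleShuffleMask, shown only for the scale-up direction:

    #include <cassert>
    #include <vector>

    static std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
      std::vector<int> Out;
      for (int M : Mask)
        for (int i = 0; i != Scale; ++i)
          Out.push_back(M < 0 ? M : M * Scale + i); // sentinels pass through
      return Out;
    }

    int main() {
      std::vector<int> V2 = {1, 0}; // element swap of a v2i64
      // The same swap expressed on v4i32.
      assert(scaleMask(2, V2) == (std::vector<int>{2, 3, 0, 1}));
      return 0;
    }
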
+
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -28653,14 +29849,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
+ if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+ return NewOp;
+
SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
+
+  // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
+ SDValue MMXSrc = InputVector.getOperand(0);
+
+ // The bitcast source is a direct mmx result.
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getBitcast(VT, InputVector);
+ }
+
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
- N->getValueType(0) == MVT::i32 &&
- InputVector.getValueType() == MVT::v2i32 &&
- isa<ConstantSDNode>(N->getOperand(1)) &&
- N->getConstantOperandVal(1) == 0) {
+ VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
@@ -28668,15 +29879,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
- EVT VT = N->getValueType(0);
-
- if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
- InputVector.getOpcode() == ISD::BITCAST &&
+ if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
+ isa<ConstantSDNode>(EltIdx) &&
isa<ConstantSDNode>(InputVector.getOperand(0))) {
- uint64_t ExtractedElt =
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- uint64_t InputValue =
- cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ uint64_t ExtractedElt = N->getConstantOperandVal(1);
+ uint64_t InputValue = InputVector.getConstantOperandVal(0);
uint64_t Res = (InputValue >> ExtractedElt) & 1;
return DAG.getConstant(Res, dl, MVT::i1);
}
@@ -28687,9 +29894,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
+ // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
+ if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+ return Cmp;
+
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
- if (InputVector.getValueType() != MVT::v4i32)
+ if (SrcVT != MVT::v4i32)
return SDValue();
// Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
@@ -28717,9 +29928,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Record which element was extracted.
- ExtractedElements |=
- 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
-
+ ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
Uses.push_back(Extract);
}
@@ -28752,11 +29961,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
} else {
// Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
MachinePointerInfo());
- EVT ElementType = InputVector.getValueType().getVectorElementType();
+ EVT ElementType = SrcVT.getVectorElementType();
unsigned EltSize = ElementType.getSizeInBits() / 8;
// Replace each use (extract) with a load of the appropriate element.
@@ -28779,8 +29988,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
UE = Uses.end(); UI != UE; ++UI) {
SDNode *Extract = *UI;
- SDValue Idx = Extract->getOperand(1);
- uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ uint64_t IdxVal = Extract->getConstantOperandVal(1);
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}
@@ -28788,6 +29996,16 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// TODO - merge with combineExtractVectorElt once it can handle the implicit
+// zero-extension of X86ISD::PEXTRW/X86ISD::PEXTRB in:
+// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+// combineBasicSADPattern.
+static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
+}
+
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
static SDValue
@@ -28812,12 +30030,11 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
// This situation only applies to avx512.
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
- //Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
- DL, CondVT));
- //Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
+ DAG.getAllOnesConstant(DL, CondVT));
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
@@ -28920,18 +30137,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(ShAmt, DL, MVT::i8));
}
- // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
- if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
-
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
- return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
- SDValue(FalseC, 0));
- }
-
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
@@ -28939,7 +30144,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
if (N->getValueType(0) == MVT::i32)
Diff = (unsigned)Diff;
- bool isFastMultiplier = false;
+ bool IsFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default:
@@ -28951,12 +30156,12 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
- isFastMultiplier = true;
+ IsFastMultiplier = true;
break;
}
}
- if (isFastMultiplier) {
+ if (IsFastMultiplier) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
@@ -29049,7 +30254,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
- if (VT.isInteger() != OpEltVT.isInteger())
+ if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
@@ -29063,7 +30268,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
DCI.AddToWorklist(Op1.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0, Op1,
- DAG.getConstant(Imm, DL, MVT::i8)));
+ DAG.getIntPtrConstant(Imm, DL)));
return true;
}
case ISD::EXTRACT_SUBVECTOR: {
@@ -29072,7 +30277,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
- if (VT.isInteger() != OpEltVT.isInteger())
+ if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
@@ -29084,7 +30289,23 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
DCI.AddToWorklist(Op0.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0,
- DAG.getConstant(Imm, DL, MVT::i8)));
+ DAG.getIntPtrConstant(Imm, DL)));
+ return true;
+ }
+ case X86ISD::SUBV_BROADCAST: {
+ unsigned EltSize = EltVT.getSizeInBits();
+ if (EltSize != 32 && EltSize != 64)
+ return false;
+ // Only change element size, not type.
+ if (VT.isInteger() != Op.getSimpleValueType().isInteger())
+ return false;
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = MVT::getVectorVT(EltVT,
+ Op0.getSimpleValueType().getSizeInBits() / EltSize);
+ Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
+ DCI.AddToWorklist(Op0.getNode());
+ DCI.CombineTo(OrigOp.getNode(),
+ DAG.getNode(Opcode, DL, VT, Op0));
return true;
}
}
@@ -29147,6 +30368,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
@@ -29177,6 +30399,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
@@ -29211,6 +30434,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
@@ -29239,6 +30463,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
@@ -29296,7 +30521,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
- return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
@@ -29355,7 +30580,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// x s< 0 ? x^C : 0 --> subus x, C
if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
- OpRHSConst->getAPIntValue().isSignBit())
+ OpRHSConst->getAPIntValue().isSignMask())
// Note that we have to rebuild the RHS constant here to ensure we
// don't rely on particular values of undef lanes.
return DAG.getNode(
@@ -29370,8 +30595,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// If this is a *dynamic* select (non-constant condition) and we can match
// this node with one of the variable blend instructions, restructure the
- // condition so that the blends can use the high bit of each element and use
- // SimplifyDemandedBits to simplify the condition operand.
+ // condition so that blends can use the high (sign) bit of each element and
+ // use SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
@@ -29404,51 +30629,49 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Byte blends are only available in AVX2
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
+ // There are no 512-bit blend instructions that use sign bits.
+ if (VT.is512BitVector())
+ return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
- APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
-
- APInt KnownZero, KnownOne;
- TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
- DCI.isBeforeLegalizeOps());
- if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
- TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
- TLO)) {
- // If we changed the computation somewhere in the DAG, this change
- // will affect all users of Cond.
- // Make sure it is fine and update all the nodes so that we do not
- // use the generic VSELECT anymore. Otherwise, we may perform
- // wrong optimizations as we messed up with the actual expectation
+ APInt DemandedMask(APInt::getSignMask(BitWidth));
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
+ TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Make sure it is fine and update all the nodes
+ // so that we do not use the generic VSELECT anymore. Otherwise, we may
+ // perform wrong optimizations as we messed with the actual expectation
// for the vector boolean values.
if (Cond != TLO.Old) {
- // Check all uses of that condition operand to check whether it will be
- // consumed by non-BLEND instructions, which may depend on all bits are
- // set properly.
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
- I != E; ++I)
- if (I->getOpcode() != ISD::VSELECT)
- // TODO: Add other opcodes eventually lowered into BLEND.
+ // Check all uses of the condition operand to check whether it will be
+ // consumed by non-BLEND instructions. Those may require that all bits
+ // are set properly.
+ for (SDNode *U : Cond->uses()) {
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ if (U->getOpcode() != ISD::VSELECT)
return SDValue();
+ }
- // Update all the users of the condition, before committing the change,
- // so that the VSELECT optimizations that expect the correct vector
- // boolean value will not be triggered.
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
- I != E; ++I)
- DAG.ReplaceAllUsesOfValueWith(
- SDValue(*I, 0),
- DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
- Cond, I->getOperand(1), I->getOperand(2)));
+ // Update all users of the condition before committing the change, so
+ // that the VSELECT optimizations that expect the correct vector boolean
+ // value will not be triggered.
+ for (SDNode *U : Cond->uses()) {
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
+ U->getValueType(0), Cond, U->getOperand(1),
+ U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+ }
DCI.CommitTargetLoweringOpt(TLO);
return SDValue();
}
- // At this point, only Cond is changed. Change the condition
- // just for N to keep the opportunity to optimize all other
- // users their own way.
- DAG.ReplaceAllUsesOfValueWith(
- SDValue(N, 0),
- DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
- TLO.New, N->getOperand(1), N->getOperand(2)));
+ // Only Cond (rather than other nodes in the computation chain) was
+ // changed. Change the condition just for N to keep the opportunity to
+    // optimize all other users in their own way.
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
return SDValue();
}
}
@@ -29456,7 +30679,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Look for vselects with LHS/RHS being bitcasted from an operation that
// can be executed on another type. Push the bitcast to the inputs of
// the operation. This exposes opportunities for using masking instructions.
- if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+ if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
CondVT.getVectorElementType() == MVT::i1) {
if (combineBitcastForMaskedOp(LHS, DAG, DCI))
return SDValue(N, 0);
@@ -29464,6 +30687,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
+ // Custom action for SELECT MMX
+ if (VT == MVT::x86mmx) {
+ LHS = DAG.getBitcast(MVT::i64, LHS);
+ RHS = DAG.getBitcast(MVT::i64, RHS);
+    SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
+    return DAG.getBitcast(VT, NewSelect);
+ }
+
return SDValue();
}
@@ -29711,11 +30942,40 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
return true;
}
+// When legalizing carry, we create carries via add X, -1.
+// If that comes from an actual carry, via setcc, we use the
+// carry directly.
+static SDValue combineCarryThroughADD(SDValue EFLAGS) {
+ if (EFLAGS.getOpcode() == X86ISD::ADD) {
+ if (isAllOnesConstant(EFLAGS.getOperand(1))) {
+ SDValue Carry = EFLAGS.getOperand(0);
+ while (Carry.getOpcode() == ISD::TRUNCATE ||
+ Carry.getOpcode() == ISD::ZERO_EXTEND ||
+ Carry.getOpcode() == ISD::SIGN_EXTEND ||
+ Carry.getOpcode() == ISD::ANY_EXTEND ||
+ (Carry.getOpcode() == ISD::AND &&
+ isOneConstant(Carry.getOperand(1))))
+ Carry = Carry.getOperand(0);
+ if (Carry.getOpcode() == X86ISD::SETCC ||
+ Carry.getOpcode() == X86ISD::SETCC_CARRY) {
+ if (Carry.getConstantOperandVal(0) == X86::COND_B)
+ return Carry.getOperand(1);
+ }
+ }
+ }
+
+ return SDValue();
+}
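
Why "add X, -1" recreates a carry: for a boolean X, X + 0xFF...FF wraps around
exactly when X == 1, so the CPU carry flag after the add equals the original
carry bit. A scalar sketch with unsigned wraparound standing in for EFLAGS.CF:

    #include <cassert>
    #include <cstdint>

    static bool carryOfAddAllOnes(uint32_t X) {
      uint32_t Sum = X + 0xFFFFFFFFu; // add X, -1
      return Sum < X;                 // unsigned wrap <=> CF set
    }

    int main() {
      assert(!carryOfAddAllOnes(0));
      assert(carryOfAddAllOnes(1));
      return 0;
    }
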
+
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG) {
+ if (CC == X86::COND_B)
+ if (SDValue Flags = combineCarryThroughADD(EFLAGS))
+ return Flags;
+
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
return combineSetCCAtomicArith(EFLAGS, CC, DAG);
@@ -30141,6 +31401,77 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
}
}
+static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
+ EVT VT, SDLoc DL) {
+
+ auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
+ SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(Mult, DL, VT));
+ Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+ DAG.getConstant(Shift, DL, MVT::i8));
+ Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+ N->getOperand(0));
+ return Result;
+ };
+
+ auto combineMulMulAddOrSub = [&](bool isAdd) {
+ SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(9, DL, VT));
+ Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
+ Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+ N->getOperand(0));
+ return Result;
+ };
+
+ switch (MulAmt) {
+ default:
+ break;
+ case 11:
+ // mul x, 11 => add ((shl (mul x, 5), 1), x)
+ return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
+ case 21:
+ // mul x, 21 => add ((shl (mul x, 5), 2), x)
+ return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
+ case 22:
+ // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
+ return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
+ case 19:
+ // mul x, 19 => sub ((shl (mul x, 5), 2), x)
+ return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
+ case 13:
+ // mul x, 13 => add ((shl (mul x, 3), 2), x)
+ return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
+ case 23:
+    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
+ return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
+ case 14:
+ // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
+ return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
+ case 26:
+ // mul x, 26 => sub ((mul (mul x, 9), 3), x)
+ return combineMulMulAddOrSub(/*isAdd*/ false);
+ case 28:
+ // mul x, 28 => add ((mul (mul x, 9), 3), x)
+ return combineMulMulAddOrSub(/*isAdd*/ true);
+ case 29:
+ // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
+ return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ combineMulMulAddOrSub(/*isAdd*/ true));
+ case 30:
+ // mul x, 30 => sub (sub ((shl x, 5), x), x)
+ return DAG.getNode(
+ ISD::SUB, DL, VT,
+ DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(5, DL, MVT::i8)),
+ N->getOperand(0)),
+ N->getOperand(0));
+ }
+ return SDValue();
+}
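
A quick scalar check of the decompositions listed above, confirming each
multiply-by-constant equals its shift/LEA expansion (sketch only):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint64_t X = 0; X < 1000; ++X) {
        assert(11 * X == ((X * 5) << 1) + X);
        assert(21 * X == ((X * 5) << 2) + X);
        assert(19 * X == ((X * 5) << 2) - X);
        assert(23 * X == ((X * 3) << 3) - X);
        assert(26 * X == (X * 9) * 3 - X);
        assert(30 * X == (X << 5) - X - X);
      }
      return 0;
    }
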
+
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -30150,6 +31481,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
+ if (!MulConstantOptimization)
+ return SDValue();
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
@@ -30205,25 +31538,41 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
- }
+ } else if (!Subtarget.slowLEA())
+ NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
if (!NewMul) {
- assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
- && "Both cases that could cause potential overflows should have "
- "already been handled.");
- if (isPowerOf2_64(MulAmt - 1))
- // (mul x, 2^N + 1) => (add (shl x, N), x)
- NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt - 1), DL,
- MVT::i8)));
-
- else if (isPowerOf2_64(MulAmt + 1))
- // (mul x, 2^N - 1) => (sub (shl x, N), x)
- NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
- N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt + 1),
- DL, MVT::i8)), N->getOperand(0));
+ assert(MulAmt != 0 &&
+ MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
+ "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ int64_t SignMulAmt = C->getSExtValue();
+ if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
+ (SignMulAmt != -INT64_MAX)) {
+ int NumSign = SignMulAmt > 0 ? 1 : -1;
+ bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
+ bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
+ if (IsPowerOf2_64PlusOne) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
+ MVT::i8)));
+ } else if (IsPowerOf2_64MinusOne) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::SUB, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
+ MVT::i8)),
+ N->getOperand(0));
+ }
+ // To negate, subtract the number from zero
+ if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
+ NewMul =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+ }
}
if (NewMul)
@@ -30246,8 +31595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- const APInt &ShAmt = N1C->getAPIntValue();
- Mask = Mask.shl(ShAmt);
+ Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
@@ -30396,42 +31744,95 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
- "Unexpected opcode");
+static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
+ X86ISD::VSRLI == Opcode) &&
+ "Unexpected shift opcode");
+ bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
-
- // This fails for mask register (vXi1) shifts.
- if ((NumBitsPerElt % 8) != 0)
- return SDValue();
+ assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
+ "Unexpected value type");
// Out of range logical bit shifts are guaranteed to be zero.
- APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
- if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
- return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ // Out of range arithmetic bit shifts splat the sign bit.
+ APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
+ if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
+ if (LogicalShift)
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ else
+ ShiftVal = NumBitsPerElt - 1;
+ }
// Shift N0 by zero -> N0.
if (!ShiftVal)
- return N->getOperand(0);
+ return N0;
// Shift zero -> zero.
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
+ // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
+ // TODO - support other sra opcodes as needed.
+ if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
+ N0.getOpcode() == X86ISD::VSRAI)
+ return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
+
// We can decode 'whole byte' logical bit shifts as shuffles.
- if ((ShiftVal.getZExtValue() % 8) == 0) {
+ if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
+ // Constant Folding.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (N->isOnlyUserOf(N0.getNode()) &&
+ getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
+ assert(EltBits.size() == VT.getVectorNumElements() &&
+ "Unexpected shift value type");
+ unsigned ShiftImm = ShiftVal.getZExtValue();
+ for (APInt &Elt : EltBits) {
+ if (X86ISD::VSHLI == Opcode)
+ Elt <<= ShiftImm;
+ else if (X86ISD::VSRAI == Opcode)
+ Elt.ashrInPlace(ShiftImm);
+ else
+ Elt.lshrInPlace(ShiftImm);
+ }
+ return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
+ }
+
+ return SDValue();
+}
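
The clamping rules used above, one scalar line each: an out-of-range logical
shift yields zero, while an out-of-range arithmetic shift acts like a shift by
NumBitsPerElt - 1 and splats the sign bit. A sketch assuming the usual
arithmetic right shift of signed integers:

    #include <cassert>
    #include <cstdint>

    static int32_t vsrai(int32_t X, unsigned Amt) {
      return X >> (Amt >= 32 ? 31 : Amt); // arithmetic, clamped
    }
    static uint32_t vsrli(uint32_t X, unsigned Amt) {
      return Amt >= 32 ? 0 : X >> Amt;    // logical, zeroed out of range
    }

    int main() {
      assert(vsrai(-8, 40) == -1);         // sign splat
      assert(vsrli(0x80000000u, 40) == 0); // zero
      // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31): sign bit survives.
      assert(vsrli((uint32_t)vsrai(-8, 5), 31) == vsrli((uint32_t)-8, 31));
      return 0;
    }
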
+
+static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(
+ ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW &&
+ N->getValueType(0) == MVT::v8i16)) &&
+ "Unexpected vector insertion");
+
+ // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget);
return SDValue();
}
@@ -30495,13 +31896,11 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
- SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
- CMP01,
- DAG.getConstant(x86cc, DL, MVT::i8));
- if (N->getValueType(0) != MVT::i1)
- return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
- FSetCC);
- return FSetCC;
+ SDValue FSetCC =
+ DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
+ DAG.getConstant(x86cc, DL, MVT::i8));
+ return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
+ FSetCC, DAG.getIntPtrConstant(0, DL));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
@@ -30550,33 +31949,15 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
- // Canonicalize XOR to the left.
- if (N1.getOpcode() == ISD::XOR)
- std::swap(N0, N1);
-
- if (N0.getOpcode() != ISD::XOR)
- return SDValue();
+ if (N0.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
- SDValue N00 = N0->getOperand(0);
- SDValue N01 = N0->getOperand(1);
-
- N01 = peekThroughBitcasts(N01);
-
- // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
- // insert_subvector building a 256-bit AllOnes vector.
- if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
- if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
- return SDValue();
+ if (N1.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
- SDValue V1 = N01->getOperand(0);
- SDValue V2 = N01->getOperand(1);
- if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
- !V1.getOperand(0).isUndef() ||
- !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
- !ISD::isBuildVectorAllOnes(V2.getNode()))
- return SDValue();
- }
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
+ return SDValue();
}
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -30696,38 +32077,35 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
-/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
-/// eliminate loading the vector constant mask value. This relies on the fact
-/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
-static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
+/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
+/// with a shift-right to eliminate loading the vector constant mask value.
+static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+ EVT VT0 = Op0.getValueType();
+ EVT VT1 = Op1.getValueType();
- // TODO: Use AssertSext to mark any nodes that have the property of producing
- // all-ones or all-zeros. Then check for that node rather than particular
- // opcodes.
- if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+ if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
- // The existence of the PCMP node guarantees that we have the required SSE2 or
- // AVX2 for a shift of this vector type, but there is no vector shift by
- // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
- // masked compare nodes, so they should not make it here.
- EVT VT0 = Op0.getValueType();
- EVT VT1 = Op1.getValueType();
- unsigned EltBitWidth = VT0.getScalarSizeInBits();
- if (VT0 != VT1 || EltBitWidth == 8)
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal,
+ /*AllowShrink*/false) ||
+ !SplatVal.isMask())
return SDValue();
- assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+ if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
+ return SDValue();
- APInt SplatVal;
- if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+ unsigned EltBitWidth = VT0.getScalarSizeInBits();
+ if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
- SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+ unsigned ShiftVal = SplatVal.countTrailingOnes();
+ SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
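
The identity behind the shift rewrite, scalar form: if every bit of a value is
a copy of its sign bit (all-zeros or all-ones), then AND with a K-bit low mask
equals a logical shift right by Bits - K. A minimal check under that
assumption:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t V : {0u, ~0u}) {  // all-sign-bits values
        for (unsigned K = 1; K < 32; ++K) {
          uint32_t Mask = (1u << K) - 1;
          assert((V & Mask) == (V >> (32 - K)));
        }
      }
      return 0;
    }
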
@@ -30747,7 +32125,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
- if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+ if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
EVT VT = N->getValueType(0);
@@ -30760,7 +32138,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
@@ -30802,20 +32180,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- assert(N->getOpcode() == ISD::OR);
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+ if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
- assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
- // Canonicalize pandn to RHS
- if (N0.getOpcode() == X86ISD::ANDNP)
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
+  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well; waiting for the
+  // ANDNP combine allows other combines to happen that prevent matching.
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
@@ -30837,21 +32217,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
-
- // Validate that the Mask operand is a vector sra node.
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
- // there is no psrai.b
unsigned EltBits = MaskVT.getScalarSizeInBits();
- unsigned SraAmt = ~0;
- if (Mask.getOpcode() == ISD::SRA) {
- if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
- if (auto *AmtConst = AmtBV->getConstantSplatNode())
- SraAmt = AmtConst->getZExtValue();
- } else if (Mask.getOpcode() == X86ISD::VSRAI) {
- SDValue SraC = Mask.getOperand(1);
- SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
- }
- if ((SraAmt + 1) != EltBits)
+
+ // TODO: Attempt to handle floating point cases as well?
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
@@ -30872,7 +32241,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
- if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
+ DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
@@ -30884,7 +32254,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
V = Y;
if (V) {
- assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
@@ -30901,8 +32270,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
if (V == Y)
std::swap(SubOp1, SubOp2);
- return DAG.getBitcast(VT,
- DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
}
}
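
The (sub (xor X, M), M) step relies on M being 0 or all-ones per element:
X^0 - 0 == X, while X^~0 - ~0 == ~X + 1 == -X, i.e. a conditional negate.
A scalar sketch of that identity (two's complement assumed):

    #include <cassert>
    #include <cstdint>

    static int32_t condNegate(int32_t X, int32_t M) {
      return (X ^ M) - M; // M must be 0 or -1
    }

    int main() {
      assert(condNegate(42, 0) == 42);
      assert(condNegate(42, -1) == -42);
      return 0;
    }
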
@@ -30915,7 +32284,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
- Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
@@ -30969,7 +32338,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
- N->getOperand(1).getConstantOperandVal(1) == 0 &&
+ isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
@@ -31272,6 +32641,75 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
+/// Check if truncation with saturation from type \p SrcVT to \p DstVT
+/// is valid for the given \p Subtarget.
+static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasAVX512())
+ return false;
+
+  // FIXME: Scalar type may be supported if we move it to a vector register.
+ if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ return false;
+
+ EVT SrcElVT = SrcVT.getScalarType();
+ EVT DstElVT = DstVT.getScalarType();
+ if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
+ return false;
+ if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
+ return false;
+ if (SrcVT.is512BitVector() || Subtarget.hasVLX())
+ return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
+ return false;
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT) {
+ if (In.getOpcode() != ISD::UMIN)
+ return SDValue();
+
+ // Saturation with truncation. We truncate from InVT to VT.
+ assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
+ "Unexpected types for truncate operation");
+
+ APInt C;
+ if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C,
+ /*AllowShrink*/false)) {
+ // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+ // the element size of the destination type.
+ return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
+ SDValue();
+ }
+ return SDValue();
+}
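+
+// Illustrative match (not part of this patch), truncating v8i32 -> v8i16:
+//   %m = umin %x, <8 x i32> <i32 65535, ...>
+//   %t = trunc %m to <8 x i16>
+// Here C == 0xFFFF, C.isMask(16) holds, and %x is returned.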
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// The types should allow using the VPMOVUS* instructions on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return SDValue();
+ return detectUSatPattern(In, VT);
+}
+
+static SDValue
+combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
+ return SDValue();
+ if (auto USatVal = detectUSatPattern(In, VT))
+ if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ return SDValue();
+}
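+
+// On a suitable AVX512 target, the VTRUNCUS node built above selects a single
+// saturating-truncate instruction, e.g. (illustrative) vpmovusdb for
+// v16i32 -> v16i8 or vpmovusqw for v8i64 -> v8i16.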
+
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the
/// efficient X86ISD::AVG instruction.
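/// For example (illustrative): a = 10, b = 5 gives c = (10 + 5 + 1) / 2 = 8,
/// the rounded average that PAVGB/PAVGW compute without overflowing the
/// element type.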
@@ -31327,8 +32765,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || !BV->isConstant())
return false;
- for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+ for (SDValue Op : V->ops()) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
uint64_t Val = C->getZExtValue();
@@ -31403,15 +32841,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
- // into two 16-byte operations.
+ // into two 16-byte operations. Also split non-temporal aligned loads on
+ // pre-AVX2 targets, since 32-byte non-temporal loads would otherwise lower
+ // to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
unsigned AddressSpace = Ld->getAddressSpace();
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
- TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
- AddressSpace, Alignment, &Fast) && !Fast) {
+ ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
+ (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+ AddressSpace, Alignment, &Fast) && !Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
@@ -31664,7 +33104,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
Mld->getBasePtr(), NewMask, WideSrc0,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
- SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
@@ -31838,6 +33278,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
+ if (SDValue Val =
+ detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
@@ -31975,7 +33421,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
- SDValue NewChain = NewLd.getValue(1);
+ // Make sure the new load is placed in the same chain order.
+ SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
if (TokenFactorIndex >= 0) {
Ops.push_back(NewChain);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
@@ -31996,11 +33443,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
Ld->getPointerInfo().getWithOffset(4),
MinAlign(Ld->getAlignment(), 4),
Ld->getMemOperand()->getFlags());
+ // Make sure the new loads are placed in the same chain order.
+ SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
+ NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
- SDValue NewChain = LoLd.getValue(1);
if (TokenFactorIndex >= 0) {
- Ops.push_back(LoLd);
- Ops.push_back(HiLd);
+ Ops.push_back(NewChain);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
@@ -32198,13 +33646,30 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
- auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
- // TODO: Add extra cases where we can truncate both inputs for the
- // cost of one (or none).
- // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+ auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
+ unsigned TruncSizeInBits = VT.getScalarSizeInBits();
+
+ // Repeated operand, so we are only trading one output truncation for
+ // one input truncation.
if (Op0 == Op1)
return true;
+ // See if either operand has been extended from a smaller/equal size to
+ // the truncation size, allowing a truncation to combine with the extend.
+ unsigned Opcode0 = Op0.getOpcode();
+ if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
+ Opcode0 == ISD::ZERO_EXTEND) &&
+ Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ unsigned Opcode1 = Op1.getOpcode();
+ if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
+ Opcode1 == ISD::ZERO_EXTEND) &&
+ Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ // See if either operand is a single-use constant which can be
+ // constant-folded.
SDValue BC0 = peekThroughOneUseBitcasts(Op0);
SDValue BC1 = peekThroughOneUseBitcasts(Op1);
return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
@@ -32236,7 +33701,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
- IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -32252,7 +33717,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(Opcode, VT) &&
- IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -32458,6 +33923,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
+ // Try to combine truncation with unsigned saturation.
+ if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ return Val;
+
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
@@ -32494,8 +33963,8 @@ static SDValue isFNEG(SDNode *N) {
SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
unsigned EltBits = Op1.getScalarValueSizeInBits();
- auto isSignBitValue = [&](const ConstantFP *C) {
- return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
+ auto isSignMask = [&](const ConstantFP *C) {
+ return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
};
// There is more than one way to represent the same constant on
@@ -32506,21 +33975,21 @@ static SDValue isFNEG(SDNode *N) {
// We check all variants here.
if (Op1.getOpcode() == X86ISD::VBROADCAST) {
if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
- if (isSignBitValue(cast<ConstantFP>(C)))
+ if (isSignMask(cast<ConstantFP>(C)))
return Op0;
} else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
- if (isSignBitValue(CN->getConstantFPValue()))
+ if (isSignMask(CN->getConstantFPValue()))
return Op0;
} else if (auto *C = getTargetConstantFromNode(Op1)) {
if (C->getType()->isVectorTy()) {
if (auto *SplatV = C->getSplatValue())
- if (isSignBitValue(cast<ConstantFP>(SplatV)))
+ if (isSignMask(cast<ConstantFP>(SplatV)))
return Op0;
} else if (auto *FPConst = dyn_cast<ConstantFP>(C))
- if (isSignBitValue(FPConst))
+ if (isSignMask(FPConst))
return Op0;
}
return SDValue();
@@ -32545,7 +34014,7 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
- Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
+ Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
@@ -32800,8 +34269,35 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
- auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
- return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+ return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
+}
+
+/// Do target-specific dag combines on X86ISD::ANDNP nodes.
+static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ // ANDNP(0, x) -> x
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return N->getOperand(1);
+
+ // ANDNP(x, 0) -> 0
+ if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
+ return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
+
+ EVT VT = N->getValueType(0);
+
+ // Attempt to recursively combine a bitmask ANDNP with shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget))
+ return SDValue(); // This routine will use CombineTo to replace N.
+ }
+
+ return SDValue();
}
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
@@ -32811,12 +34307,12 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
if (Op1.hasOneUse()) {
unsigned BitWidth = Op1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
- APInt KnownZero, KnownOne;
+ KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
- TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+ if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
+ TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
DCI.CommitTargetLoweringOpt(TLO);
}
return SDValue();
@@ -32878,8 +34374,8 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
return SDValue();
bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
- bool NSW = Add->getFlags()->hasNoSignedWrap();
- bool NUW = Add->getFlags()->hasNoUnsignedWrap();
+ bool NSW = Add->getFlags().hasNoSignedWrap();
+ bool NUW = Add->getFlags().hasNoUnsignedWrap();
// We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
// into the 'zext'
@@ -32919,7 +34415,7 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
SDNodeFlags Flags;
Flags.setNoSignedWrap(NSW);
Flags.setNoUnsignedWrap(NUW);
- return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
+ return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
@@ -33065,13 +34561,22 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps()) {
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
- return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
+ return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
+ if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
+ isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
+ // Inverting and sign-extending a boolean is the same as zero-extending it
+ // and subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
+ // efficiently lowered with an LEA or a DEC. This is the same as:
+ // select Bool, 0, -1.
+ // sext (xor Bool, -1) --> sub (zext Bool), 1
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
+ }
+
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -33212,8 +34717,47 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Optimize x == -y --> x+y == 0
-/// x != -y --> x+y != 0
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+ // We're looking for an oversized integer equality comparison, but ignore a
+ // comparison with zero because that gets special treatment in EmitTest().
+ SDValue X = SetCC->getOperand(0);
+ SDValue Y = SetCC->getOperand(1);
+ EVT OpVT = X.getValueType();
+ unsigned OpSize = OpVT.getSizeInBits();
+ if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
+ return SDValue();
+
+ // TODO: Use PXOR + PTEST for SSE4.1 or later?
+ // TODO: Add support for AVX-512.
+ EVT VT = SetCC->getValueType(0);
+ SDLoc DL(SetCC);
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX2())) {
+ EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
+ SDValue VecX = DAG.getBitcast(VecVT, X);
+ SDValue VecY = DAG.getBitcast(VecVT, Y);
+
+ // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
+ // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
+ // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
+ // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
+ // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
+ SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
+ SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
+ SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
+ MVT::i32);
+ return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
+ }
+
+ return SDValue();
+}
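+
+// Illustrative codegen sketch (not part of this patch): for an i128 equality
+// compare of two in-memory values, the combine above typically yields:
+//   movdqu (%rdi), %xmm0
+//   movdqu (%rsi), %xmm1
+//   pcmpeqb %xmm1, %xmm0
+//   pmovmskb %xmm0, %eax
+//   cmpl $0xFFFF, %eax
+//   sete %al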
+
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
@@ -33222,21 +34766,27 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDLoc DL(N);
- if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
- if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
- LHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
+ if (CC == ISD::SETNE || CC == ISD::SETEQ) {
+ EVT OpVT = LHS.getValueType();
+ // 0-x == y --> x+y == 0
+ // 0-x != y --> x+y != 0
+ if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+ LHS.hasOneUse()) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
+ return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
- if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
- if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
- RHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
+ // x == 0-y --> x+y == 0
+ // x != 0-y --> x+y != 0
+ if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+ RHS.hasOneUse()) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
+ return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
+ if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
+ return V;
+ }
+
if (VT.getScalarType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
bool IsSEXT0 =
@@ -33293,56 +34843,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Helper function of performSETCCCombine. It is to materialize "setb reg"
-// as "sbb reg,reg", since it can be extended without zext and produces
-// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
- SelectionDAG &DAG, MVT VT) {
- if (VT == MVT::i8)
- return DAG.getNode(ISD::AND, DL, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- EFLAGS),
- DAG.getConstant(1, DL, VT));
- assert (VT == MVT::i1 && "Unexpected type for SECCC node");
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- EFLAGS));
-}
-
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
- if (CC == X86::COND_A) {
- // Try to convert COND_A into COND_B in an attempt to facilitate
- // materializing "setb reg".
- //
- // Do not flip "e > c", where "c" is a constant, because Cmp instruction
- // cannot take an immediate as its first operand.
- //
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
- EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
- return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
- }
- }
-
- // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
- // a zext and produces an all-ones bit which is more useful than 0/1 in some
- // cases.
- if (CC == X86::COND_B)
- return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
-
// Try to simplify the EFLAGS and condition code operands.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
return getSETCC(CC, Flags, DL, DAG);
@@ -33352,7 +34859,6 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
@@ -33512,6 +35018,18 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
+ N->getOperand(0), N->getOperand(1),
+ Flags);
+ }
+
+ return SDValue();
+}
+
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -33535,48 +35053,237 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, Res1, CarryOut);
}
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
+ N->getOperand(0), N->getOperand(1),
+ Flags);
+ }
+
return SDValue();
}
-/// fold (add Y, (sete X, 0)) -> adc 0, Y
-/// (add Y, (setne X, 0)) -> sbb -1, Y
-/// (sub (sete X, 0), Y) -> sbb 0, Y
-/// (sub (setne X, 0), Y) -> adc -1, Y
-static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
+/// which is more useful than 0/1 in some cases.
+static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
SDLoc DL(N);
+ // "Condition code B" is also known as "the carry flag" (CF).
+ SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
+ SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
+ MVT VT = N->getSimpleValueType(0);
+ if (VT == MVT::i8)
+ return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
- // Look through ZExts.
- SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
- if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
- return SDValue();
+ assert(VT == MVT::i1 && "Unexpected type for SETCC node");
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
+}
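+
+// Illustrative result (not part of this patch) for VT == MVT::i8:
+//   sbb %al, %al   ; %al = CF ? 0xFF : 0x00
+//   and $1, %al    ; %al = CF ? 1 : 0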
+
+/// If this is an add or subtract where one operand is produced by a cmp+setcc,
+/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
+/// with CMP+{ADC, SBB}.
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+ bool IsSub = N->getOpcode() == ISD::SUB;
+ SDValue X = N->getOperand(0);
+ SDValue Y = N->getOperand(1);
+
+ // If this is an add, canonicalize a zext operand to the RHS.
+ // TODO: Incomplete? What if both sides are zexts?
+ if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
+ Y.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(X, Y);
- SDValue SetCC = Ext.getOperand(0);
- if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+ // Look through a one-use zext.
+ bool PeekedThroughZext = false;
+ if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
+ Y = Y.getOperand(0);
+ PeekedThroughZext = true;
+ }
+
+ // If this is an add, canonicalize a setcc operand to the RHS.
+ // TODO: Incomplete? What if both sides are setcc?
+ // TODO: Should we allow peeking through a zext of the other operand?
+ if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
+ Y.getOpcode() != X86ISD::SETCC)
+ std::swap(X, Y);
+
+ if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
- X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ auto *ConstantX = dyn_cast<ConstantSDNode>(X);
+ if (ConstantX) {
+ if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
+ (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
+ // This is a complicated way to get -1 or 0 from the carry flag:
+ // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ Y.getOperand(1));
+ }
+
+ if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
+ (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
+ SDValue EFLAGS = Y->getOperand(1);
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ // Swap the operands of a SUB, and we have the same pattern as above.
+ // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
+ // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ NewEFLAGS);
+ }
+ }
+ }
+
+ if (CC == X86::COND_B) {
+ // X + SETB Z --> X + (mask SBB Z, Z)
+ // X - SETB Z --> X - (mask SBB Z, Z)
+ // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
+ SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
+ if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+ SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+ return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+ }
+
+ if (CC == X86::COND_A) {
+ SDValue EFLAGS = Y->getOperand(1);
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
+ if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+ SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+ return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+ }
+ }
+
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
- SDValue Cmp = SetCC.getOperand(1);
+ SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
- SDValue CmpOp0 = Cmp.getOperand(0);
- SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
- DAG.getConstant(1, DL, CmpOp0.getValueType()));
+ SDValue Z = Cmp.getOperand(0);
+ EVT ZVT = Z.getValueType();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ if (ConstantX) {
+ // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
+ // fake operands:
+ // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
+ // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
+ SDValue Zero = DAG.getConstant(0, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ }
+
+ // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
+ // with fake operands:
+ // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
+ // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
+ if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ }
+ }
+
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+
+ // Add the flags type for ADC/SBB nodes.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
- SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+ // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
+ // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
- return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
- DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
- NewCmp);
- return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
- DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
+ DAG.getConstant(-1ULL, DL, VT), Cmp1);
+
+ // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
+ // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
+ DAG.getConstant(0, DL, VT), Cmp1);
+}
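+
+// Illustrative codegen sketch (not part of this patch): for 'X + (Z != 0)'
+// the SBB form above lowers to roughly:
+//   cmpq $1, %rdx        ; CF = (Z == 0)
+//   sbbq $-1, %rdi       ; X - (-1) - CF = X + (Z != 0)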
+
+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue MulOp = N->getOperand(0);
+ SDValue Phi = N->getOperand(1);
+
+ if (MulOp.getOpcode() != ISD::MUL)
+ std::swap(MulOp, Phi);
+ if (MulOp.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+ unsigned VectorSize = VT.getVectorNumElements() * 16;
+ // If the vector size is less than 128, or greater than the supported RegSize,
+ // do not use PMADD.
+ if (VectorSize < 128 || VectorSize > RegSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements() / 2);
+
+ // Shrink the operands of mul.
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
+ // Madd vector size is half of the original vector size
+ SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+ // Fill the rest of the output with 0
+ SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+ return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
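
// Illustrative effect (not part of this patch): for a v8i32 reduction where
// both mul operands fit in 16 bits, the add above becomes
//   (add (concat (vpmaddwd v8i16:a, v8i16:b), zero), Phi)
// i.e. a single VPMADDWD computing pairwise i16*i16 sums into v4i32.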
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
@@ -33650,12 +35357,41 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Unexpected opcode for increment/decrement transform");
+
+ // Pseudo-legality check: getOnesVector() expects one of these types, so bail
+ // out and wait for legalization if we have an unsupported vector length.
+ EVT VT = N->getValueType(0);
+ if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ SDNode *N1 = N->getOperand(1).getNode();
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(N1, SplatVal, /*AllowShrink*/false) ||
+ !SplatVal.isOneValue())
+ return SDValue();
+
+ SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
+}
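+
+// Illustrative codegen sketch (not part of this patch):
+//   paddd LCPI0_0(%rip), %xmm0   ; add <1,1,1,1>, needs a constant-pool load
+// becomes
+//   pcmpeqd %xmm1, %xmm1         ; all-ones, no memory access
+//   psubd %xmm1, %xmm0           ; x - (-1) == x + 1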
+
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
- if (Flags->hasVectorReduction()) {
+ const SDNodeFlags Flags = N->getFlags();
+ if (Flags.hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
+ if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
+ return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
@@ -33667,7 +35403,10 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
- return OptimizeConditionalInDecrement(N, DAG);
+ if (SDValue V = combineIncDecVector(N, DAG))
+ return V;
+
+ return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
@@ -33700,36 +35439,47 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, false))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
- return OptimizeConditionalInDecrement(N, DAG);
+ if (SDValue V = combineIncDecVector(N, DAG))
+ return V;
+
+ return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
SDLoc DL(N);
unsigned Opcode = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
MVT SVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = SVT.getSizeInBits();
+
SDValue Op = N->getOperand(0);
MVT OpVT = Op.getSimpleValueType();
MVT OpEltVT = OpVT.getVectorElementType();
- unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+ unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
+ unsigned InputBits = OpEltSizeInBits * NumElts;
// Perform any constant folding.
// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
- if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
- unsigned NumDstElts = VT.getVectorNumElements();
- SmallBitVector Undefs(NumDstElts, false);
- SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
- for (unsigned i = 0; i != NumDstElts; ++i) {
- SDValue OpElt = Op.getOperand(i);
- if (OpElt.getOpcode() == ISD::UNDEF) {
- Undefs[i] = true;
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+ if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
+ APInt Undefs(NumElts, 0);
+ SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
+ bool IsZEXT =
+ (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ Undefs.setBit(i);
continue;
}
- APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
- Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
- : Cst.sextOrTrunc(SVT.getSizeInBits());
+ Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
+ : EltBits[i].sextOrTrunc(EltSizeInBits);
}
return getConstVector(Vals, Undefs, VT, DAG, DL);
}
@@ -33829,7 +35579,7 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
- return getOnesVector(VT, Subtarget, DAG, DL);
+ return getOnesVector(VT, DAG, DL);
if (N->getOpcode() == X86ISD::PCMPGT)
return getZeroVector(VT, Subtarget, DAG, DL);
}
@@ -33837,6 +35587,99 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = N->getSimpleValueType(0);
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // If this is an insert of an extract, combine to a shuffle. Don't do this
+ // if the insert or extract can be represented with a subvector operation.
+ if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ SubVec.getOperand(0).getSimpleValueType() == OpVT &&
+ (IdxVal != 0 || !Vec.isUndef())) {
+ int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
+ if (ExtIdxVal != 0) {
+ int VecNumElts = OpVT.getVectorNumElements();
+ int SubVecNumElts = SubVecVT.getVectorNumElements();
+ SmallVector<int, 64> Mask(VecNumElts);
+ // First create an identity shuffle mask.
+ for (int i = 0; i != VecNumElts; ++i)
+ Mask[i] = i;
+ // Now insert the extracted portion.
+ for (int i = 0; i != SubVecNumElts; ++i)
+ Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
+
+ return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+ }
+ }
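+ // Illustrative example (not part of this patch): OpVT == v8i32, IdxVal == 0,
+ // ExtIdxVal == 4 produces Mask = <12,13,14,15,4,5,6,7>, selecting the high
+ // half of the extract source followed by the untouched high half of Vec.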
+
+ // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+ // load:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr + 16), Elts/2)
+ // --> load32 addr
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr + 32), Elts/2)
+ // --> load64 addr
+ // or a 16-byte or 32-byte broadcast:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr), Elts/2)
+ // --> X86SubVBroadcast(load16 addr)
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr), Elts/2)
+ // --> X86SubVBroadcast(load32 addr)
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
+ auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+ if (Idx2 && Idx2->getZExtValue() == 0) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
+ bool Fast;
+ unsigned Alignment = FirstLd->getAlignment();
+ unsigned AS = FirstLd->getAddressSpace();
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ OpVT, AS, Alignment, &Fast) && Fast) {
+ SDValue Ops[] = {SubVec2, SubVec};
+ if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
+ Subtarget, false))
+ return Ld;
+ }
+ }
+ // If the lower and upper loads are the same and the load's only users are
+ // these inserts, then lower to a VBROADCASTF128/VBROADCASTI128/etc.
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+ if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
+ SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+ }
+ }
+ // If this is subv_broadcast insert into both halves, use a larger
+ // subv_broadcast.
+ if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
+ SubVec.getOperand(0));
+ }
+ }
+ }
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -33845,13 +35688,19 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::EXTRACT_VECTOR_ELT:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case X86ISD::PEXTRW:
+ case X86ISD::PEXTRB:
+ return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_SUBVECTOR:
+ return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
- case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
+ case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case ISD::ADD: return combineAdd(N, DAG, Subtarget);
case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL:
@@ -33870,6 +35719,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -33884,14 +35734,22 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
- case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
- case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
+ case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
+ case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::VSHLI:
- case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VSEXT:
case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
+ case X86ISD::EXTRQI:
+ case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -33969,14 +35827,21 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
-bool X86TargetLowering::hasCopyImplyingStackAdjustment(
- MachineFunction *MF) const {
- const MachineRegisterInfo &MRI = MF->getRegInfo();
-
+static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
return any_of(MRI.reg_instructions(X86::EFLAGS),
[](const MachineInstr &RI) { return RI.isCopy(); });
}
+void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
+ if (hasCopyImplyingStackAdjustment(MF)) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setHasCopyImplyingStackAdjustment(true);
+ }
+
+ TargetLoweringBase::finalizeLowering(MF);
+}
+
/// This method queries the target whether it is beneficial for the DAG
/// combiner to promote the specified node. If true, it should return the
/// desired promotion type by reference.
@@ -34217,6 +36082,7 @@ TargetLowering::ConstraintWeight
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+ LLVM_FALLTHROUGH;
case 'R':
case 'q':
case 'Q':
@@ -34568,6 +36434,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
+ LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
@@ -34737,7 +36604,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
- if (Res.second->hasType(VT) || VT == MVT::Other)
+ if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
return Res; // Correct type already, nothing to do.
// Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
@@ -34775,11 +36642,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Res.second = &X86::FR32RegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64RegClass;
- else if (X86::VR128RegClass.hasType(VT))
+ else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
Res.second = &X86::VR128RegClass;
- else if (X86::VR256RegClass.hasType(VT))
+ else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
Res.second = &X86::VR256RegClass;
- else if (X86::VR512RegClass.hasType(VT))
+ else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
// Type mismatch and not a clobber: Return an error;
@@ -34819,7 +36686,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
-bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
@@ -34827,8 +36694,8 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::MinSize);
+ bool OptSize =
+ Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -34884,3 +36751,22 @@ void X86TargetLowering::insertCopiesSplitCSR(
bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
+
+/// Returns the name of the symbol used to emit stack probes or the empty
+/// string if not applicable.
+StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+ // If the function specifically requests stack probes, emit them.
+ if (MF.getFunction()->hasFnAttribute("probe-stack"))
+ return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
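+ // e.g. (illustrative) "probe-stack"="my_probe" (a hypothetical symbol)
+ // makes every probe a call to my_probe, regardless of the target OS.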
+
+ // Generally, if we aren't on Windows, the platform ABI does not include
+ // support for stack probes, so don't emit them.
+ if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
+ return "";
+
+ // We need a stack probe to conform to the Windows ABI. Choose the right
+ // symbol.
+ if (Subtarget.is64Bit())
+ return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
+ return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
+}