Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.cpp  10606
1 file changed, 4680 insertions(+), 5926 deletions(-)
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index 85978d8..e3ec288 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25,7 +25,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/VariadicFunction.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -33,6 +32,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
@@ -67,169 +67,17 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);
-static cl::opt<bool> ExperimentalVectorShuffleLowering(
- "x86-experimental-vector-shuffle-lowering", cl::init(true),
- cl::desc("Enable an experimental vector shuffle lowering code path."),
- cl::Hidden);
-
-static cl::opt<bool> ExperimentalVectorShuffleLegality(
- "x86-experimental-vector-shuffle-legality", cl::init(false),
- cl::desc("Enable experimental shuffle legality based on the experimental "
- "shuffle lowering. Should only be used with the experimental "
- "shuffle lowering."),
- cl::Hidden);
-
-static cl::opt<int> ReciprocalEstimateRefinementSteps(
- "x86-recip-refinement-steps", cl::init(1),
- cl::desc("Specify the number of Newton-Raphson iterations applied to the "
- "result of the hardware reciprocal estimate instruction."),
- cl::NotHidden);
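The option deleted above described Newton-Raphson refinement of the hardware reciprocal estimate. As a minimal standalone sketch (plain C++ with made-up example values, not LLVM code), one refinement step of an estimate x0 of 1/a is x1 = x0 * (2 - a*x0):

#include <cstdio>

int main() {
  float a = 3.0f;
  float x0 = 0.33f;                 // stand-in for an RCPSS-style estimate of 1/a
  float x1 = x0 * (2.0f - a * x0);  // one Newton-Raphson refinement step
  printf("%f\n", x1);               // ~0.3333, closer to the true 1/3
  return 0;
}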
-
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
-static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl,
- unsigned vectorWidth) {
- assert((vectorWidth == 128 || vectorWidth == 256) &&
- "Unsupported vector width");
- EVT VT = Vec.getValueType();
- EVT ElVT = VT.getVectorElementType();
- unsigned Factor = VT.getSizeInBits()/vectorWidth;
- EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
- VT.getVectorNumElements()/Factor);
-
- // Extract from UNDEF is UNDEF.
- if (Vec.getOpcode() == ISD::UNDEF)
- return DAG.getUNDEF(ResultVT);
-
- // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
- unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
-
- // This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
- * ElemsPerChunk);
-
- // If the input is a buildvector just emit a smaller one.
- if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
- ElemsPerChunk));
-
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
-}
-
-/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
-/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
-/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
-/// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
-/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert((Vec.getValueType().is256BitVector() ||
- Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
-}
-
-/// Generate a DAG to grab 256-bits from a 512-bit vector.
-static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
- return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
-}
-
-static SDValue InsertSubVector(SDValue Result, SDValue Vec,
- unsigned IdxVal, SelectionDAG &DAG,
- SDLoc dl, unsigned vectorWidth) {
- assert((vectorWidth == 128 || vectorWidth == 256) &&
- "Unsupported vector width");
- // Inserting UNDEF is Result
- if (Vec.getOpcode() == ISD::UNDEF)
- return Result;
- EVT VT = Vec.getValueType();
- EVT ElVT = VT.getVectorElementType();
- EVT ResultVT = Result.getValueType();
-
- // Insert the relevant vectorWidth bits.
- unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
-
- // This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
- * ElemsPerChunk);
-
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
-}
-
-/// Generate a DAG to put 128-bits into a vector > 128 bits. This
-/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
-/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
-/// simple superregister reference. Idx is an index in the 128 bits
-/// we want. It need not be aligned to a 128-bit boundary. That makes
-/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG,SDLoc dl) {
- assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
-}
-
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
- SelectionDAG &DAG, SDLoc dl) {
- assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
- return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
-}
-
-/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
-/// instructions. This is used because creating CONCAT_VECTOR nodes of
-/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
-/// large BUILD_VECTORS.
-static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
-}
-
-static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- SDLoc dl) {
- SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
- return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
-}
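As a standalone sketch of the index arithmetic used by the ExtractSubVector/InsertSubVector helpers removed above (plain C++, not LLVM code): an arbitrary element index is rounded down to the first element of the 128- or 256-bit chunk that contains it.

#include <cassert>
#include <cstdio>

static unsigned normalizedChunkIndex(unsigned IdxVal, unsigned EltBits,
                                     unsigned VectorWidth) {
  assert((VectorWidth == 128 || VectorWidth == 256) && "Unsupported vector width");
  unsigned ElemsPerChunk = VectorWidth / EltBits;
  // Round IdxVal down to the first element of the chunk that contains it.
  return ((IdxVal * EltBits) / VectorWidth) * ElemsPerChunk;
}

int main() {
  // Element 5 of a vector of 32-bit elements lies in the second 128-bit chunk,
  // which starts at element 4.
  printf("%u\n", normalizedChunkIndex(5, 32, 128)); // prints 4
  return 0;
}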
-
-// FIXME: This should stop caching the target machine as soon as
-// we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
- : TargetLowering(TM) {
- Subtarget = &TM.getSubtarget<X86Subtarget>();
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
TD = getDataLayout();
- resetOperationActions();
-}
-
-void X86TargetLowering::resetOperationActions() {
- const TargetMachine &TM = getTargetMachine();
- static bool FirstTimeThrough = true;
-
- // If none of the target options have changed, then we don't need to reset the
- // operation actions.
- if (!FirstTimeThrough && TO == TM.Options) return;
-
- if (!FirstTimeThrough) {
- // Reinitialize the actions.
- initActions();
- FirstTimeThrough = false;
- }
-
- TO = TM.Options;
-
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
@@ -247,8 +95,7 @@ void X86TargetLowering::resetOperationActions() {
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- const X86RegisterInfo *RegInfo =
- TM.getSubtarget<X86Subtarget>().getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2.
@@ -330,7 +177,7 @@ void X86TargetLowering::resetOperationActions() {
if (Subtarget->is64Bit()) {
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
- } else if (!TM.Options.UseSoftFloat) {
+ } else if (!Subtarget->useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
@@ -344,7 +191,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
- if (!TM.Options.UseSoftFloat) {
+ if (!Subtarget->useSoftFloat()) {
// SSE has no i16 to fp conversion, only i32
if (X86ScalarSSEf32) {
setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
@@ -387,7 +234,7 @@ void X86TargetLowering::resetOperationActions() {
if (Subtarget->is64Bit()) {
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
- } else if (!TM.Options.UseSoftFloat) {
+ } else if (!Subtarget->useSoftFloat()) {
// Since AVX is a superset of SSE3, only check for SSE here.
if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
// Expand FP_TO_UINT into a select.
@@ -515,7 +362,7 @@ void X86TargetLowering::resetOperationActions() {
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
- if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
+ if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
@@ -660,7 +507,11 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
- if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
+ // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
+ setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
+ setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
+
+ if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
@@ -694,7 +545,7 @@ void X86TargetLowering::resetOperationActions() {
// cases we handle.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
addLegalFPImmediate(APFloat(+0.0f)); // xorps
- } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
+ } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
@@ -729,7 +580,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
- } else if (!TM.Options.UseSoftFloat) {
+ } else if (!Subtarget->useSoftFloat()) {
// f32 and f64 in x87.
// Set up the FP register classes.
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
@@ -763,7 +614,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FMA, MVT::f32, Expand);
// Long double always uses X87.
- if (!TM.Options.UseSoftFloat) {
+ if (!Subtarget->useSoftFloat()) {
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -893,49 +744,35 @@ void X86TargetLowering::resetOperationActions() {
// them legal.
if (VT.getVectorElementType() == MVT::i1)
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+
+ // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
+ // split/scalarized right now.
+ if (VT.getVectorElementType() == MVT::f16)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
}
}
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
- if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
+ if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) {
addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
// MMX-sized vectors (other than x86mmx) are expected to be expanded
// into smaller operations.
- setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
- setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
- setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
- setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
- setOperationAction(ISD::AND, MVT::v8i8, Expand);
- setOperationAction(ISD::AND, MVT::v4i16, Expand);
- setOperationAction(ISD::AND, MVT::v2i32, Expand);
- setOperationAction(ISD::AND, MVT::v1i64, Expand);
- setOperationAction(ISD::OR, MVT::v8i8, Expand);
- setOperationAction(ISD::OR, MVT::v4i16, Expand);
- setOperationAction(ISD::OR, MVT::v2i32, Expand);
- setOperationAction(ISD::OR, MVT::v1i64, Expand);
- setOperationAction(ISD::XOR, MVT::v8i8, Expand);
- setOperationAction(ISD::XOR, MVT::v4i16, Expand);
- setOperationAction(ISD::XOR, MVT::v2i32, Expand);
- setOperationAction(ISD::XOR, MVT::v1i64, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
+ for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
+ setOperationAction(ISD::MULHS, MMXTy, Expand);
+ setOperationAction(ISD::AND, MMXTy, Expand);
+ setOperationAction(ISD::OR, MMXTy, Expand);
+ setOperationAction(ISD::XOR, MMXTy, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand);
+ setOperationAction(ISD::SELECT, MMXTy, Expand);
+ setOperationAction(ISD::BITCAST, MMXTy, Expand);
+ }
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
- setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
- setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
- setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
- setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
- setOperationAction(ISD::BITCAST, MVT::v8i8, Expand);
- setOperationAction(ISD::BITCAST, MVT::v4i16, Expand);
- setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
- setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
-
- if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
+
+ if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) {
addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
@@ -948,12 +785,13 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
}
- if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
+ if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) {
addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
@@ -967,6 +805,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::ADD, MVT::v8i16, Legal);
setOperationAction(ISD::ADD, MVT::v4i32, Legal);
setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
@@ -997,13 +836,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- // Only provide customized ctpop vector bit twiddling for vector types we
- // know to perform better than using the popcnt instructions on each vector
- // element. If popcnt isn't supported, always provide the custom version.
- if (!Subtarget->hasPOPCNT()) {
- setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
- setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
- }
+ setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
@@ -1016,6 +852,7 @@ void X86TargetLowering::resetOperationActions() {
continue;
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
@@ -1039,6 +876,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
@@ -1094,39 +933,20 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
}
- if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-
- setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
- setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
- setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
- setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) {
+ for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ }
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
- // There is no BLENDI for byte vectors. We don't need to custom lower
- // some vselects for now.
+ // We directly match byte blends in the backend as they match the VSELECT
+ // condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
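For illustration, a scalar model of the v16i8 VSELECT semantics that the byte-blend instructions match (a sketch only, not the backend pattern or an intrinsic); x86 vector compares produce all-ones or all-zero bytes, so the sign bit of each condition byte carries the whole decision:

#include <cstdint>

static void blendBytes(const uint8_t Cond[16], const uint8_t TrueV[16],
                       const uint8_t FalseV[16], uint8_t Out[16]) {
  for (int i = 0; i < 16; ++i)
    // The sign bit of the condition byte selects between the two sources.
    Out[i] = (Cond[i] & 0x80) ? TrueV[i] : FalseV[i];
}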
// SSE41 brings specific instructions for doing vector sign extend even in
@@ -1137,6 +957,21 @@ void X86TargetLowering::resetOperationActions() {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
}
+ // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
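For illustration, a scalar model (not LLVM code) of one of the SEXTLOAD actions marked Legal above: a v4i8-to-v4i32 sign-extending load, which SSE4.1 performs with a single PMOVSXBD from memory.

#include <cstdint>

static void sextLoadV4i8ToV4i32(const int8_t *Src, int32_t Dst[4]) {
  for (int i = 0; i < 4; ++i)
    Dst[i] = Src[i]; // each i8 lane is sign-extended to i32
}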
+
// i8 and i16 vectors are custom because the source register and source
// memory operand types are not the same width. f32 vectors are
// custom since the immediate controlling the insert encodes additional
@@ -1160,6 +995,10 @@ void X86TargetLowering::resetOperationActions() {
}
if (Subtarget->hasSSE2()) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
+
setOperationAction(ISD::SRL, MVT::v8i16, Custom);
setOperationAction(ISD::SRL, MVT::v16i8, Custom);
@@ -1180,7 +1019,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
}
- if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
+ if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
@@ -1252,11 +1091,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
-
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
@@ -1270,6 +1104,11 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
+
if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
setOperationAction(ISD::FMA, MVT::v8f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f64, Legal);
@@ -1293,29 +1132,31 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, Legal);
setOperationAction(ISD::MUL, MVT::v16i16, Legal);
- // Don't lower v32i8 because there is no 128-bit byte mul
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
- setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
- // Only provide customized ctpop vector bit twiddling for vector types we
- // know to perform better than using the popcnt instructions on each
- // vector element. If popcnt isn't supported, always provide the custom
- // version.
- if (!Subtarget->hasPOPCNT())
- setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
-
- // Custom CTPOP always performs better on natively supported v8i32
- setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+ // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@@ -1330,7 +1171,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i32, Custom);
setOperationAction(ISD::MUL, MVT::v16i16, Custom);
- // Don't lower v32i8 because there is no 128-bit byte mul
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
}
// In the customized shift lowering, the legal cases in AVX2 will be
@@ -1360,6 +1201,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
@@ -1367,6 +1209,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
+ if (Subtarget->hasInt256())
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
@@ -1388,7 +1234,7 @@ void X86TargetLowering::resetOperationActions() {
}
}
- if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
+ if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) {
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
@@ -1401,11 +1247,27 @@ void X86TargetLowering::resetOperationActions() {
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal);
+
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
setOperationAction(ISD::XOR, MVT::i1, Legal);
setOperationAction(ISD::OR, MVT::i1, Legal);
setOperationAction(ISD::AND, MVT::i1, Legal);
+ setOperationAction(ISD::SUB, MVT::i1, Custom);
+ setOperationAction(ISD::ADD, MVT::i1, Custom);
+ setOperationAction(ISD::MUL, MVT::i1, Custom);
setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
@@ -1450,28 +1312,49 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ if (Subtarget->hasDQI()) {
+ setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
+ }
setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+ if (Subtarget->hasDQI()) {
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
+ }
+ setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v16f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v8f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v8f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal);
setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
@@ -1488,6 +1371,8 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
setOperationAction(ISD::ADD, MVT::v8i64, Legal);
setOperationAction(ISD::ADD, MVT::v16i32, Legal);
@@ -1517,10 +1402,23 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
}
-
+ if (Subtarget->hasDQI()) {
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
+ }
// Custom lower several nodes.
for (MVT VT : MVT::vector_valuetypes()) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (EltSize == 1) {
+ setOperationAction(ISD::AND, VT, Legal);
+ setOperationAction(ISD::OR, VT, Legal);
+ setOperationAction(ISD::XOR, VT, Legal);
+ }
+ if (EltSize >= 32 && VT.getSizeInBits() <= 512) {
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
// Extract subvector is special because the value type
// (result) is 256/128-bit but the source is 512-bit wide.
if (VT.is128BitVector() || VT.is256BitVector()) {
@@ -1533,7 +1431,7 @@ void X86TargetLowering::resetOperationActions() {
if (!VT.is512BitVector())
continue;
- if ( EltSize >= 32) {
+ if (EltSize >= 32) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -1557,7 +1455,7 @@ void X86TargetLowering::resetOperationActions() {
}
}// has AVX-512
- if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
+ if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
@@ -1573,6 +1471,24 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SUB, MVT::v32i16, Legal);
setOperationAction(ISD::SUB, MVT::v64i8, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
const MVT VT = (MVT::SimpleValueType)i;
@@ -1590,13 +1506,20 @@ void X86TargetLowering::resetOperationActions() {
}
}
- if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
+ if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) {
addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom);
setOperationAction(ISD::AND, MVT::v8i32, Legal);
setOperationAction(ISD::OR, MVT::v8i32, Legal);
@@ -1604,13 +1527,10 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::AND, MVT::v4i32, Legal);
setOperationAction(ISD::OR, MVT::v4i32, Legal);
setOperationAction(ISD::XOR, MVT::v4i32, Legal);
+ setOperationAction(ISD::SRA, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i64, Custom);
}
- // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
- // of this type with custom code.
- for (MVT VT : MVT::vector_valuetypes())
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
-
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -1667,6 +1587,7 @@ void X86TargetLowering::resetOperationActions() {
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
@@ -1687,16 +1608,14 @@ void X86TargetLowering::resetOperationActions() {
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
- setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
- if (Subtarget->is64Bit())
- setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
- computeRegisterProperties();
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// On Darwin, -Os means optimize for size without hurting performance,
// do not reduce the limit.
@@ -1837,8 +1756,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
MachineFunction &MF) const {
const Function *F = MF.getFunction();
if ((!IsMemset || ZeroMemset) &&
- !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat)) {
+ !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
@@ -1898,6 +1816,10 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
return TargetLowering::getJumpTableEncoding();
}
+bool X86TargetLowering::useSoftFloat() const {
+ return Subtarget->useSoftFloat();
+}
+
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
@@ -1906,7 +1828,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
Subtarget->isPICStyleGOT());
// In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
// entries.
- return MCSymbolRefExpr::Create(MBB->getSymbol(),
+ return MCSymbolRefExpr::create(MBB->getSymbol(),
MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
@@ -1930,17 +1852,17 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
// Otherwise, the reference is relative to the PIC base.
- return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+ return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
-// FIXME: Why this routine is here? Move to RegInfo!
-std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(MVT VT) const{
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
- return TargetLowering::findRepresentativeClass(VT);
+ return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
@@ -2023,7 +1945,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
- RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
+ RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
MVT::i16));
// Copy the result values into the output registers.
@@ -2038,10 +1960,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
else if (VA.getLocInfo() == CCValAssign::ZExt)
ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
- else if (VA.getLocInfo() == CCValAssign::AExt)
- ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::AExt) {
+ if (ValVT.isVector() && ValVT.getScalarType() == MVT::i1)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ }
else if (VA.getLocInfo() == CCValAssign::BCvt)
- ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+ ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
@@ -2078,13 +2004,13 @@ X86TargetLowering::LowerReturn(SDValue Chain,
if (Subtarget->is64Bit()) {
if (ValVT == MVT::x86mmx) {
if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
- ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
+ ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
if (!Subtarget->hasSSE2())
- ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
+ ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
}
}
}
@@ -2094,19 +2020,17 @@ X86TargetLowering::LowerReturn(SDValue Chain,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
- // The x86-64 ABIs require that for returning structs by value we copy
+ // All x86 ABIs require that for returning structs by value we copy
// the sret argument into %rax/%eax (depending on ABI) for the return.
- // Win32 requires us to put the sret argument to %eax as well.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
- if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
- MachineFunction &MF = DAG.getMachineFunction();
- X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- unsigned Reg = FuncInfo->getSRetReturnReg();
- assert(Reg &&
- "SRetReturnReg should have been set in LowerFormalArguments().");
- SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+ //
+ // Checking Function.hasStructRetAttr() here is insufficient because the IR
+ // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+ // false, then an sret argument may be implicitly inserted in the SelDAG. In
+ // either case FuncInfo->setSRetReturnReg() will have been called.
+ if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
unsigned RetValReg
= (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
@@ -2200,7 +2124,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
- EVT CopyVT = VA.getValVT();
+ EVT CopyVT = VA.getLocVT();
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
@@ -2210,18 +2134,24 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
+ bool RoundAfterCopy = false;
if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
- isScalarFPTypeInSSEReg(VA.getValVT()))
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
CopyVT = MVT::f80;
+ RoundAfterCopy = (CopyVT != VA.getLocVT());
+ }
Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
CopyVT, InFlag).getValue(1);
SDValue Val = Chain.getValue(0);
- if (CopyVT != VA.getValVT())
+ if (RoundAfterCopy)
Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
// This truncation won't change the value.
- DAG.getIntPtrConstant(1));
+ DAG.getIntPtrConstant(1, dl));
+
+ if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
InFlag = Chain.getValue(2);
InVals.push_back(Val);
@@ -2281,10 +2211,11 @@ static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
SDLoc dl) {
- SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
/*isVolatile*/false, /*AlwaysInline=*/true,
+ /*isTailCall*/false,
MachinePointerInfo(), MachinePointerInfo());
}
@@ -2337,7 +2268,10 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
// If value is passed by pointer we have address passed instead of the value
// itself.
- if (VA.getLocInfo() == CCValAssign::Indirect)
+ bool ExtendedInMem = VA.isExtInLoc() &&
+ VA.getValVT().getScalarType() == MVT::i1;
+
+ if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
ValVT = VA.getLocVT();
else
ValVT = VA.getValVT();
@@ -2355,9 +2289,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
- return DAG.getLoad(ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0);
+ SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, false, 0);
+ return ExtendedInMem ?
+ DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
}
}
@@ -2393,12 +2329,11 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
}
const Function *Fn = MF.getFunction();
- bool NoImplicitFloatOps = Fn->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
- assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
+ bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
+ bool isSoftFloat = Subtarget->useSoftFloat();
+ assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
- if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
- !Subtarget->hasSSE1())
+ if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
@@ -2421,6 +2356,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
const Function* Fn = MF.getFunction();
if (Fn->hasExternalLinkage() &&
@@ -2501,11 +2437,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::BCvt)
- ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+ ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
if (VA.isExtInLoc()) {
// Handle MMX values passed in XMM regs.
- if (RegVT.isVector())
+ if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
@@ -2523,24 +2459,21 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
InVals.push_back(ArgValue);
}
- if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- // The x86-64 ABIs require that for returning structs by value we copy
- // the sret argument into %rax/%eax (depending on ABI) for the return.
- // Win32 requires us to put the sret argument to %eax as well.
- // Save the argument into a virtual register so that we can access it
- // from the return points.
- if (Ins[i].Flags.isSRet()) {
- unsigned Reg = FuncInfo->getSRetReturnReg();
- if (!Reg) {
- MVT PtrTy = getPointerTy();
- Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
- FuncInfo->setSRetReturnReg(Reg);
- }
- SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
- break;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ // All x86 ABIs require that for returning structs by value we copy the
+ // sret argument into %rax/%eax (depending on ABI) for the return. Save
+ // the argument into a virtual register so that we can access it from the
+ // return points.
+ if (Ins[i].Flags.isSRet()) {
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ MVT PtrTy = getPointerTy();
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+ FuncInfo->setSRetReturnReg(Reg);
}
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ break;
}
}
@@ -2560,10 +2493,16 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
MFI->CreateFixedObject(1, StackSize, true));
}
+ MachineModuleInfo &MMI = MF.getMMI();
+ const Function *WinEHParent = nullptr;
+ if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
+ WinEHParent = MMI.getWinEHParent(Fn);
+ bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
+ bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
+
// Figure out if XMM registers are in use.
- assert(!(MF.getTarget().Options.UseSoftFloat &&
- Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat)) &&
+ assert(!(Subtarget->useSoftFloat() &&
+ Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
// 64-bit calling conventions support varargs and register parameters, so we
@@ -2572,10 +2511,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
- unsigned NumIntRegs =
- CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
- unsigned NumXMMRegs =
- CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
@@ -2599,7 +2536,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
}
if (IsWin64) {
- const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2625,7 +2561,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
unsigned Offset = FuncInfo->getVarArgsGPOffset();
for (SDValue Val : LiveGPRs) {
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(Offset));
+ DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo::getFixedStack(
@@ -2641,9 +2577,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SaveXMMOps.push_back(Chain);
SaveXMMOps.push_back(ALVal);
SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getRegSaveFrameIndex()));
+ FuncInfo->getRegSaveFrameIndex(), dl));
SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getVarArgsFPOffset()));
+ FuncInfo->getVarArgsFPOffset(), dl));
SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
LiveXMMRegs.end());
MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
@@ -2652,6 +2588,27 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (!MemOps.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ } else if (IsWinEHOutlined) {
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject(
+ /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false));
+
+ MMI.getWinEHFuncInfo(Fn)
+ .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] =
+ FuncInfo->getRegSaveFrameIndex();
+
+ // Store the second integer parameter (rdx) into rsp+16 relative to the
+ // stack pointer at the entry of the function.
+ SDValue RSFIN =
+ DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
+ unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
+ Chain = DAG.getStore(
+ Val.getValue(1), dl, Val, RSFIN,
+ MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()),
+ /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0);
}
if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
@@ -2718,6 +2675,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setArgumentStackSize(StackSize);
+ if (IsWinEHParent) {
+ int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
+ SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
+ MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
+ SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64);
+ Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
+ MachinePointerInfo::getFixedStack(UnwindHelpFI),
+ /*isVolatile=*/true,
+ /*isNonTemporal=*/false, /*Alignment=*/0);
+ }
+
return Chain;
}
@@ -2728,7 +2696,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
const CCValAssign &VA,
ISD::ArgFlagsTy Flags) const {
unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
if (Flags.isByVal())
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
@@ -2798,6 +2766,19 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (MF.getTarget().Options.DisableTailCalls)
isTailCall = false;
+ if (Subtarget->isPICStyleGOT() &&
+ !MF.getTarget().Options.GuaranteedTailCallOpt) {
+ // If we are using a GOT, disable tail calls to external symbols with
+ // default visibility. Tail calling such a symbol requires using a GOT
+ // relocation, which forces early binding of the symbol. This breaks code
+ // that requires lazy function symbol resolution. Using musttail or
+ // GuaranteedTailCallOpt will override this.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G || (!G->getGlobal()->hasLocalLinkage() &&
+ G->getGlobal()->hasDefaultVisibility()))
+ isTailCall = false;
+ }
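Restated as a standalone predicate (a sketch with hypothetical parameter names, not the LLVM API), the guard added above keeps a tail call under GOT-style PIC only when the callee cannot be interposed, or when GuaranteedTailCallOpt overrides the concern:

// Returns true if the tail call may be kept under GOT-style PIC.
static bool mayKeepTailCall(bool IsPICStyleGOT, bool GuaranteedTCO,
                            bool CalleeIsGlobalAddress,
                            bool CalleeHasLocalLinkage,
                            bool CalleeHasDefaultVisibility) {
  if (!IsPICStyleGOT || GuaranteedTCO)
    return true;
  // A preemptible (non-local, default-visibility) callee would need a GOT
  // relocation, forcing early binding and defeating lazy symbol resolution.
  if (!CalleeIsGlobalAddress)
    return false;
  return CalleeHasLocalLinkage || !CalleeHasDefaultVisibility;
}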
+
bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
@@ -2874,7 +2855,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!IsSibcall)
Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
+ Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -2888,8 +2869,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -2912,16 +2892,19 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
- if (RegVT.is128BitVector()) {
+ if (Arg.getValueType().isVector() &&
+ Arg.getValueType().getScalarType() == MVT::i1)
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ else if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
- Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+ Arg = DAG.getBitcast(MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
} else
Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
+ Arg = DAG.getBitcast(RegVT, Arg);
break;
case CCValAssign::Indirect: {
// Store the argument.
@@ -2980,8 +2963,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Note: The actual moving to ECX is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
- if (G && !G->getGlobal()->hasHiddenVisibility() &&
- !G->getGlobal()->hasProtectedVisibility())
+ if (G && !G->getGlobal()->hasLocalLinkage() &&
+ G->getGlobal()->hasDefaultVisibility())
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee, DAG);
@@ -3002,12 +2985,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
- DAG.getConstant(NumXMMRegs, MVT::i8)));
+ DAG.getConstant(NumXMMRegs, dl,
+ MVT::i8)));
}
if (isVarArg && IsMustTail) {
@@ -3051,7 +3035,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Flags.isByVal()) {
// Copy relative to framepointer.
- SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
if (!StackPtr.getNode())
StackPtr = DAG.getCopyFromReg(Chain, dl,
RegInfo->getStackRegister(),
@@ -3124,11 +3108,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// unless we're building with the leopard linker or later, which
// automatically synthesizes these stubs.
OpFlags = X86II::MO_DARWIN_STUB;
- } else if (Subtarget->isPICStyleRIPRel() &&
- isa<Function>(GV) &&
- cast<Function>(GV)->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NonLazyBind)) {
+ } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
+ cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
// at the cost of eager binding (and one extra byte of encoding).
@@ -3168,7 +3149,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
OpFlags);
- } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
+ } else if (Subtarget->isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
+ // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
@@ -3179,8 +3161,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!IsSibcall && isTailCall) {
Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getIntPtrConstant(NumBytesToPop, true),
- DAG.getIntPtrConstant(0, true), InFlag, dl);
+ DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
InFlag = Chain.getValue(1);
}
@@ -3188,7 +3170,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Ops.push_back(Callee);
if (isTailCall)
- Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+ Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
@@ -3197,8 +3179,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3212,6 +3194,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
+ MF.getFrameInfo()->setHasTailCall();
return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
@@ -3237,8 +3220,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getIntPtrConstant(NumBytesToPop, true),
- DAG.getIntPtrConstant(NumBytesForCalleeToPop,
+ DAG.getIntPtrConstant(NumBytesToPop, dl, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
true),
InFlag, dl);
InFlag = Chain.getValue(1);
@@ -3286,11 +3269,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
@@ -3327,7 +3307,8 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
return false;
} else {
unsigned Opcode = Def->getOpcode();
- if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
@@ -3392,6 +3373,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+ // Win64 functions have extra shadow space for argument homing. Don't do the
+ // sibcall if the caller and callee have mismatched expectations for this
+ // space.
+ if (IsCalleeWin64 != IsCallerWin64)
+ return false;
+
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
@@ -3403,8 +3390,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
@@ -3516,8 +3502,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -3614,17 +3599,6 @@ static bool isTargetShuffle(unsigned Opcode) {
}
static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
- SDValue V1, SelectionDAG &DAG) {
- switch(Opc) {
- default: llvm_unreachable("Unknown x86 shuffle node");
- case X86ISD::MOVSHDUP:
- case X86ISD::MOVSLDUP:
- case X86ISD::MOVDDUP:
- return DAG.getNode(Opc, dl, VT, V1);
- }
-}
-
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue V1, unsigned TargetMask,
SelectionDAG &DAG) {
switch(Opc) {
@@ -3634,21 +3608,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
case X86ISD::PSHUFLW:
case X86ISD::VPERMILPI:
case X86ISD::VPERMI:
- return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
- }
-}
-
-static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
- SDValue V1, SDValue V2, unsigned TargetMask,
- SelectionDAG &DAG) {
- switch(Opc) {
- default: llvm_unreachable("Unknown x86 shuffle node");
- case X86ISD::PALIGNR:
- case X86ISD::VALIGN:
- case X86ISD::SHUFP:
- case X86ISD::VPERM2X128:
- return DAG.getNode(Opc, dl, VT, V1, V2,
- DAG.getConstant(TargetMask, MVT::i8));
+ return DAG.getNode(Opc, dl, VT, V1,
+ DAG.getConstant(TargetMask, dl, MVT::i8));
}
}
@@ -3671,8 +3632,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
@@ -3759,13 +3719,13 @@ static bool isX86CCUnsigned(unsigned X86CC) {
/// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
-static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP,
SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> X == 0, jump !sign.
- RHS = DAG.getConstant(0, RHS.getValueType());
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_NS;
}
if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
@@ -3774,7 +3734,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
}
if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
// X < 1 -> X <= 0
- RHS = DAG.getConstant(0, RHS.getValueType());
+ RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
}
}
@@ -3939,849 +3899,6 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
return true;
}
-/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFD. That is, it doesn't reference the other
-/// operand - by default it will match the first operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
- bool TestSecondOperand = false) {
- if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
- VT != MVT::v2f64 && VT != MVT::v2i64)
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
- unsigned Lo = TestSecondOperand ? NumElems : 0;
- unsigned Hi = Lo + NumElems;
-
- for (unsigned i = 0; i < NumElems; ++i)
- if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
- return false;
-
- return true;
-}
-
-/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
- if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
- return false;
-
- // Lower quadword copied in order or undef.
- if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
- return false;
-
- // Upper quadword shuffled.
- for (unsigned i = 4; i != 8; ++i)
- if (!isUndefOrInRange(Mask[i], 4, 8))
- return false;
-
- if (VT == MVT::v16i16) {
- // Lower quadword copied in order or undef.
- if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
- return false;
-
- // Upper quadword shuffled.
- for (unsigned i = 12; i != 16; ++i)
- if (!isUndefOrInRange(Mask[i], 12, 16))
- return false;
- }
-
- return true;
-}
-
-/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
- if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
- return false;
-
- // Upper quadword copied in order.
- if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
- return false;
-
- // Lower quadword shuffled.
- for (unsigned i = 0; i != 4; ++i)
- if (!isUndefOrInRange(Mask[i], 0, 4))
- return false;
-
- if (VT == MVT::v16i16) {
- // Upper quadword copied in order.
- if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
- return false;
-
- // Lower quadword shuffled.
- for (unsigned i = 8; i != 12; ++i)
- if (!isUndefOrInRange(Mask[i], 8, 12))
- return false;
- }
-
- return true;
-}
-
-/// \brief Return true if the mask specifies a shuffle of elements that is
-/// suitable for input to intralane (palignr) or interlane (valign) vector
-/// right-shift.
-static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- // Do not handle 64-bit element shuffles with palignr.
- if (NumLaneElts == 2)
- return false;
-
- for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
- unsigned i;
- for (i = 0; i != NumLaneElts; ++i) {
- if (Mask[i+l] >= 0)
- break;
- }
-
- // Lane is all undef, go to next lane
- if (i == NumLaneElts)
- continue;
-
- int Start = Mask[i+l];
-
- // Make sure it's in this lane in one of the sources
- if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
- !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
- return false;
-
- // If not lane 0, then we must match lane 0
- if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
- return false;
-
- // Correct second source to be contiguous with first source
- if (Start >= (int)NumElts)
- Start -= NumElts - NumLaneElts;
-
- // Make sure we're shifting in the right direction.
- if (Start <= (int)(i+l))
- return false;
-
- Start -= i;
-
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != NumLaneElts; ++i) {
- int Idx = Mask[i+l];
-
- // Make sure it's in this lane
- if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
- !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
- return false;
-
- // If not lane 0, then we must match lane 0
- if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
- return false;
-
- if (Idx >= (int)NumElts)
- Idx -= NumElts - NumLaneElts;
-
- if (!isUndefOrEqual(Idx, Start+i))
- return false;
-
- }
- }
-
- return true;
-}
-
-/// \brief Return true if the node specifies a shuffle of elements that is
-/// suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
- const X86Subtarget *Subtarget) {
- if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
- (VT.is256BitVector() && !Subtarget->hasInt256()) ||
- VT.is512BitVector())
- // FIXME: Add AVX512BW.
- return false;
-
- return isAlignrMask(Mask, VT, false);
-}
-
-/// \brief Return true if the node specifies a shuffle of elements that is
-/// suitable for input to VALIGN.
-static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
- const X86Subtarget *Subtarget) {
- // FIXME: Add AVX512VL.
- if (!VT.is512BitVector() || !Subtarget->hasAVX512())
- return false;
- return isAlignrMask(Mask, VT, true);
-}
-
-/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
-/// the two vector operands have swapped position.
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
- unsigned NumElems) {
- for (unsigned i = 0; i != NumElems; ++i) {
- int idx = Mask[i];
- if (idx < 0)
- continue;
- else if (idx < (int)NumElems)
- Mask[i] = idx + NumElems;
- else
- Mask[i] = idx - NumElems;
- }
-}
-
-/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128/256-bit
-/// SHUFPS and SHUFPD. If Commuted is true, then it checks whether the sources
-/// are the reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
-
- unsigned NumElems = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElems = NumElems/NumLanes;
-
- if (NumLaneElems != 2 && NumLaneElems != 4)
- return false;
-
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- bool symetricMaskRequired =
- (VT.getSizeInBits() >= 256) && (EltSize == 32);
-
- // VSHUFPSY divides the resulting vector into 4 chunks.
- // The sources are also split into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
- // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
- //
- // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
- // Y3..Y0, Y3..Y0, X3..X0, X3..X0
- //
- // VSHUFPDY divides the resulting vector into 4 chunks.
- // The sources are also split into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X3 X2 X1 X0
- // SRC2 => Y3 Y2 Y1 Y0
- //
- // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
- //
- SmallVector<int, 4> MaskVal(NumLaneElems, -1);
- unsigned HalfLaneElems = NumLaneElems/2;
- for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
- for (unsigned i = 0; i != NumLaneElems; ++i) {
- int Idx = Mask[i+l];
- unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
- if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
- return false;
- // For VSHUFPSY, the mask of the second half must be the same as the
- // first but with the appropriate offsets. This works in the same way as
- // VPERMILPS works with masks.
- if (!symetricMaskRequired || Idx < 0)
- continue;
- if (MaskVal[i] < 0) {
- MaskVal[i] = Idx - l;
- continue;
- }
- if ((signed)(Idx - l) != MaskVal[i])
- return false;
- }
- }
-
- return true;
-}
-
-/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 4)
- return false;
-
- // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
- return isUndefOrEqual(Mask[0], 6) &&
- isUndefOrEqual(Mask[1], 7) &&
- isUndefOrEqual(Mask[2], 2) &&
- isUndefOrEqual(Mask[3], 3);
-}
-
-/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
-/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
-/// <2, 3, 2, 3>
-static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 4)
- return false;
-
- return isUndefOrEqual(Mask[0], 2) &&
- isUndefOrEqual(Mask[1], 3) &&
- isUndefOrEqual(Mask[2], 2) &&
- isUndefOrEqual(Mask[3], 3);
-}
-
-/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 2 && NumElems != 4)
- return false;
-
- for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i + NumElems))
- return false;
-
- for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
-
- return true;
-}
-
-/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
-static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 2 && NumElems != 4)
- return false;
-
- for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
-
- for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i + e], i + NumElems))
- return false;
-
- return true;
-}
-
-/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to INSERTPS.
-/// i.e., if all but one element comes from the same vector.
-static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
- // TODO: Deal with AVX's VINSERTPS
- if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
- return false;
-
- unsigned CorrectPosV1 = 0;
- unsigned CorrectPosV2 = 0;
- for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
- if (Mask[i] == -1) {
- ++CorrectPosV1;
- ++CorrectPosV2;
- continue;
- }
-
- if (Mask[i] == i)
- ++CorrectPosV1;
- else if (Mask[i] == i + 4)
- ++CorrectPosV2;
- }
-
- if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
- // We have 3 elements (undefs count as elements from any vector) from one
- // vector, and one from another.
- return true;
-
- return false;
-}
-
-//
-// Some special combinations that can be optimized.
-//
-static
-SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
- SelectionDAG &DAG) {
- MVT VT = SVOp->getSimpleValueType(0);
- SDLoc dl(SVOp);
-
- if (VT != MVT::v8i32 && VT != MVT::v8f32)
- return SDValue();
-
- ArrayRef<int> Mask = SVOp->getMask();
-
- // These are the special masks that may be optimized.
- static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
- static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
- bool MatchEvenMask = true;
- bool MatchOddMask = true;
- for (int i=0; i<8; ++i) {
- if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
- MatchEvenMask = false;
- if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
- MatchOddMask = false;
- }
-
- if (!MatchEvenMask && !MatchOddMask)
- return SDValue();
-
- SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
-
- SDValue Op0 = SVOp->getOperand(0);
- SDValue Op1 = SVOp->getOperand(1);
-
- if (MatchEvenMask) {
- // Shift the second operand right by 32 bits.
- static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
- Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
- } else {
- // Shift the first operand left by 32 bits.
- static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
- Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
- }
- static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
- return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
-}
-
-/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
- bool HasInt256, bool V2IsSplat = false) {
-
- assert(VT.getSizeInBits() >= 128 &&
- "Unsupported vector type for unpckl");
-
- unsigned NumElts = VT.getVectorNumElements();
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
- return false;
-
- assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
- "Unsupported vector type for unpckh");
-
- // AVX defines UNPCK* to operate independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (V2IsSplat) {
- if (!isUndefOrEqual(BitI1, NumElts))
- return false;
- } else {
- if (!isUndefOrEqual(BitI1, j + NumElts))
- return false;
- }
- }
- }
-
- return true;
-}
-
-/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
- bool HasInt256, bool V2IsSplat = false) {
- assert(VT.getSizeInBits() >= 128 &&
- "Unsupported vector type for unpckh");
-
- unsigned NumElts = VT.getVectorNumElements();
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
- return false;
-
- assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
- "Unsupported vector type for unpckh");
-
- // AVX defines UNPCK* to operate independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (V2IsSplat) {
- if (isUndefOrEqual(BitI1, NumElts))
- return false;
- } else {
- if (!isUndefOrEqual(BitI1, j+NumElts))
- return false;
- }
- }
- }
- return true;
-}
-
-/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
-/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
-/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
- unsigned NumElts = VT.getVectorNumElements();
- bool Is256BitVec = VT.is256BitVector();
-
- if (VT.is512BitVector())
- return false;
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Unsupported vector type for unpckh");
-
- if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
- return false;
-
- // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
- // FIXME: Need a better way to get rid of this, there's no latency difference
- // between UNPCKLPD and MOVDDUP; the latter should always be checked first and
- // the former later. We should also remove the "_undef" special mask.
- if (NumElts == 4 && Is256BitVec)
- return false;
-
- // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
- // independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
-
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (!isUndefOrEqual(BitI1, j))
- return false;
- }
- }
-
- return true;
-}
-
-/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
-/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
-/// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
- unsigned NumElts = VT.getVectorNumElements();
-
- if (VT.is512BitVector())
- return false;
-
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Unsupported vector type for unpckh");
-
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
- return false;
-
- // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
- // independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (!isUndefOrEqual(BitI1, j))
- return false;
- }
- }
- return true;
-}
-
-// Match for INSERTI64x4/INSERTF64x4 instructions (src0[0], src1[0]) or
-// (src1[0], src0[1]); manipulation of 256-bit sub-vectors.
-static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
- if (!VT.is512BitVector())
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfSize = NumElts/2;
- if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
- if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
- *Imm = 1;
- return true;
- }
- }
- if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
- if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
- *Imm = 0;
- return true;
- }
- }
- return false;
-}
-
-/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSS,
-/// MOVSD, and MOVD, i.e. setting the lowest element.
-static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
- if (VT.getVectorElementType().getSizeInBits() < 32)
- return false;
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
-
- if (!isUndefOrEqual(Mask[0], NumElts))
- return false;
-
- for (unsigned i = 1; i != NumElts; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
-
- return true;
-}
-
-/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
-/// as permutations between 128-bit chunks or halves. As an example: this
-/// shuffle below:
-/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
-/// The first half comes from the second half of V1 and the second half from
-/// the second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
- if (!HasFp256 || !VT.is256BitVector())
- return false;
-
- // The shuffle result is divided into half A and half B. In total the two
- // sources have 4 halves, namely: C, D, E, F. The final values of A and
- // B must come from C, D, E or F.
- unsigned HalfSize = VT.getVectorNumElements()/2;
- bool MatchA = false, MatchB = false;
-
- // Check if A comes from one of C, D, E, F.
- for (unsigned Half = 0; Half != 4; ++Half) {
- if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
- MatchA = true;
- break;
- }
- }
-
- // Check if B comes from one of C, D, E, F.
- for (unsigned Half = 0; Half != 4; ++Half) {
- if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
- MatchB = true;
- break;
- }
- }
-
- return MatchA && MatchB;
-}
-
-/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
-static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
- MVT VT = SVOp->getSimpleValueType(0);
-
- unsigned HalfSize = VT.getVectorNumElements()/2;
-
- unsigned FstHalf = 0, SndHalf = 0;
- for (unsigned i = 0; i < HalfSize; ++i) {
- if (SVOp->getMaskElt(i) > 0) {
- FstHalf = SVOp->getMaskElt(i)/HalfSize;
- break;
- }
- }
- for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
- if (SVOp->getMaskElt(i) > 0) {
- SndHalf = SVOp->getMaskElt(i)/HalfSize;
- break;
- }
- }
-
- return (FstHalf | (SndHalf << 4));
-}
-
-// Symmetric in-lane mask. Each lane has 4 elements (for imm8).
-static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- if (EltSize < 32)
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
- Imm8 = 0;
- if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
- for (unsigned i = 0; i != NumElts; ++i) {
- if (Mask[i] < 0)
- continue;
- Imm8 |= Mask[i] << (i*2);
- }
- return true;
- }
-
- unsigned LaneSize = 4;
- SmallVector<int, 4> MaskVal(LaneSize, -1);
-
- for (unsigned l = 0; l != NumElts; l += LaneSize) {
- for (unsigned i = 0; i != LaneSize; ++i) {
- if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
- return false;
- if (Mask[i+l] < 0)
- continue;
- if (MaskVal[i] < 0) {
- MaskVal[i] = Mask[i+l] - l;
- Imm8 |= MaskVal[i] << (i*2);
- continue;
- }
- if (Mask[i+l] != (signed)(MaskVal[i]+l))
- return false;
- }
- }
- return true;
-}
-
-/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
-/// Note that VPERMIL mask matching is different depending on whether the
-/// underlying type is 32 or 64. In VPERMILPS the high half of the mask should
-/// point to the same elements as the low half, but in the higher half of the
-/// source.
-/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- if (VT.getSizeInBits() < 256 || EltSize < 32)
- return false;
- bool symetricMaskRequired = (EltSize == 32);
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned LaneSize = NumElts/NumLanes;
- // 2 or 4 elements in one lane
-
- SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
- for (unsigned l = 0; l != NumElts; l += LaneSize) {
- for (unsigned i = 0; i != LaneSize; ++i) {
- if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
- return false;
- if (symetricMaskRequired) {
- if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
- ExpectedMaskVal[i] = Mask[i+l] - l;
- continue;
- }
- if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
- return false;
- }
- }
- }
- return true;
-}
-
-/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
-/// x86 movss wants: the lowest element must be the lowest element of vector 2,
-/// and the other elements must come from vector 1 in order.
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
- bool V2IsSplat = false, bool V2IsUndef = false) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned NumOps = VT.getVectorNumElements();
- if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
- return false;
-
- if (!isUndefOrEqual(Mask[0], 0))
- return false;
-
- for (unsigned i = 1; i != NumOps; ++i)
- if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
- (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
- (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
- return false;
-
- return true;
-}
-
-/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
-/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
-static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasSSE3())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if ((VT.is128BitVector() && NumElems != 4) ||
- (VT.is256BitVector() && NumElems != 8) ||
- (VT.is512BitVector() && NumElems != 16))
- return false;
-
- // "i+1" is the value the indexed mask element must have
- for (unsigned i = 0; i != NumElems; i += 2)
- if (!isUndefOrEqual(Mask[i], i+1) ||
- !isUndefOrEqual(Mask[i+1], i+1))
- return false;
-
- return true;
-}
-
-/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
-/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
-static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasSSE3())
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if ((VT.is128BitVector() && NumElems != 4) ||
- (VT.is256BitVector() && NumElems != 8) ||
- (VT.is512BitVector() && NumElems != 16))
- return false;
-
- // "i" is the value the indexed mask element must have
- for (unsigned i = 0; i != NumElems; i += 2)
- if (!isUndefOrEqual(Mask[i], i) ||
- !isUndefOrEqual(Mask[i+1], i))
- return false;
-
- return true;
-}
-
-/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// version of MOVDDUP.
-static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
- if (!HasFp256 || !VT.is256BitVector())
- return false;
-
- unsigned NumElts = VT.getVectorNumElements();
- if (NumElts != 4)
- return false;
-
- for (unsigned i = 0; i != NumElts/2; ++i)
- if (!isUndefOrEqual(Mask[i], 0))
- return false;
- for (unsigned i = NumElts/2; i != NumElts; ++i)
- if (!isUndefOrEqual(Mask[i], NumElts/2))
- return false;
- return true;
-}
-
-/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128-bit
-/// version of MOVDDUP.
-static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- unsigned e = VT.getVectorNumElements() / 2;
- for (unsigned i = 0; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
- for (unsigned i = 0; i != e; ++i)
- if (!isUndefOrEqual(Mask[e+i], i))
- return false;
- return true;
-}
-
/// isVEXTRACTIndex - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
/// suitable for instruction that extract 128 or 256 bit vectors
@@ -4835,125 +3952,6 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
return isVEXTRACTIndex(N, 256);
}
-/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
-/// Handles 128-bit and 256-bit.
-static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getSimpleValueType(0);
-
- assert((VT.getSizeInBits() >= 128) &&
- "Unsupported vector type for PSHUF/SHUFP");
-
- // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
- // independently on 128-bit lanes.
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
- "Only supports 2, 4 or 8 elements per lane");
-
- unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
- unsigned Mask = 0;
- for (unsigned i = 0; i != NumElts; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt < 0) continue;
- Elt &= NumLaneElts - 1;
- unsigned ShAmt = (i << Shift) % 8;
- Mask |= Elt << ShAmt;
- }
-
- return Mask;
-}
-
-/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
-static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getSimpleValueType(0);
-
- assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
- "Unsupported vector type for PSHUFHW");
-
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned Mask = 0;
- for (unsigned l = 0; l != NumElts; l += 8) {
- // 8 nodes per lane, but we only care about the last 4.
- for (unsigned i = 0; i < 4; ++i) {
- int Elt = N->getMaskElt(l+i+4);
- if (Elt < 0) continue;
- Elt &= 0x3; // only 2-bits.
- Mask |= Elt << (i * 2);
- }
- }
-
- return Mask;
-}
-
-/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
-static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getSimpleValueType(0);
-
- assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
- "Unsupported vector type for PSHUFHW");
-
- unsigned NumElts = VT.getVectorNumElements();
-
- unsigned Mask = 0;
- for (unsigned l = 0; l != NumElts; l += 8) {
- // 8 nodes per lane, but we only care about the first 4.
- for (unsigned i = 0; i < 4; ++i) {
- int Elt = N->getMaskElt(l+i);
- if (Elt < 0) continue;
- Elt &= 0x3; // only 2-bits
- Mask |= Elt << (i * 2);
- }
- }
-
- return Mask;
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
-/// VALIGN (if InterLane is true) instructions.
-static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
- bool InterLane) {
- MVT VT = SVOp->getSimpleValueType(0);
- unsigned EltSize = InterLane ? 1 :
- VT.getVectorElementType().getSizeInBits() >> 3;
-
- unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
-
- int Val = 0;
- unsigned i;
- for (i = 0; i != NumElts; ++i) {
- Val = SVOp->getMaskElt(i);
- if (Val >= 0)
- break;
- }
- if (Val >= (int)NumElts)
- Val -= NumElts - NumLaneElts;
-
- assert(Val - i > 0 && "PALIGNR imm should be positive");
- return (Val - i) * EltSize;
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the PALIGNR instruction.
-static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
- return getShuffleAlignrImmediate(SVOp, false);
-}
-
-/// \brief Return the appropriate immediate to shuffle the specified
-/// VECTOR_SHUFFLE mask with the VALIGN instruction.
-static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
- return getShuffleAlignrImmediate(SVOp, true);
-}
-
-
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
@@ -5028,119 +4026,6 @@ bool X86::isZeroNode(SDValue Elt) {
return false;
}
-/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
-/// match movhlps. The lower half elements should come from the upper half of
-/// V1 (and in order), and the upper half elements should come from the upper
-/// half of V2 (and in order).
-static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
- if (VT.getVectorNumElements() != 4)
- return false;
- for (unsigned i = 0, e = 2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i+2))
- return false;
- for (unsigned i = 2; i != 4; ++i)
- if (!isUndefOrEqual(Mask[i], i+4))
- return false;
- return true;
-}
-
-/// isScalarLoadToVector - Returns true if the node is a scalar load that
-/// is promoted to a vector. It also returns the LoadSDNode by reference if
-/// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
- if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
- return false;
- N = N->getOperand(0).getNode();
- if (!ISD::isNON_EXTLoad(N))
- return false;
- if (LD)
- *LD = cast<LoadSDNode>(N);
- return true;
-}
-
-// Test whether the given value is a vector value which will be legalized
-// into a load.
-static bool WillBeConstantPoolLoad(SDNode *N) {
- if (N->getOpcode() != ISD::BUILD_VECTOR)
- return false;
-
- // Check for any non-constant elements.
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
- switch (N->getOperand(i).getNode()->getOpcode()) {
- case ISD::UNDEF:
- case ISD::ConstantFP:
- case ISD::Constant:
- break;
- default:
- return false;
- }
-
- // Vectors of all-zeros and all-ones are materialized with special
- // instructions rather than being loaded.
- return !ISD::isBuildVectorAllZeros(N) &&
- !ISD::isBuildVectorAllOnes(N);
-}
-
-/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
-/// match movlp{s|d}. The lower half elements should come from the lower half of
-/// V1 (and in order), and the upper half elements should come from the upper
-/// half of V2 (and in order). And since V1 will become the source of the
-/// MOVLP, it must be either a vector load or a scalar load to vector.
-static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
- ArrayRef<int> Mask, MVT VT) {
- if (!VT.is128BitVector())
- return false;
-
- if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
- return false;
- // If V2 is a vector load, don't do this transformation. We will try to use
- // a load-folding shufps op instead.
- if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
- return false;
-
- unsigned NumElems = VT.getVectorNumElements();
-
- if (NumElems != 2 && NumElems != 4)
- return false;
- for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i))
- return false;
- for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
- if (!isUndefOrEqual(Mask[i], i+NumElems))
- return false;
- return true;
-}
-
-/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
-/// to a zero vector.
-/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
-static bool isZeroShuffle(ShuffleVectorSDNode *N) {
- SDValue V1 = N->getOperand(0);
- SDValue V2 = N->getOperand(1);
- unsigned NumElems = N->getValueType(0).getVectorNumElements();
- for (unsigned i = 0; i != NumElems; ++i) {
- int Idx = N->getMaskElt(i);
- if (Idx >= (int)NumElems) {
- unsigned Opc = V2.getOpcode();
- if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
- continue;
- if (Opc != ISD::BUILD_VECTOR ||
- !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
- return false;
- } else if (Idx >= 0) {
- unsigned Opc = V1.getOpcode();
- if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
- continue;
- if (Opc != ISD::BUILD_VECTOR ||
- !X86::isZeroNode(V1.getOperand(Idx)))
- return false;
- }
- }
- return true;
-}
-
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
@@ -5152,38 +4037,198 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Vec;
if (VT.is128BitVector()) { // SSE
if (Subtarget->hasSSE2()) { // SSE2
- SDValue Cst = DAG.getConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
} else { // SSE1
- SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
+ SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
} else if (VT.is256BitVector()) { // AVX
if (Subtarget->hasInt256()) { // AVX2
- SDValue Cst = DAG.getConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else {
// 256-bit logic and arithmetic instructions in AVX are all
// floating-point, no support for integer ops. Emit fp zeroed vectors.
- SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
+ SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
}
} else if (VT.is512BitVector()) { // AVX-512
- SDValue Cst = DAG.getConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, dl, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
} else if (VT.getScalarType() == MVT::i1) {
- assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
- SDValue Cst = DAG.getConstant(0, MVT::i1);
- SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+
+ assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
+ && "Unexpected vector type");
+ assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
+ && "Unexpected vector type");
+ SDValue Cst = DAG.getConstant(0, dl, MVT::i1);
+ SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
} else
llvm_unreachable("Unexpected vector type");
- return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+ return DAG.getBitcast(VT, Vec);
+}
+
+static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl,
+ unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ unsigned Factor = VT.getSizeInBits()/vectorWidth;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
+
+ // Extract from UNDEF is UNDEF.
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return DAG.getUNDEF(ResultVT);
+
+ // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
+ unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+
+ // This is the index of the first element of the vectorWidth-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
+ * ElemsPerChunk);
+
+ // If the input is a buildvector just emit a smaller one.
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+ makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
+ ElemsPerChunk));
+
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
+}
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert((Vec.getValueType().is256BitVector() ||
+ Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
+}
+
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+ return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue InsertSubVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
+ SDLoc dl, unsigned vectorWidth) {
+ assert((vectorWidth == 128 || vectorWidth == 256) &&
+ "Unsupported vector width");
+ // Inserting UNDEF is Result
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
+ EVT VT = Vec.getValueType();
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
+
+ // Insert the relevant vectorWidth bits.
+ unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+
+ // This is the index of the first element of the vectorWidth-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
+ * ElemsPerChunk);
+
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits. This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference. Idx is an index in the 128 bits
+/// we want. It need not be aligned to a 128-bit boundary. That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+ // For insertion into the zero index (low half) of a 256-bit vector, it is
+ // more efficient to generate a blend with immediate instead of an insert*128.
+ // We are still creating an INSERT_SUBVECTOR below with an undef node to
+ // extend the subvector to the size of the result vector. Make sure that
+ // we are not recursing on that node by checking for undef here.
+ if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+ Result.getOpcode() != ISD::UNDEF) {
+ EVT ResultVT = Result.getValueType();
+ SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
+ SDValue Undef = DAG.getUNDEF(ResultVT);
+ SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+ Vec, ZeroIndex);
+
+ // The blend instruction, and therefore its mask, depend on the data type.
+ MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
+ if (ScalarType.isFloatingPoint()) {
+ // Choose either vblendps (float) or vblendpd (double).
+ unsigned ScalarSize = ScalarType.getSizeInBits();
+ assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+ unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+ SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+ }
+
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+ // AVX2 is needed for 256-bit integer blend support.
+ // Integers must be cast to 32-bit because there is only vpblendd;
+ // vpblendw can't be used for this because it has a handicapped mask.
+
+ // If we don't have AVX2, then cast to float. Using a wrong domain blend
+ // is still more efficient than using the wrong domain vinsertf128 that
+ // will be created by InsertSubVector().
+ MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+ SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
+ Vec256 = DAG.getBitcast(CastVT, Vec256);
+ Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
+ return DAG.getBitcast(ResultVT, Vec256);
+ }
+
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, SDLoc dl) {
+ assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+ return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ SDLoc dl) {
+ SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
/// getOnesVector - Returns a vector of specified type with all bits set.
@@ -5194,7 +4239,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- SDValue Cst = DAG.getConstant(~0U, MVT::i32);
+ SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
SDValue Vec;
if (VT.is256BitVector()) {
if (HasInt256) { // AVX2
@@ -5209,17 +4254,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
} else
llvm_unreachable("Unexpected vector type");
- return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
-}
-
-/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
-/// that point to V2 point to its first element.
-static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
- for (unsigned i = 0; i != NumElems; ++i) {
- if (Mask[i] > (int)NumElems) {
- Mask[i] = NumElems;
- }
- }
+ return DAG.getBitcast(VT, Vec);
}
/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
@@ -5258,92 +4293,6 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
-// a generic shuffle instruction because the target has no such instructions.
-// Generate shuffles which repeat i16 and i8 several times until they can be
-// represented by v4f32 and then be manipulated by target supported shuffles.
-static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
- MVT VT = V.getSimpleValueType();
- int NumElems = VT.getVectorNumElements();
- SDLoc dl(V);
-
- while (NumElems > 4) {
- if (EltNo < NumElems/2) {
- V = getUnpackl(DAG, dl, VT, V, V);
- } else {
- V = getUnpackh(DAG, dl, VT, V, V);
- EltNo -= NumElems/2;
- }
- NumElems >>= 1;
- }
- return V;
-}
-
-/// getLegalSplat - Generate a legal splat with supported x86 shuffles
-static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
- MVT VT = V.getSimpleValueType();
- SDLoc dl(V);
-
- if (VT.is128BitVector()) {
- V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
- int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
- V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
- &SplatMask[0]);
- } else if (VT.is256BitVector()) {
- // To use VPERMILPS to splat scalars, the second half of indices must
- // refer to the higher part, which is a duplication of the lower one,
- // because VPERMILPS can only handle in-lane permutations.
- int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
- EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
-
- V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
- V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
- &SplatMask[0]);
- } else
- llvm_unreachable("Vector size not supported");
-
- return DAG.getNode(ISD::BITCAST, dl, VT, V);
-}
-
-/// PromoteSplat - Splat is promoted to target supported vector shuffles.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
- MVT SrcVT = SV->getSimpleValueType(0);
- SDValue V1 = SV->getOperand(0);
- SDLoc dl(SV);
-
- int EltNo = SV->getSplatIndex();
- int NumElems = SrcVT.getVectorNumElements();
- bool Is256BitVec = SrcVT.is256BitVector();
-
- assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
- "Unknown how to promote splat for type");
-
- // Extract the 128-bit part containing the splat element and update
- // the splat element index when it refers to the higher register.
- if (Is256BitVec) {
- V1 = Extract128BitVector(V1, EltNo, DAG, dl);
- if (EltNo >= NumElems/2)
- EltNo -= NumElems/2;
- }
-
- // All i16 and i8 vector types can't be used directly by a generic shuffle
- // instruction because the target has no such instruction. Generate shuffles
- // which repeat i16 and i8 several times until they fit in i32, and then can
- // be manipulated by target suported shuffles.
- MVT EltVT = SrcVT.getVectorElementType();
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- V1 = PromoteSplati8i16(V1, DAG, EltNo);
-
- // Recreate the 256-bit vector and place the same 128-bit vector
- // into the low and high part. This is necessary because we want
- // to use VPERM* to shuffle the vectors
- if (Is256BitVec) {
- V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
- }
-
- return getLegalSplat(DAG, V1, EltNo);
-}
-
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector of zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
@@ -5467,7 +4416,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
return false;
SDValue Ptr = MaskLoad->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper)
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
Ptr = Ptr->getOperand(0);
auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
@@ -5489,16 +4439,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
IsUnary = true;
break;
case X86ISD::MOVSS:
- case X86ISD::MOVSD: {
- // The index 0 always comes from the first element of the second source,
- // this is why MOVSS and MOVSD are used in the first place. The other
- // elements come from the other positions of the first source vector
- Mask.push_back(NumElems);
- for (unsigned i = 1; i != NumElems; ++i) {
- Mask.push_back(i);
- }
+ case X86ISD::MOVSD:
+ DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
break;
- }
case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
@@ -5506,11 +4449,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVSHDUP:
DecodeMOVSHDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVDDUP:
+ DecodeMOVDDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
case X86ISD::MOVLHPD:
case X86ISD::MOVLPD:
case X86ISD::MOVLPS:
@@ -5594,148 +4542,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
return SDValue();
}
-/// getNumOfConsecutiveZeros - Return the number of elements of a vector
-/// shuffle operation which come from a consecutively from a zero. The
-/// search can start in two different directions, from left or right.
-/// We count undefs as zeros until PreferredNum is reached.
-static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
- unsigned NumElems, bool ZerosFromLeft,
- SelectionDAG &DAG,
- unsigned PreferredNum = -1U) {
- unsigned NumZeros = 0;
- for (unsigned i = 0; i != NumElems; ++i) {
- unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
- SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
- if (!Elt.getNode())
- break;
-
- if (X86::isZeroNode(Elt))
- ++NumZeros;
- else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
- NumZeros = std::min(NumZeros + 1, PreferredNum);
- else
- break;
- }
-
- return NumZeros;
-}
-
-/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
-/// correspond consecutively to elements from one of the vector operands,
-/// starting from its index OpIdx. Also tell OpNum which source vector operand.
-static
-bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
- unsigned MaskI, unsigned MaskE, unsigned OpIdx,
- unsigned NumElems, unsigned &OpNum) {
- bool SeenV1 = false;
- bool SeenV2 = false;
-
- for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
- int Idx = SVOp->getMaskElt(i);
- // Ignore undef indicies
- if (Idx < 0)
- continue;
-
- if (Idx < (int)NumElems)
- SeenV1 = true;
- else
- SeenV2 = true;
-
- // Only accept consecutive elements from the same vector
- if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
- return false;
- }
-
- OpNum = SeenV1 ? 0 : 1;
- return true;
-}
-
-/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
-/// logical left shift of a vector.
-static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
- bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
- unsigned NumElems =
- SVOp->getSimpleValueType(0).getVectorNumElements();
- unsigned NumZeros = getNumOfConsecutiveZeros(
- SVOp, NumElems, false /* check zeros from right */, DAG,
- SVOp->getMaskElt(0));
- unsigned OpSrc;
-
- if (!NumZeros)
- return false;
-
- // Considering the elements in the mask that are not consecutive zeros,
- // check if they consecutively come from only one of the source vectors.
- //
- // V1 = {X, A, B, C} 0
- // \ \ \ /
- // vector_shuffle V1, V2 <1, 2, 3, X>
- //
- if (!isShuffleMaskConsecutive(SVOp,
- 0, // Mask Start Index
- NumElems-NumZeros, // Mask End Index(exclusive)
- NumZeros, // Where to start looking in the src vector
- NumElems, // Number of elements in vector
- OpSrc)) // Which source operand ?
- return false;
-
- isLeft = false;
- ShAmt = NumZeros;
- ShVal = SVOp->getOperand(OpSrc);
- return true;
-}
-
-/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
-/// logical left shift of a vector.
-static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
- bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
- unsigned NumElems =
- SVOp->getSimpleValueType(0).getVectorNumElements();
- unsigned NumZeros = getNumOfConsecutiveZeros(
- SVOp, NumElems, true /* check zeros from left */, DAG,
- NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
- unsigned OpSrc;
-
- if (!NumZeros)
- return false;
-
- // Considering the elements in the mask that are not consecutive zeros,
- // check if they consecutively come from only one of the source vectors.
- //
- // 0 { A, B, X, X } = V2
- // / \ / /
- // vector_shuffle V1, V2 <X, X, 4, 5>
- //
- if (!isShuffleMaskConsecutive(SVOp,
- NumZeros, // Mask Start Index
- NumElems, // Mask End Index(exclusive)
- 0, // Where to start looking in the src vector
- NumElems, // Number of elements in vector
- OpSrc)) // Which source operand ?
- return false;
-
- isLeft = true;
- ShAmt = NumZeros;
- ShVal = SVOp->getOperand(OpSrc);
- return true;
-}
-
-/// isVectorShift - Returns true if the shuffle can be implemented as a
-/// logical left or right shift of a vector.
-static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
- bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
- // Although the logic below support any bitwidth size, there are no
- // shift instructions which handle more than 128-bit vectors.
- if (!SVOp->getSimpleValueType(0).is128BitVector())
- return false;
-
- if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
- isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
- return true;
-
- return false;
-}
-
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
@@ -5749,6 +4555,29 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
SDLoc dl(Op);
SDValue V;
bool First = true;
+
+ // SSE4.1 - use PINSRB to insert each byte directly.
+ if (Subtarget->hasSSE41()) {
+ for (unsigned i = 0; i < 16; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v16i8);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v16i8, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i, dl));
+ }
+ }
+
+ return V;
+ }
+
+ // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
if (ThisIsNonZero && First) {
@@ -5769,7 +4598,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
if (ThisIsNonZero) {
ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
- ThisElt, DAG.getConstant(8, MVT::i8));
+ ThisElt, DAG.getConstant(8, dl, MVT::i8));
if (LastIsNonZero)
ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
} else
@@ -5777,11 +4606,11 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
if (ThisElt.getNode())
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i/2));
+ DAG.getIntPtrConstant(i/2, dl));
}
}
- return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
+ return DAG.getBitcast(MVT::v16i8, V);
}
/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
@@ -5809,7 +4638,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
MVT::v8i16, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i));
+ DAG.getIntPtrConstant(i, dl));
}
}
@@ -5821,13 +4650,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
// Find all zeroable elements.
- bool Zeroable[4];
+ std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
SDValue Elt = Op->getOperand(i);
Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
}
- assert(std::count_if(&Zeroable[0], &Zeroable[4],
- [](bool M) { return !M; }) > 1 &&
+ assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
@@ -5913,31 +4741,28 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (Zeroable[i])
- ZMask |= 1 << i;
+ unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
- SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
- DAG.getIntPtrConstant(InsertPSMask));
- return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
+ SDLoc DL(Op);
+ SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getIntPtrConstant(InsertPSMask, DL));
+ return DAG.getBitcast(VT, Result);
}
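For reference, the INSERTPS immediate assembled above packs three fields: bits [7:6] select the source element, bits [5:4] the destination slot, and bits [3:0] the zero mask. A minimal standalone sketch of that packing (the helper name is illustrative only, not something defined in this file):

#include <cassert>
#include <cstdint>

// Mirrors the "EltMaskIdx << 6 | EltIdx << 4 | ZMask" packing used above.
static uint8_t encodeInsertPSImm(unsigned SrcElt, unsigned DstElt,
                                 unsigned ZeroMask) {
  assert(SrcElt < 4 && DstElt < 4 && ZeroMask < 16 && "field out of range");
  return static_cast<uint8_t>(SrcElt << 6 | DstElt << 4 | ZeroMask);
}

// e.g. take element 2 of the source, write it into slot 1, and zero slot 3:
// encodeInsertPSImm(2, 1, 0x8) == 0x98.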
-/// getVShift - Return a vector logical shift node.
-///
+/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, SDLoc dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
- EVT ShVT = MVT::v2i64;
+ MVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
- SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(Opc, dl, ShVT, SrcOp,
- DAG.getConstant(NumBits,
- TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+ SrcOp = DAG.getBitcast(ShVT, SrcOp);
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+ assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+ SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
+ return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
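For reference, VSHLDQ/VSRLDQ shift the whole 128-bit register by a byte count, which is why the shift amount above is NumBits / 8. A standalone sketch of the left form on a 16-byte little-endian register (the helper name is illustrative only):

#include <cstdint>
#include <cstring>

// Shift a 128-bit value left by NumBits (a multiple of 8), byte-wise: result
// byte i takes input byte i - NumBits/8, with zeros shifted in at the bottom.
static void wholeRegisterShiftLeft(uint8_t Reg[16], unsigned NumBits) {
  unsigned Bytes = NumBits / 8;
  uint8_t Tmp[16] = {};
  if (Bytes < 16)
    std::memcpy(Tmp + Bytes, Reg, 16 - Bytes);
  std::memcpy(Reg, Tmp, sizeof(Tmp));
}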
static SDValue
@@ -5992,9 +4817,11 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
if ((Offset % RequiredAlign) & 3)
return SDValue();
int64_t StartOffset = Offset & ~(RequiredAlign-1);
- if (StartOffset)
- Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
- Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
+ if (StartOffset) {
+ SDLoc DL(Ptr);
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
+ }
int EltNo = (Offset - StartOffset) >> 2;
unsigned NumElems = VT.getVectorNumElements();
@@ -6004,9 +4831,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, false, 0);
- SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != NumElems; ++i)
- Mask.push_back(EltNo);
+ SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
}
@@ -6014,19 +4839,18 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
return SDValue();
}
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
-static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
- EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
LoadSDNode *LDBase = nullptr;
@@ -6037,7 +4861,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
// non-consecutive, bail out.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Elts[i];
-
+ // Look through a bitcast.
+ if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+ Elt = Elt.getOperand(0);
if (!Elt.getNode() ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return SDValue();
@@ -6052,7 +4878,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+ EVT LdVT = Elt.getValueType();
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+ return SDValue();
+ if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
return SDValue();
LastLoadedElt = i;
}
@@ -6061,6 +4892,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ EVT EltVT = LDBase->getValueType(0);
+ // Ensure that the input vector size for the merged loads matches the
+ // cumulative size of the input elements.
+ if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+ return SDValue();
if (isAfterLegalize &&
!DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
@@ -6087,6 +4924,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
  //TODO: The code below fires only for loading the low v2i32 / v2f32
//of a v4i32 / v4f32. It's probably worth generalizing.
+ EVT EltVT = VT.getVectorElementType();
if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
@@ -6109,7 +4947,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
SDValue(ResNode.getNode(), 1));
}
- return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
+ return DAG.getBitcast(VT, ResNode);
}
return SDValue();
}
@@ -6212,8 +5050,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
const Function *F = DAG.getMachineFunction().getFunction();
- bool OptForSize = F->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -6377,95 +5214,117 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
unsigned Idx = InsertIndices[i];
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
- DAG.getIntPtrConstant(Idx));
+ DAG.getIntPtrConstant(Idx, DL));
}
return NV;
}
+static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
+ assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
+ Op.getScalarValueSizeInBits() == 1 &&
+ "Can not convert non-constant vector");
+ uint64_t Immediate = 0;
+ for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+ SDValue In = Op.getOperand(idx);
+ if (In.getOpcode() != ISD::UNDEF)
+ Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ }
+ SDLoc dl(Op);
+ MVT VT =
+ MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
+ return DAG.getConstant(Immediate, dl, VT);
+}
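In other words, the conversion above packs a constant i1 vector into an integer immediate, one bit per element. A standalone sketch, assuming the elements have already been reduced to plain 0/1 values (the helper name is illustrative only):

#include <cstdint>
#include <vector>

// OR each element's value into bit position 'Idx', exactly as the lowering
// above does; undef elements would simply be skipped.
static uint64_t packI1Vector(const std::vector<int> &Bits) {
  uint64_t Immediate = 0;
  for (size_t Idx = 0; Idx < Bits.size(); ++Idx)
    if (Bits[Idx])
      Immediate |= uint64_t(1) << Idx;
  return Immediate;
}

// e.g. packI1Vector({1, 0, 1, 1, 0, 0, 0, 0}) == 0xD for a v8i1 input.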
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
- assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
+ assert((VT.getVectorElementType() == MVT::i1) &&
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
- SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+ SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1);
SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
if (ISD::isBuildVectorAllOnes(Op.getNode())) {
- SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
+ SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1);
SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
- bool AllContants = true;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
+ if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, Imm);
+ SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ // Vector has one or more non-const elements
uint64_t Immediate = 0;
- int NonConstIdx = -1;
+ SmallVector<unsigned, 16> NonConstIdx;
bool IsSplat = true;
- unsigned NumNonConsts = 0;
- unsigned NumConsts = 0;
+ bool HasConstElts = false;
+ int SplatIdx = -1;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (In.getOpcode() == ISD::UNDEF)
continue;
- if (!isa<ConstantSDNode>(In)) {
- AllContants = false;
- NonConstIdx = idx;
- NumNonConsts++;
- } else {
- NumConsts++;
- if (cast<ConstantSDNode>(In)->getZExtValue())
- Immediate |= (1ULL << idx);
+ if (!isa<ConstantSDNode>(In))
+ NonConstIdx.push_back(idx);
+ else {
+ Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ HasConstElts = true;
}
- if (In != Op.getOperand(0))
+ if (SplatIdx == -1)
+ SplatIdx = idx;
+ else if (In != Op.getOperand(SplatIdx))
IsSplat = false;
}
- if (AllContants) {
- SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
- DAG.getConstant(Immediate, MVT::i16));
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
- DAG.getIntPtrConstant(0));
- }
-
- if (NumNonConsts == 1 && NonConstIdx != 0) {
- SDValue DstVec;
- if (NumConsts) {
- SDValue VecAsImm = DAG.getConstant(Immediate,
- MVT::getIntegerVT(VT.getSizeInBits()));
- DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
- }
- else
- DstVec = DAG.getUNDEF(VT);
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
- Op.getOperand(NonConstIdx),
- DAG.getIntPtrConstant(NonConstIdx));
- }
- if (!IsSplat && (NonConstIdx != 0))
- llvm_unreachable("Unsupported BUILD_VECTOR operation");
- MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
- SDValue Select;
+  // For splats, use "(select i1 splat_elt, all-ones, all-zeroes)".
if (IsSplat)
- Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
- DAG.getConstant(-1, SelectVT),
- DAG.getConstant(0, SelectVT));
+ return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
+ DAG.getConstant(1, dl, VT),
+ DAG.getConstant(0, dl, VT));
+
+ // insert elements one by one
+ SDValue DstVec;
+ SDValue Imm;
+ if (Immediate) {
+ MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
+ Imm = DAG.getConstant(Immediate, dl, ImmVT);
+ }
+ else if (HasConstElts)
+ Imm = DAG.getConstant(0, dl, VT);
else
- Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
- DAG.getConstant((Immediate | 1), SelectVT),
- DAG.getConstant(Immediate, SelectVT));
- return DAG.getNode(ISD::BITCAST, dl, VT, Select);
+ Imm = DAG.getUNDEF(VT);
+ if (Imm.getValueSizeInBits() == VT.getSizeInBits())
+ DstVec = DAG.getBitcast(VT, Imm);
+ else {
+ SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
+ DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ for (unsigned i = 0; i < NonConstIdx.size(); ++i) {
+ unsigned InsertIdx = NonConstIdx[i];
+ DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+ Op.getOperand(InsertIdx),
+ DAG.getIntPtrConstant(InsertIdx, dl));
+ }
+ return DstVec;
}
/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
-/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
@@ -6528,11 +5387,17 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
if (i * 2 < NumElts) {
- if (V0.getOpcode() == ISD::UNDEF)
+ if (V0.getOpcode() == ISD::UNDEF) {
V0 = Op0.getOperand(0);
+ if (V0.getValueType() != VT)
+ return false;
+ }
} else {
- if (V1.getOpcode() == ISD::UNDEF)
+ if (V1.getOpcode() == ISD::UNDEF) {
V1 = Op0.getOperand(0);
+ if (V1.getValueType() != VT)
+ return false;
+ }
if (i * 2 == NumElts)
ExpectedVExtractIdx = BaseIdx;
}
@@ -6556,7 +5421,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
-/// This is a helper function of PerformBUILD_VECTORCombine.
+/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
@@ -6622,12 +5487,16 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
-/// \brief Try to fold a build_vector that performs an 'addsub' into the
-/// sequence of 'vadd + vsub + blendi'.
-static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- SDLoc DL(BV);
+/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
+/// node.
+static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
EVT VT = BV->getValueType(0);
+ if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
+ (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
+ return SDValue();
+
+ SDLoc DL(BV);
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
@@ -6644,7 +5513,7 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
bool AddFound = false;
bool SubFound = false;
- for (unsigned i = 0, e = NumElts; i != e; i++) {
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
@@ -6682,10 +5551,16 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
SubFound = true;
// Update InVec0 and InVec1.
- if (InVec0.getOpcode() == ISD::UNDEF)
+ if (InVec0.getOpcode() == ISD::UNDEF) {
InVec0 = Op0.getOperand(0);
- if (InVec1.getOpcode() == ISD::UNDEF)
+ if (InVec0.getValueType() != VT)
+ return SDValue();
+ }
+ if (InVec1.getOpcode() == ISD::UNDEF) {
InVec1 = Op1.getOperand(0);
+ if (InVec1.getValueType() != VT)
+ return SDValue();
+ }
// Make sure that operands in input to each add/sub node always
// come from a same pair of vectors.
@@ -6715,23 +5590,12 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
+/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
+static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ EVT VT = BV->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
- BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
- SDValue InVec0, InVec1;
-
- // Try to match an ADDSUB.
- if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
- (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
- SDValue Value = matchAddSub(BV, DAG, Subtarget);
- if (Value.getNode())
- return Value;
- }
-
- // Try to match horizontal ADD/SUB.
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
unsigned Half = NumElts/2;
@@ -6750,6 +5614,8 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
return SDValue();
+ SDLoc DL(BV);
+ SDValue InVec0, InVec1;
if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
// Try to match an SSE3 float HADD/HSUB.
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
@@ -6894,8 +5760,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
}
- SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
- if (Broadcast.getNode())
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+ if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+ return AddSub;
+ if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+ return HorizontalOp;
+ if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
return Broadcast;
unsigned EVTBits = ExtVT.getSizeInBits();
@@ -6941,32 +5811,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
EVT VecVT = MVT::v4i32;
- unsigned VecElts = 4;
// Truncate the value (which may itself be a constant) to i32, and
// convert it to a vector with movd (S2V+shuffle to zero extend).
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
-
- // If using the new shuffle lowering, just directly insert this.
- if (ExperimentalVectorShuffleLowering)
- return DAG.getNode(
- ISD::BITCAST, dl, VT,
- getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
-
- Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
-
- // Now we have our 32-bit value zero extended in the low element of
- // a vector. If Idx != 0, swizzle it into place.
- if (Idx != 0) {
- SmallVector<int, 4> Mask;
- Mask.push_back(Idx);
- for (unsigned i = 1; i != VecElts; ++i)
- Mask.push_back(i);
- Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
- &Mask[0]);
- }
- return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+ return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
+ Item, Idx * 2, true, Subtarget, DAG));
}
}
@@ -6980,28 +5831,39 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
- if (VT.is256BitVector() || VT.is512BitVector()) {
+ if (VT.is512BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
- Item, DAG.getIntPtrConstant(0));
+ Item, DAG.getIntPtrConstant(0, dl));
}
- assert(VT.is128BitVector() && "Expected an SSE value type!");
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
if (VT.is256BitVector()) {
- SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
- Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ if (Subtarget->hasAVX()) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+ } else {
+ // Without AVX, we need to extend to a 128-bit vector and then
+ // insert into the 256-bit vector.
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+ SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
+ Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+ }
} else {
assert(VT.is128BitVector() && "Expected an SSE value type!");
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
- return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+ return DAG.getBitcast(VT, Item);
}
}
@@ -7026,17 +5888,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
-
- // If using the new shuffle lowering, just directly insert this.
- if (ExperimentalVectorShuffleLowering)
- return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
-
- // Turn it into a shuffle of zero and zero-extended scalar to vector.
- Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
- SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i != NumElems; ++i)
- MaskVec.push_back(i == Idx ? 0 : 1);
- return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+ return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
}
}
@@ -7064,9 +5916,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// elements, otherwise build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.is256BitVector() || VT.is512BitVector()) {
- SmallVector<SDValue, 64> V;
- for (unsigned i = 0; i != NumElems; ++i)
- V.push_back(Op.getOperand(i));
+ SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
// Check for a build vector of consecutive loads.
if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
@@ -7099,24 +5949,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
- if (EVTBits == 8 && NumElems == 16) {
- SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
- Subtarget, *this);
- if (V.getNode()) return V;
- }
+ if (EVTBits == 8 && NumElems == 16)
+ if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+ Subtarget, *this))
+ return V;
- if (EVTBits == 16 && NumElems == 8) {
- SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
- Subtarget, *this);
- if (V.getNode()) return V;
- }
+ if (EVTBits == 16 && NumElems == 8)
+ if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+ Subtarget, *this))
+ return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
- if (EVTBits == 32 && NumElems == 4) {
- SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
- if (V.getNode())
+ if (EVTBits == 32 && NumElems == 4)
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
return V;
- }
// If element VT is == 32 bits, turn it into a number of shuffles.
SmallVector<SDValue, 8> V(NumElems);
@@ -7164,17 +6010,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
V[i] = Op.getOperand(i);
// Check for elements which are consecutive loads.
- SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
- if (LD.getNode())
+ if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
return LD;
// Check for a build vector from mostly shuffle plus few inserting.
- SDValue Sh = buildFromShuffleMostly(Op, DAG);
- if (Sh.getNode())
+ if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41()) {
+ if (Subtarget->hasSSE41()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -7184,7 +6028,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
- Op.getOperand(i), DAG.getIntPtrConstant(i));
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
return Result;
}
@@ -7236,7 +6080,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
- if(ResVT.is256BitVector())
+ if (ResVT.is256BitVector())
return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
if (Op.getNumOperands() == 4) {
@@ -7250,8 +6094,64 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
-static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG & DAG) {
+ SDLoc dl(Op);
+ MVT ResVT = Op.getSimpleValueType();
+ unsigned NumOfOperands = Op.getNumOperands();
+
+ assert(isPowerOf2_32(NumOfOperands) &&
+ "Unexpected number of operands in CONCAT_VECTORS");
+
+ if (NumOfOperands > 2) {
+ MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
+ ResVT.getVectorNumElements()/2);
+ SmallVector<SDValue, 2> Ops;
+ for (unsigned i = 0; i < NumOfOperands/2; i++)
+ Ops.push_back(Op.getOperand(i));
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ Ops.clear();
+ for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
+ Ops.push_back(Op.getOperand(i));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+ }
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ if (IsZeroV1 && IsZeroV2)
+ return getZeroVector(ResVT, Subtarget, DAG, dl);
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+ SDValue Undef = DAG.getUNDEF(ResVT);
+ unsigned NumElems = ResVT.getVectorNumElements();
+ SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8);
+
+ V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
+ V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
+ if (IsZeroV1)
+ return V2;
+
+ V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
+ // Zero the upper bits of V1
+ V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
+ V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
+ if (IsZeroV2)
+ return V1;
+ return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
+}
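The vXi1 concat above is a mask-register dance: keep the low half of V1, shift V2 into the upper half, and OR the two together. A standalone sketch on plain integer masks, where the explicit masking plays the role of the VSHLI/VSRLI pair (the helper name is illustrative only):

#include <cstdint>

// Model the result mask as the low 'NumElems' bits of a uint64_t.
static uint64_t concatI1Masks(uint64_t V1, uint64_t V2, unsigned NumElems) {
  unsigned Half = NumElems / 2;
  uint64_t HalfMask = (1ULL << Half) - 1;
  uint64_t Lo = V1 & HalfMask;           // zero the upper bits of V1
  uint64_t Hi = (V2 & HalfMask) << Half; // place V2 in the upper half
  return Lo | Hi;
}

// e.g. concatI1Masks(0xA, 0x1, 8) == 0x1A for two v4i1 halves.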
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT.getVectorElementType() == MVT::i1)
+ return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
+
assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
(VT.is512BitVector() && (Op.getNumOperands() == 2 ||
Op.getNumOperands() == 4)));
@@ -7354,38 +6254,76 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return true;
}
-// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
-// 2013 will allow us to use it as a non-type template parameter.
-namespace {
-
-/// \brief Implementation of the \c isShuffleEquivalent variadic functor.
+/// \brief Test whether a shuffle mask is equivalent within each 256-bit lane.
///
-/// See its documentation for details.
-bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
- if (Mask.size() != Args.size())
- return false;
- for (int i = 0, e = Mask.size(); i < e; ++i) {
- assert(*Args[i] >= 0 && "Arguments must be positive integers!");
- if (Mask[i] != -1 && Mask[i] != *Args[i])
+/// This checks a shuffle mask to see if it is performing the same
+/// 256-bit lane-relative shuffle in each 256-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 256-bit shuffles as it will contain
+/// entries from both V1 and V2 inputs to the wider mask.
+static bool
+is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ int LaneSize = 256 / VT.getScalarSizeInBits();
+ RepeatedMask.resize(LaneSize, -1);
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+ // This entry crosses lanes, so there is no way to model this shuffle.
+ return false;
+
+ // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+ if (RepeatedMask[i % LaneSize] == -1)
+ // This is the first non-undef entry in this slot of a 256-bit lane.
+ RepeatedMask[i % LaneSize] =
+ Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+ else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+ // Found a mismatch with the repeated mask.
return false;
}
return true;
}
-} // namespace
-
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
-/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
+///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
-static const VariadicFunction1<
- bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
+static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ ArrayRef<int> ExpectedMask) {
+ if (Mask.size() != ExpectedMask.size())
+ return false;
+
+ int Size = Mask.size();
+
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+ auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
+ auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+ auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+ if (!MaskBV || !ExpectedBV ||
+ MaskBV->getOperand(Mask[i] % Size) !=
+ ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ return false;
+ }
+
+ return true;
+}
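Setting aside the build_vector look-through, the test above reduces to an element-wise comparison where -1 (undef) matches anything. A standalone sketch of just that core (the helper name is illustrative only):

#include <vector>

static bool masksMatch(const std::vector<int> &Mask,
                       const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i] != -1 && Mask[i] != Expected[i])
      return false;
  return true;
}

// e.g. masksMatch({-1, 2, 1, -1}, {3, 2, 1, 0}) is true, so such a shuffle can
// be treated as the reversal pattern {3, 2, 1, 0}.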
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
@@ -7395,7 +6333,7 @@ static const VariadicFunction1<
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
-static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
SelectionDAG &DAG) {
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
@@ -7408,7 +6346,54 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
- return DAG.getConstant(Imm, MVT::i8);
+ return DAG.getConstant(Imm, DL, MVT::i8);
+}
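This is the familiar 2-bits-per-lane imm8 used by SHUFPS/PSHUFD-style instructions; undef lanes default to the identity so the input lane survives. A standalone sketch of the encoding (the helper name is illustrative only):

#include <cstdint>

static uint8_t encodeShuffleImm8(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= static_cast<uint8_t>((Mask[i] == -1 ? i : Mask[i]) << (2 * i));
  return Imm;
}

// e.g. the reversal mask {3, 2, 1, 0} encodes as 0x1B (0b00011011).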
+
+/// \brief Get an 8-bit shuffle immediate, 1 bit per lane, for a mask.
+///
+/// This helper function produces an 8-bit shuffle immediate corresponding to
+/// the ubiquitous shuffle encoding scheme used in x86 instructions for
+/// shuffling 8 lanes.
+static SDValue get1bitLaneShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+ SelectionDAG &DAG) {
+ assert(Mask.size() <= 8 &&
+ "Up to 8 elts may be in Imm8 1-bit lane shuffle mask");
+ unsigned Imm = 0;
+ for (unsigned i = 0; i < Mask.size(); ++i)
+ if (Mask[i] >= 0)
+ Imm |= (Mask[i] % 2) << i;
+ return DAG.getConstant(Imm, DL, MVT::i8);
+}
+
+/// \brief Try to emit a blend instruction for a shuffle using bit math.
+///
+/// This is used as a fallback approach when first class blend instructions are
+/// unavailable. Currently it is only suitable for integer vectors, but could
+/// be generalized for floating point vectors if desirable.
+static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT.isInteger() && "Only supports integer vector types!");
+ MVT EltVT = VT.getScalarType();
+ int NumEltBits = EltVT.getSizeInBits();
+ SDValue Zero = DAG.getConstant(0, DL, EltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
+ EltVT);
+ SmallVector<SDValue, 16> MaskOps;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
+ return SDValue(); // Shuffled input!
+ MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
+ }
+
+ SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
+ V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
+  // The AND-NOT is done on a 64-bit element vector type, so V2 and the mask
+  // have to be bitcast around it.
+ MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+ DAG.getBitcast(MaskVT, V1Mask),
+ DAG.getBitcast(MaskVT, V2)));
+ return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
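The bit-math blend above is simply (V1 & M) | (~M & V2), with M holding all-ones in each lane taken from V1. A standalone sketch on four i32 lanes (the helper name is illustrative only):

#include <cstdint>

static void bitBlend(const uint32_t A[4], const uint32_t B[4],
                     const bool TakeA[4], uint32_t Out[4]) {
  for (int i = 0; i < 4; ++i) {
    uint32_t M = TakeA[i] ? 0xFFFFFFFFu : 0u; // per-lane all-ones/all-zeros mask
    Out[i] = (A[i] & M) | (B[i] & ~M);        // A & M  |  ANDNOT(M, B)
  }
}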
/// \brief Try to emit a blend instruction for a shuffle.
@@ -7421,7 +6406,6 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
-
unsigned BlendMask = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= Size) {
@@ -7439,7 +6423,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
case MVT::v4f64:
case MVT::v8f32:
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
+ DAG.getConstant(BlendMask, DL, MVT::i8));
case MVT::v4i64:
case MVT::v8i32:
@@ -7459,11 +6443,11 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
BlendMask |= 1u << (i * Scale + j);
MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
- V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8)));
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
}
// FALLTHROUGH
case MVT::v8i16: {
@@ -7476,11 +6460,11 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
for (int j = 0; j < Scale; ++j)
BlendMask |= 1u << (i * Scale + j);
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8)));
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = DAG.getBitcast(MVT::v8i16, V2);
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, DL, MVT::i8)));
}
case MVT::v16i16: {
@@ -7494,15 +6478,21 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
if (RepeatedMask[i] >= 16)
BlendMask |= 1u << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
+ DAG.getConstant(BlendMask, DL, MVT::i8));
}
}
// FALLTHROUGH
+ case MVT::v16i8:
case MVT::v32i8: {
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
+
// Scale the blend by the number of bytes per element.
- int Scale = VT.getScalarSizeInBits() / 8;
- assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+ int Scale = VT.getScalarSizeInBits() / 8;
+
+ // This form of blend is always done on bytes. Compute the byte vector
+ // type.
+ MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
@@ -7515,20 +6505,20 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
- SDValue VSELECTMask[32];
+ SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
- VSELECTMask[Scale * i + j] =
+ VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
- : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
+ : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
+ MVT::i8));
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
- return DAG.getNode(
- ISD::BITCAST, DL, VT,
- DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
- V1, V2));
+ V1 = DAG.getBitcast(BlendVT, V1);
+ V2 = DAG.getBitcast(BlendVT, V2);
+ return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ BlendVT, VSELECTMask),
+ V1, V2));
}
default:
@@ -7536,12 +6526,45 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
}
}
-/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
-/// unblended shuffles followed by an unshuffled blend.
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
+static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // We build up the blend mask while checking whether a blend is a viable way
+ // to reduce the shuffle.
+ SmallVector<int, 32> BlendMask(Mask.size(), -1);
+ SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+ if (BlendMask[Mask[i] % Size] == -1)
+ BlendMask[Mask[i] % Size] = Mask[i];
+ else if (BlendMask[Mask[i] % Size] != Mask[i])
+ return SDValue(); // Can't blend in the needed input!
+
+ PermuteMask[i] = Mask[i] % Size;
+ }
+
+ SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
+/// \brief Generic routine to decompose a shuffle and blend into independent
+/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
-/// operations.
+/// operations. It will try to pick the best arrangement of shuffles and
+/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
SDValue V1,
SDValue V2,
@@ -7561,6 +6584,16 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
BlendMask[i] = i + Size;
}
+ // Try to lower with the simpler initial blend strategy unless one of the
+ // input shuffles would be a no-op. We prefer to shuffle inputs as the
+ // shuffle may be able to fold with a load or other benefit. However, when
+ // we'll have to do 2x as many shuffles in order to achieve this, blending
+ // first is a better strategy.
+ if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
+ if (SDValue BlendPerm =
+ lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ return BlendPerm;
+
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
@@ -7582,8 +6615,6 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-///
-/// Note that this only handles 128-bit vector widths currently.
static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
@@ -7591,6 +6622,10 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
// [-1, 12, 13, 14, -1, -1, 1, -1]
@@ -7600,44 +6635,52 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
// [-1, 4, 5, 6, -1, -1, -1, -1]
int Rotation = 0;
SDValue Lo, Hi;
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] == -1)
- continue;
- assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
+ for (int l = 0; l < NumElts; l += NumLaneElts) {
+ for (int i = 0; i < NumLaneElts; ++i) {
+ if (Mask[l + i] == -1)
+ continue;
+ assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
- // Based on the mod-Size value of this mask element determine where
- // a rotated vector would have started.
- int StartIdx = i - (Mask[i] % Size);
- if (StartIdx == 0)
- // The identity rotation isn't interesting, stop.
- return SDValue();
+ // Get the mod-Size index and lane correct it.
+ int LaneIdx = (Mask[l + i] % NumElts) - l;
+ // Make sure it was in this lane.
+ if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
+ return SDValue();
- // If we found the tail of a vector the rotation must be the missing
- // front. If we found the head of a vector, it must be how much of the head.
- int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
+ // Determine where a rotated vector would have started.
+ int StartIdx = i - LaneIdx;
+ if (StartIdx == 0)
+ // The identity rotation isn't interesting, stop.
+ return SDValue();
- if (Rotation == 0)
- Rotation = CandidateRotation;
- else if (Rotation != CandidateRotation)
- // The rotations don't match, so we can't match this mask.
- return SDValue();
+ // If we found the tail of a vector the rotation must be the missing
+ // front. If we found the head of a vector, it must be how much of the
+ // head.
+ int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
- // Compute which value this mask is pointing at.
- SDValue MaskV = Mask[i] < Size ? V1 : V2;
-
- // Compute which of the two target values this index should be assigned to.
- // This reflects whether the high elements are remaining or the low elements
- // are remaining.
- SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
-
- // Either set up this value if we've not encountered it before, or check
- // that it remains consistent.
- if (!TargetV)
- TargetV = MaskV;
- else if (TargetV != MaskV)
- // This may be a rotation, but it pulls from the inputs in some
- // unsupported interleaving.
- return SDValue();
+ if (Rotation == 0)
+ Rotation = CandidateRotation;
+ else if (Rotation != CandidateRotation)
+ // The rotations don't match, so we can't match this mask.
+ return SDValue();
+
+ // Compute which value this mask is pointing at.
+ SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
+
+ // Compute which of the two target values this index should be assigned
+ // to. This reflects whether the high elements are remaining or the low
+ // elements are remaining.
+ SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+ // Either set up this value if we've not encountered it before, or check
+ // that it remains consistent.
+ if (!TargetV)
+ TargetV = MaskV;
+ else if (TargetV != MaskV)
+ // This may be a rotation, but it pulls from the inputs in some
+ // unsupported interleaving.
+ return SDValue();
+ }
}
// Check that we successfully analyzed the mask, and normalize the results.
@@ -7648,40 +6691,41 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
else if (!Hi)
Hi = Lo;
- assert(VT.getSizeInBits() == 128 &&
- "Rotate-based lowering only supports 128-bit lowering!");
- assert(Mask.size() <= 16 &&
- "Can shuffle at most 16 bytes in a 128-bit vector!");
-
// The actual rotate instruction rotates bytes, so we need to scale the
- // rotation based on how many bytes are in the vector.
- int Scale = 16 / Mask.size();
+ // rotation based on how many bytes are in the vector lane.
+ int Scale = 16 / NumLaneElts;
- // SSSE3 targets can use the palignr instruction
+ // SSSE3 targets can use the palignr instruction.
if (Subtarget->hasSSSE3()) {
- // Cast the inputs to v16i8 to match PALIGNR.
- Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
- Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+ // Cast the inputs to i8 vector of correct length to match PALIGNR.
+ MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
+ Lo = DAG.getBitcast(AlignVT, Lo);
+ Hi = DAG.getBitcast(AlignVT, Hi);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
- DAG.getConstant(Rotation * Scale, MVT::i8)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
+ DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
}
+ assert(VT.getSizeInBits() == 128 &&
+ "Rotate-based lowering only supports 128-bit lowering!");
+ assert(Mask.size() <= 16 &&
+ "Can shuffle at most 16 bytes in a 128-bit vector!");
+
// Default SSE2 implementation
int LoByteShift = 16 - Rotation * Scale;
int HiByteShift = Rotation * Scale;
// Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
- Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
- Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
+ Lo = DAG.getBitcast(MVT::v2i64, Lo);
+ Hi = DAG.getBitcast(MVT::v2i64, Hi);
SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
- DAG.getConstant(8 * LoByteShift, MVT::i8));
+ DAG.getConstant(LoByteShift, DL, MVT::i8));
SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
- DAG.getConstant(8 * HiByteShift, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
+ DAG.getConstant(HiByteShift, DL, MVT::i8));
+ return DAG.getBitcast(VT,
+ DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
}
/// \brief Compute whether each element of a shuffle is zeroable.
@@ -7696,6 +6740,11 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue V1, SDValue V2) {
SmallBitVector Zeroable(Mask.size(), false);
+ while (V1.getOpcode() == ISD::BITCAST)
+ V1 = V1->getOperand(0);
+ while (V2.getOpcode() == ISD::BITCAST)
+ V2 = V2->getOperand(0);
+
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
@@ -7707,10 +6756,10 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
continue;
}
- // If this is an index into a build_vector node, dig out the input value and
- // use it.
+ // If this is an index into a build_vector node (which has the same number
+ // of elements), dig out the input value and use it.
SDValue V = M < Size ? V1 : V2;
- if (V.getOpcode() != ISD::BUILD_VECTOR)
+ if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
continue;
SDValue Input = V.getOperand(M % Size);
@@ -7723,78 +6772,135 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
return Zeroable;
}
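
To make the zeroable analysis concrete, here is a small stand-alone sketch (not the LLVM implementation; computeZeroable and the per-operand zero-element vectors are illustrative stand-ins for the build_vector inspection above) that derives the same kind of bit vector from a shuffle mask:

#include <cstdio>
#include <vector>

// Illustrative sketch: Mask uses -1 for undef, [0, Size) for V1 and
// [Size, 2*Size) for V2, matching the conventions of the shuffle lowering.
static std::vector<bool>
computeZeroable(const std::vector<int> &Mask,
                const std::vector<bool> &V1ZeroElts,
                const std::vector<bool> &V2ZeroElts) {
  int Size = (int)Mask.size();
  std::vector<bool> Zeroable(Size, false);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0) {                 // Undef elements are trivially zeroable.
      Zeroable[i] = true;
      continue;
    }
    const std::vector<bool> &Src = M < Size ? V1ZeroElts : V2ZeroElts;
    Zeroable[i] = Src[M % Size]; // Zeroable if the source lane is known zero.
  }
  return Zeroable;
}

int main() {
  // v4i32-style example: V2 is an all-zeros vector, so any mask index >= 4 is
  // zeroable; index 1 is undef.
  std::vector<int> Mask = {0, -1, 4, 6};
  auto Z = computeZeroable(Mask, {false, false, false, false},
                           {true, true, true, true});
  for (bool B : Z)
    std::printf("%d ", (int)B);
  std::printf("\n"); // prints: 0 1 1 1
}
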
-/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
-///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
-/// byte-shift instructions. The mask must consist of a shifted sequential
-/// shuffle from one of the input vectors and zeroable elements for the
-/// remaining 'shifted in' elements.
+/// \brief Try to emit a bitmask instruction for a shuffle.
///
-/// Note that this only handles 128-bit vector widths currently.
-static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
- assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ MVT EltVT = VT.getScalarType();
+ int NumEltBits = EltVT.getSizeInBits();
+ MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+ SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
+ IntEltVT);
+ if (EltVT.isFloatingPoint()) {
+ Zero = DAG.getBitcast(EltVT, Zero);
+ AllOnes = DAG.getBitcast(EltVT, AllOnes);
+ }
+ SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Zeroable[i])
+ continue;
+ if (Mask[i] % Size != i)
+ return SDValue(); // Not a blend.
+ if (!V)
+ V = Mask[i] < Size ? V1 : V2;
+ else if (V != (Mask[i] < Size ? V1 : V2))
+ return SDValue(); // Can only let one input through the mask.
+
+ VMaskOps[i] = AllOnes;
+ }
+ if (!V)
+ return SDValue(); // No non-zeroable elements!
+
+ SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+ V = DAG.getNode(VT.isFloatingPoint()
+ ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+ DL, VT, V, VMask);
+ return V;
+}
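
The bit-mask lowering above succeeds only when every non-zeroable lane keeps its position and draws from a single input, at which point the blend degenerates into an AND with a vector of all-ones/all-zero elements. A rough, SelectionDAG-free sketch of that matching (matchAsBitMask is a hypothetical name):

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative: returns true and fills AndMask when the shuffle can be
// modelled as "chosen input & AndMask" (32-bit integer elements assumed).
static bool matchAsBitMask(const std::vector<int> &Mask,
                           const std::vector<bool> &Zeroable,
                           std::vector<uint32_t> &AndMask, int &WhichInput) {
  int Size = (int)Mask.size();
  AndMask.assign(Size, 0);
  WhichInput = -1; // -1 = not chosen yet, 0 = V1, 1 = V2
  for (int i = 0; i < Size; ++i) {
    if (Zeroable[i])
      continue;                 // Lane is forced to zero by the mask.
    if (Mask[i] % Size != i)
      return false;             // Element moves: not a plain bit mask.
    int Input = Mask[i] < Size ? 0 : 1;
    if (WhichInput == -1)
      WhichInput = Input;
    else if (WhichInput != Input)
      return false;             // Would need bits from both inputs.
    AndMask[i] = 0xFFFFFFFFu;   // Keep this lane.
  }
  return WhichInput != -1;
}

int main() {
  // Mask {0, zz, 2, zz}: keep lanes 0 and 2 of V1, zero the others.
  std::vector<int> Mask = {0, 5, 2, 7};
  std::vector<bool> Zeroable = {false, true, false, true};
  std::vector<uint32_t> AndMask;
  int Input;
  if (matchAsBitMask(Mask, Zeroable, AndMask, Input)) {
    std::printf("AND V%d with element mask:", Input + 1);
    for (uint32_t M : AndMask)
      std::printf(" %s", M ? "all-ones" : "zero");
    std::printf("\n"); // AND V1 with element mask: all-ones zero all-ones zero
  }
}
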
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
+/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
+/// matches elements from one of the input vectors shuffled to the left or
+/// right with zeroable elements 'shifted in'. It handles both the strictly
+/// bit-wise element shifts and the byte shift across an entire 128-bit double
+/// quad word lane.
+///
+/// PSLL : (little-endian) left bit shift.
+/// [ zz, 0, zz, 2 ]
+/// [ -1, 4, zz, -1 ]
+/// PSRL : (little-endian) right bit shift.
+/// [ 1, zz, 3, zz]
+/// [ -1, -1, 7, zz]
+/// PSLLDQ : (little-endian) left byte shift
+/// [ zz, 0, 1, 2, 3, 4, 5, 6]
+/// [ zz, zz, -1, -1, 2, 3, 4, -1]
+/// [ zz, zz, zz, zz, zz, zz, -1, 1]
+/// PSRLDQ : (little-endian) right byte shift
+/// [ 5, 6, 7, zz, zz, zz, zz, zz]
+/// [ -1, 5, 6, 7, zz, zz, zz, zz]
+/// [ 1, 2, -1, -1, -1, -1, zz, zz]
+static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Size = Mask.size();
- int Scale = 16 / Size;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
- for (int Shift = 1; Shift < Size; Shift++) {
- int ByteShift = Shift * Scale;
-
- // PSRLDQ : (little-endian) right byte shift
- // [ 5, 6, 7, zz, zz, zz, zz, zz]
- // [ -1, 5, 6, 7, zz, zz, zz, zz]
- // [ 1, 2, -1, -1, -1, -1, zz, zz]
- bool ZeroableRight = true;
- for (int i = Size - Shift; i < Size; i++) {
- ZeroableRight &= Zeroable[i];
- }
-
- if (ZeroableRight) {
- bool ValidShiftRight1 =
- isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
- bool ValidShiftRight2 =
- isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
-
- if (ValidShiftRight1 || ValidShiftRight2) {
- // Cast the inputs to v2i64 to match PSRLDQ.
- SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
- SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
- DAG.getConstant(ByteShift * 8, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
- }
- }
+ auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+ for (int i = 0; i < Size; i += Scale)
+ for (int j = 0; j < Shift; ++j)
+ if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+ return false;
- // PSLLDQ : (little-endian) left byte shift
- // [ zz, 0, 1, 2, 3, 4, 5, 6]
- // [ zz, zz, -1, -1, 2, 3, 4, -1]
- // [ zz, zz, zz, zz, zz, zz, -1, 1]
- bool ZeroableLeft = true;
- for (int i = 0; i < Shift; i++) {
- ZeroableLeft &= Zeroable[i];
- }
-
- if (ZeroableLeft) {
- bool ValidShiftLeft1 =
- isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
- bool ValidShiftLeft2 =
- isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
-
- if (ValidShiftLeft1 || ValidShiftLeft2) {
- // Cast the inputs to v2i64 to match PSLLDQ.
- SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
- SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
- DAG.getConstant(ByteShift * 8, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
- }
+ return true;
+ };
+
+ auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = Left ? i + Shift : i;
+ unsigned Low = Left ? i : i + Shift;
+ unsigned Len = Scale - Shift;
+ if (!isSequentialOrUndefInRange(Mask, Pos, Len,
+ Low + (V == V1 ? 0 : Size)))
+ return SDValue();
}
- }
+ int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
+ bool ByteShift = ShiftEltBits > 64;
+ unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+ : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+ int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
+
+ // Normalize the scale for byte shifts to still produce an i64 element
+ // type.
+ Scale = ByteShift ? Scale / 2 : Scale;
+
+ // We need to round trip through the appropriate type for the shift.
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+ MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+ V = DAG.getBitcast(ShiftVT, V);
+
+ V = DAG.getNode(OpCode, DL, ShiftVT, V,
+ DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, V);
+ };
+
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted in elements are all zeroable.
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Scale, Left))
+ for (SDValue V : {V1, V2})
+ if (SDValue Match = MatchShift(Shift, Scale, Left, V))
+ return Match;
+
+ // no match
return SDValue();
}
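
The scale/shift search above is easier to follow with a concrete matcher. The sketch below (matchShift is a hypothetical, host-side re-statement for a single input vector, not the LLVM routine) finds an element group size and shift amount for which the mask is a shifted sequence with zeroable fill:

#include <cstdio>
#include <vector>

struct ShiftMatch { int Scale; int Shift; bool Left; };

// Mask indices are in [0, Size) or -1 for undef; Zeroable marks lanes the
// shuffle is allowed to fill with zeros.
static bool matchShift(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, int EltBits,
                       ShiftMatch &Out) {
  int Size = (int)Mask.size();
  for (int Scale = 2; Scale * EltBits <= 128; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false}) {
        bool OK = true;
        for (int i = 0; i < Size && OK; i += Scale)
          for (int j = 0; j < Scale && OK; ++j) {
            int M = Mask[i + j];
            bool MustBeZero = Left ? j < Shift : j >= Scale - Shift;
            if (MustBeZero)
              OK = Zeroable[i + j] || M < 0;
            else
              OK = M < 0 || M == (Left ? i + j - Shift : i + j + Shift);
          }
        if (OK) {
          Out = {Scale, Shift, Left};
          return true;
        }
      }
  return false;
}

int main() {
  // v8i16 mask [zz, 0, zz, 2, zz, 4, zz, 6]: a left shift by one 16-bit
  // element within each 32-bit group, i.e. a PSLLD by 16.
  std::vector<int> Mask = {-1, 0, -1, 2, -1, 4, -1, 6};
  std::vector<bool> Zero = {true, false, true, false, true, false, true, false};
  ShiftMatch M;
  if (matchShift(Mask, Zero, /*EltBits=*/16, M))
    std::printf("scale=%d shift=%d %s: shift by %d bits in %d-bit elements\n",
                M.Scale, M.Shift, M.Left ? "left" : "right", M.Shift * 16,
                M.Scale * 16);
}
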
@@ -7804,10 +6910,11 @@ static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
+ SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
- int EltBits = VT.getSizeInBits() / NumElements;
+ int NumElements = VT.getVectorNumElements();
+ int EltBits = VT.getScalarSizeInBits();
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
@@ -7815,35 +6922,30 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget->hasSSE41()) {
- MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
- InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+ return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
}
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) {
int PSHUFDMask[4] = {0, -1, 1, -1};
- return DAG.getNode(
- ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
- getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
if (AnyExt && EltBits == 16 && Scale > 2) {
int PSHUFDMask[4] = {0, -1, 0, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
- getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
+ DAG.getBitcast(MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
int PSHUFHWMask[4] = {1, -1, -1, -1};
- return DAG.getNode(
- ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
- DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
- getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v8i16, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
}
// If this would require more than 2 unpack instructions to expand, use
@@ -7854,12 +6956,12 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i)
PSHUFBMask[i] =
- DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
- InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
- DAG.getNode(ISD::BUILD_VECTOR, DL,
- MVT::v16i8, PSHUFBMask)));
+ DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8);
+ InputV = DAG.getBitcast(MVT::v16i8, InputV);
+ return DAG.getBitcast(VT,
+ DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ MVT::v16i8, PSHUFBMask)));
}
// Otherwise emit a sequence of unpacks.
@@ -7867,16 +6969,16 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
- InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+ InputV = DAG.getBitcast(InputVT, InputV);
InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
} while (Scale > 1);
- return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
+ return DAG.getBitcast(VT, InputV);
}
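
The closing unpack loop widens elements by repeatedly interleaving the low half of the input with zeros. A plain-array sketch of one such round (what a single UNPCKL against a zero vector achieves; unpackLoWithZero is an illustrative name) makes the doubling obvious:

#include <cstdint>
#include <cstdio>
#include <vector>

// One zero-extension step at a given element width: interleave the low half
// of In with zero elements. Each call doubles the logical element width.
static std::vector<uint8_t> unpackLoWithZero(const std::vector<uint8_t> &In,
                                             int EltBytes) {
  std::vector<uint8_t> Out(In.size(), 0);
  int NumElts = (int)In.size() / EltBytes;
  for (int i = 0; i < NumElts / 2; ++i)       // low half of the elements
    for (int b = 0; b < EltBytes; ++b)
      Out[(2 * i) * EltBytes + b] = In[i * EltBytes + b]; // odd slots stay 0
  return Out;
}

int main() {
  // Zero-extend the low four bytes of a 16-byte vector to 32-bit elements:
  // two rounds of unpack-with-zero, matching Scale == 4 above.
  std::vector<uint8_t> V = {1, 2,  3,  4,  5,  6,  7,  8,
                            9, 10, 11, 12, 13, 14, 15, 16};
  V = unpackLoWithZero(V, 1); // i8  -> i16 elements
  V = unpackLoWithZero(V, 2); // i16 -> i32 elements
  for (uint8_t B : V)
    std::printf("%d ", B);
  std::printf("\n"); // 1 0 0 0 2 0 0 0 3 0 0 0 4 0 0 0
}
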
-/// \brief Try to lower a vector shuffle as a zero extension on any micrarch.
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
@@ -7894,7 +6996,10 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
- int NumElements = Mask.size();
+ int NumElements = VT.getVectorNumElements();
+ assert(VT.getScalarSizeInBits() <= 32 &&
+ "Exceeds 32-bit integer zero extension limit");
+ assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
@@ -7905,11 +7010,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
if (Mask[i] == -1)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
- // Each of the extend elements needs to be zeroable.
+        // Each of the extended elements needs to be zeroable.
if (!Zeroable[i])
return SDValue();
- // We no lorger are in the anyext case.
+ // We no longer are in the anyext case.
AnyExt = false;
continue;
}
@@ -7923,7 +7028,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue(); // Flip-flopping inputs.
if (Mask[i] % NumElements != i / Scale)
- return SDValue(); // Non-consecutive strided elemenst.
+ return SDValue(); // Non-consecutive strided elements.
}
// If we fail to find an input, we have a zero-shuffle which should always
@@ -7933,7 +7038,7 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
return SDValue();
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
+ DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
@@ -7945,11 +7050,34 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
- "The input vector size must be divisble by the extended size.");
+ "The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
+ // General extends failed, but 128-bit vectors may be able to use MOVQ.
+ if (Bits != 128)
+ return SDValue();
+
+ // Returns one of the source operands if the shuffle can be reduced to a
+ // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
+ auto CanZExtLowHalf = [&]() {
+ for (int i = NumElements / 2; i != NumElements; ++i)
+ if (!Zeroable[i])
+ return SDValue();
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+ return V1;
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+ return V2;
+ return SDValue();
+ };
+
+ if (SDValue V = CanZExtLowHalf()) {
+ V = DAG.getBitcast(MVT::v2i64, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+ return DAG.getBitcast(VT, V);
+ }
+
// No viable ext lowering found.
return SDValue();
}
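
The MOVQ special case at the end boils down to a simple predicate: the upper half of the result must be zeroable and the lower half must be a sequential copy of one input's low half. A compact, array-level sketch of that check (canZExtLowHalf here is a stand-in, not the lambda above):

#include <cstdio>
#include <vector>

// Returns 1 if the low half can be taken from V1, 2 for V2, and 0 if no
// MOVQ-style lowering applies. Mask uses -1 for undef, [0, N) for V1 and
// [N, 2N) for V2.
static int canZExtLowHalf(const std::vector<int> &Mask,
                          const std::vector<bool> &Zeroable) {
  int N = (int)Mask.size();
  for (int i = N / 2; i < N; ++i)
    if (!Zeroable[i])
      return 0;                   // Upper half must become zeros.
  auto LowHalfIs = [&](int Base) {
    for (int i = 0; i < N / 2; ++i)
      if (Mask[i] >= 0 && Mask[i] != Base + i)
        return false;             // Must be sequential (or undef).
    return true;
  };
  if (LowHalfIs(0))
    return 1;
  if (LowHalfIs(N))
    return 2;
  return 0;
}

int main() {
  // v4i32 mask [4, 5, zz, zz]: copy V2's low 64 bits, zero the high 64 bits.
  std::vector<int> Mask = {4, 5, -1, -1};
  std::vector<bool> Zeroable = {false, false, true, true};
  std::printf("source = V%d\n", canZExtLowHalf(Mask, Zeroable)); // source = V2
}
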
@@ -7970,8 +7098,13 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
return SDValue();
if (V.getOpcode() == ISD::BUILD_VECTOR ||
- (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
- return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
+ (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
+ // Ensure the scalar operand is the same size as the destination.
+ // FIXME: Add support for scalar truncation where possible.
+ SDValue S = V.getOperand(Idx);
+ if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S);
+ }
return SDValue();
}
@@ -7992,7 +7125,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
- MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
MVT ExtVT = VT;
@@ -8016,7 +7149,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
if (SDValue V2S = getScalarValueForVectorElement(
V2, Mask[V2Index] - Mask.size(), DAG)) {
// We need to zext the scalar if it is smaller than an i32.
- V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
+ V2S = DAG.getBitcast(EltVT, V2S);
if (EltVT == MVT::i8 || EltVT == MVT::i16) {
// Using zext to expand a narrow element won't work for non-zero
// insertions.
@@ -8059,9 +7192,13 @@ static SDValue lowerVectorShuffleAsElementInsertion(
ExtVT, V1, V2);
}
+ // This lowering only works for the low element with floating point vectors.
+ if (VT.isFloatingPoint() && V2Index != 0)
+ return SDValue();
+
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
- V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+ V2 = DAG.getBitcast(VT, V2);
if (V2Index != 0) {
// If we have 4 or fewer lanes we can cheaply shuffle the element into
@@ -8073,13 +7210,13 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V2Shuffle[V2Index] = 0;
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+ V2 = DAG.getBitcast(MVT::v2i64, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
DAG.getConstant(
- V2Index * EltVT.getSizeInBits(),
+ V2Index * EltVT.getSizeInBits()/8, DL,
DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
- V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+ V2 = DAG.getBitcast(VT, V2);
}
}
return V2;
@@ -8090,7 +7227,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
@@ -8111,8 +7248,8 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
"a sorted mask where the broadcast "
"comes from V1.");
- // Go up the chain of (vector) values to try and find a scalar load that
- // we can combine with the broadcast.
+ // Go up the chain of (vector) values to find a scalar load that we can
+ // combine with the broadcast.
for (;;) {
switch (V.getOpcode()) {
case ISD::CONCAT_VECTORS: {
@@ -8149,12 +7286,12 @@ static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
(V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
- // If the scalar isn't a load we can't broadcast from it in AVX1, only with
- // AVX2.
+ // If the scalar isn't a load, we can't broadcast from it in AVX1.
+ // Only AVX2 has register broadcasts.
if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
return SDValue();
} else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
- // We can't broadcast from a vector register w/o AVX2, and we can only
+ // We can't broadcast from a vector register without AVX2, and we can only
// broadcast from the zero-element of a vector register.
return SDValue();
}
@@ -8183,7 +7320,7 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
int V2DstIndex = -1;
bool V1UsedInPlace = false;
- for (int i = 0; i < 4; i++) {
+ for (int i = 0; i < 4; ++i) {
// Synthesize a zero mask from the zeroable elements (includes undefs).
if (Zeroable[i]) {
ZMask |= 1 << i;
@@ -8237,7 +7374,122 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
// Insert the V2 element into the desired position.
SDLoc DL(Op);
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, MVT::i8));
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+}
+
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up alternating between the
+/// two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
+static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() &&
+ "This routine only supports integer vectors.");
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This routine should only be used when blending two inputs.");
+ assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+ int Size = Mask.size();
+
+ int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
+ return M >= 0 && M % Size < Size / 2;
+ });
+ int NumHiInputs = std::count_if(
+ Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
+
+ bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+ auto TryUnpack = [&](MVT UnpackVT, int Scale) {
+ SmallVector<int, 32> V1Mask(Mask.size(), -1);
+ SmallVector<int, 32> V2Mask(Mask.size(), -1);
+
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // Each element of the unpack contains Scale elements from this mask.
+ int UnpackIdx = i / Scale;
+
+ // We only handle the case where V1 feeds the first slots of the unpack.
+ // We rely on canonicalization to ensure this is the case.
+ if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ // Setup the mask for this input. The indexing is tricky as we have to
+ // handle the unpack stride.
+ SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+ VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+ Mask[i] % Size;
+ }
+
+ // If we will have to shuffle both inputs to use the unpack, check whether
+ // we can just unpack first and shuffle the result. If so, skip this unpack.
+ if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
+ !isNoopShuffleMask(V2Mask))
+ return SDValue();
+
+ // Shuffle the inputs into place.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+ // Cast the inputs to the type we will use to unpack them.
+ V1 = DAG.getBitcast(UnpackVT, V1);
+ V2 = DAG.getBitcast(UnpackVT, V2);
+
+ // Unpack the inputs and cast the result back to the desired type.
+ return DAG.getBitcast(
+ VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ UnpackVT, V1, V2));
+ };
+
+ // We try each unpack from the largest to the smallest to try and find one
+ // that fits this mask.
+ int OrigNumElements = VT.getVectorNumElements();
+ int OrigScalarSize = VT.getScalarSizeInBits();
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
+ int Scale = ScalarSize / OrigScalarSize;
+ int NumElements = OrigNumElements / Scale;
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
+ if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
+ return Unpack;
+ }
+
+ // If none of the unpack-rooted lowerings worked (or were profitable) try an
+ // initial unpack.
+ if (NumLoInputs == 0 || NumHiInputs == 0) {
+ assert((NumLoInputs > 0 || NumHiInputs > 0) &&
+ "We have to have *some* inputs!");
+ int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
+
+ // FIXME: We could consider the total complexity of the permute of each
+ // possible unpacking. Or at the least we should consider how many
+ // half-crossings are created.
+ // FIXME: We could consider commuting the unpacks.
+
+ SmallVector<int, 32> PermMask;
+ PermMask.assign(Size, -1);
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
+
+ PermMask[i] =
+ 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
+ }
+ return DAG.getVectorShuffle(
+ VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
+ DL, VT, V1, V2),
+ DAG.getUNDEF(VT), PermMask);
+ }
+
+ return SDValue();
}
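
For the simplest case (Scale == 1, unpack-lo) the mask bookkeeping in this routine reduces to splitting a two-input mask into two single-input permutes whose UNPCKL reproduces the original shuffle. A sketch of just that case (splitForUnpackLo is an illustrative name, not the code above):

#include <cstdio>
#include <vector>

static bool splitForUnpackLo(const std::vector<int> &Mask,
                             std::vector<int> &V1Mask,
                             std::vector<int> &V2Mask) {
  int Size = (int)Mask.size();
  V1Mask.assign(Size, -1);
  V2Mask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    bool FromV1 = Mask[i] < Size;
    if ((i % 2 == 0) != FromV1)
      return false;                // V1 must feed the even unpack slots.
    std::vector<int> &VMask = FromV1 ? V1Mask : V2Mask;
    VMask[i / 2] = Mask[i] % Size; // Unpack-lo reads element i/2 of each input.
  }
  return true;
}

int main() {
  // v4i32 mask {0, 6, 1, 7}: alternating inputs, ideal for permute + UNPCKL.
  std::vector<int> Mask = {0, 6, 1, 7}, V1M, V2M;
  if (splitForUnpackLo(Mask, V1M, V2M))
    std::printf("V1 perm {%d,%d,%d,%d}, V2 perm {%d,%d,%d,%d}\n", V1M[0],
                V1M[1], V1M[2], V1M[3], V2M[0], V2M[1], V2M[2], V2M[3]);
  // Prints V1 perm {0,1,-1,-1}, V2 perm {2,3,-1,-1}; UNPCKL of the permuted
  // vectors then interleaves them back into {0, 6, 1, 7}.
}
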
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
@@ -8259,6 +7511,11 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (isSingleInputShuffleMask(Mask)) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3())
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+
// Straight shuffle of a single input vector. Simulate this by using the
     // single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
@@ -8267,38 +7524,33 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
- DAG.getConstant(SHUFPDMask, MVT::i8));
+ DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
- return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
- DAG.getConstant(SHUFPDMask, MVT::i8));
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
+ DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
assert(Mask[1] >= 2 && "Non-canonicalized blend!");
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 2))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 3))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
-
// If we have a single input, insert that into V1 if we can do so cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
return Insertion;
}
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
- if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {1, 3}))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
@@ -8312,9 +7564,15 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
Subtarget, DAG))
return Blend;
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
- return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
- DAG.getConstant(SHUFPDMask, MVT::i8));
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
+ DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
@@ -8336,54 +7594,77 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
+ V1 = DAG.getBitcast(MVT::v4i32, V1);
int WidenedMask[4] = {
std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
- return DAG.getNode(
- ISD::BITCAST, DL, MVT::v2i64,
- DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
- getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
- }
+ return DAG.getBitcast(
+ MVT::v2i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
+ getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
+ }
+ assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+  // If we have a blend of two PACKUS operations and the blend aligns with
+  // the low and high halves, we can just merge the PACKUS operations. This
+  // is particularly important as it lets us merge shuffles that this routine
+  // itself creates.
+ auto GetPackNode = [](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
+ };
+ if (SDValue V1Pack = GetPackNode(V1))
+ if (SDValue V2Pack = GetPackNode(V2))
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
return Shift;
- // If we have a single input from V2 insert that into V1 if we can do so
- // cheaply.
- if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
- return Insertion;
- // Try inverting the insertion since for v2 masks it is easy to do and we
- // can't reliably sort the mask one way or the other.
- int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
- Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
- return Insertion;
- }
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
+ return Insertion;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
- if (Subtarget->hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
- Subtarget, DAG))
- return Blend;
-
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
@@ -8391,14 +7672,38 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
+ Mask, DAG);
+
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
// have this problem. It would be really nice if x86 had better shuffles here.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
- return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
- DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+ V1 = DAG.getBitcast(MVT::v2f64, V1);
+ V2 = DAG.getBitcast(MVT::v2f64, V2);
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
+}
+
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
+ return false;
+ if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
+ return false;
+
+ return true;
}
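
For context, SHUFPS builds its two low result lanes from the first operand and its two high result lanes from the second, each lane selected by a 2-bit field of the immediate; that is why the predicate above only needs each half of the mask to draw from a single input. A tiny sketch of the immediate encoding for a mask that passes the check (shufpsImm is a hypothetical host-side helper):

#include <cstdio>

// Encode the SHUFPS immediate for a mask whose low half reads one operand and
// whose high half reads the other; indices are already reduced to 0..3.
static unsigned shufpsImm(int L0, int L1, int H0, int H1) {
  return (unsigned)(L0 | (L1 << 2) | (H0 << 4) | (H1 << 6));
}

int main() {
  // Mask {1, 0, 6, 7}: low half from V1, high half from V2, so a single
  // SHUFPS of V1 and V2 suffices.
  std::printf("imm = 0x%02x\n", shufpsImm(1, 0, 6 - 4, 7 - 4)); // imm = 0xe1
}
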
/// \brief Lower a vector shuffle using the SHUFPS instruction.
@@ -8437,7 +7742,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
int V1Index = V2AdjIndex;
int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
- getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+ getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now proceed to reconstruct the final blend as we have the necessary
// high or low half formed.
@@ -8476,7 +7781,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
- getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+ getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
// Now we do a normal shuffle of V1 by giving V1 as both operands to
// a blend.
@@ -8488,7 +7793,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
}
}
return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
- getV4X86ShuffleImm8ForMask(NewMask, DAG));
+ getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// \brief Lower 4-lane 32-bit floating point shuffles.
@@ -8512,36 +7817,38 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3()) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+ }
+
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
- getV4X86ShuffleImm8ForMask(Mask, DAG));
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
// Otherwise, use a straight shuffle of a single input vector. We pass the
// input vector to both operands to simulate this with a SHUFPS.
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
- getV4X86ShuffleImm8ForMask(Mask, DAG));
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
Mask, Subtarget, DAG))
return V;
@@ -8553,8 +7860,23 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Use INSERTPS if we can complete the shuffle efficiently.
if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
return V;
+
+ if (!isSingleSHUFPSMask(Mask))
+ if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+ DL, MVT::v4f32, V1, V2, Mask, DAG))
+ return BlendPerm;
}
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
+
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
@@ -8586,7 +7908,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -8597,37 +7919,48 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
- if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
Mask = UnpackLoMask;
- else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+ else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
- getV4X86ShuffleImm8ForMask(Mask, DAG));
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v4i32, V1, V2, Mask, DAG))
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return V;
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
-
- if (Subtarget->hasSSE41())
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
+
// Try to use byte rotation instructions.
   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
@@ -8635,16 +7968,26 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
+ Mask, DAG);
+
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Unpack;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
   // up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
- return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
- DAG.getVectorShuffle(
- MVT::v4f32, DL,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
- DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
+ return DAG.getBitcast(
+ MVT::v4i32,
+ DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
+ DAG.getBitcast(MVT::v4f32, V2), Mask));
}
/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
@@ -8658,10 +8001,18 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
-static SDValue lowerV8I16SingleInputVectorShuffle(
- SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+ SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
+ MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
@@ -8686,27 +8037,6 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
- Mask, Subtarget, DAG))
- return Broadcast;
-
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v8i16, V, V, Mask, DAG))
- return Shift;
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
- if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
-
- // Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
- return Rotate;
-
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
@@ -8811,7 +8141,7 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
MVT::v8i16, V,
- getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M != -1 && M == FixIdx)
@@ -8835,10 +8165,10 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
- getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+ V = DAG.getBitcast(
+ VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
@@ -8849,8 +8179,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
- return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
- Mask);
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
+ DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
@@ -9073,16 +8403,16 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
- getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
- getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
- getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+ V = DAG.getBitcast(
+ VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// At this point, each half should contain all its inputs, and we can then
// just shuffle them into their final position.
@@ -9095,172 +8425,70 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
- getV4X86ShuffleImm8ForMask(LoMask, DAG));
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
// Do a half shuffle with the high mask after shifting its values down.
for (int &M : HiMask)
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
- getV4X86ShuffleImm8ForMask(HiMask, DAG));
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
+ getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
return V;
}
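
The overall shape of this routine is "fix up dwords with PSHUFD, then fix each half in place with PSHUFLW/PSHUFHW". The sketch below models those three instructions on plain arrays and composes them for one example mask; it only illustrates the instruction semantics, not the routine's balancing heuristics, and the helper names are invented:

#include <array>
#include <cstdio>

using V8 = std::array<int, 8>;

static V8 pshuflw(V8 V, const std::array<int, 4> &M) {
  V8 R = V;                       // high four words pass through unchanged
  for (int i = 0; i < 4; ++i)
    R[i] = V[M[i]];
  return R;
}
static V8 pshufhw(V8 V, const std::array<int, 4> &M) {
  V8 R = V;                       // low four words pass through unchanged
  for (int i = 0; i < 4; ++i)
    R[4 + i] = V[4 + M[i]];
  return R;
}
static V8 pshufd(V8 V, const std::array<int, 4> &M) {
  V8 R;                           // permutes 32-bit pairs of 16-bit words
  for (int i = 0; i < 4; ++i) {
    R[2 * i] = V[2 * M[i]];
    R[2 * i + 1] = V[2 * M[i] + 1];
  }
  return R;
}

int main() {
  // Single-input v8i16 mask {6, 7, 4, 5, 3, 2, 1, 0}: move the high dwords
  // low first, then fix the remaining half in place.
  V8 V = {0, 1, 2, 3, 4, 5, 6, 7};
  V8 R = pshufd(V, {3, 2, 1, 0}); // -> 6 7 4 5 2 3 0 1
  R = pshuflw(R, {0, 1, 2, 3});   // identity; the real lowering skips no-ops
  R = pshufhw(R, {1, 0, 3, 2});   // -> 6 7 4 5 3 2 1 0
  for (int X : R)
    std::printf("%d ", X);
  std::printf("\n");
}
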
-/// \brief Detect whether the mask pattern should be lowered through
-/// interleaving.
-///
-/// This essentially tests whether viewing the mask as an interleaving of two
-/// sub-sequences reduces the cross-input traffic of a blend operation. If so,
-/// lowering it through interleaving is a significantly better strategy.
-static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
- int NumEvenInputs[2] = {0, 0};
- int NumOddInputs[2] = {0, 0};
- int NumLoInputs[2] = {0, 0};
- int NumHiInputs[2] = {0, 0};
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] < 0)
- continue;
-
- int InputIdx = Mask[i] >= Size;
-
- if (i < Size / 2)
- ++NumLoInputs[InputIdx];
- else
- ++NumHiInputs[InputIdx];
-
- if ((i % 2) == 0)
- ++NumEvenInputs[InputIdx];
- else
- ++NumOddInputs[InputIdx];
- }
-
- // The minimum number of cross-input results for both the interleaved and
- // split cases. If interleaving results in fewer cross-input results, return
- // true.
- int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
- NumEvenInputs[0] + NumOddInputs[1]);
- int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
- NumLoInputs[0] + NumHiInputs[1]);
- return InterleavedCrosses < SplitCrosses;
-}
-
-/// \brief Blend two v8i16 vectors using a naive unpack strategy.
-///
-/// This strategy only works when the inputs from each vector fit into a single
-/// half of that vector, and generally there are not so many inputs as to leave
-/// the in-place shuffles required highly constrained (and thus expensive). It
-/// shifts all the inputs into a single side of both input vectors and then
-/// uses an unpack to interleave these inputs in a single vector. At that
-/// point, we will fall back on the generic single input shuffle lowering.
-static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
- SDValue V2,
- MutableArrayRef<int> Mask,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
- assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
- SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
- for (int i = 0; i < 8; ++i)
- if (Mask[i] >= 0 && Mask[i] < 4)
- LoV1Inputs.push_back(i);
- else if (Mask[i] >= 4 && Mask[i] < 8)
- HiV1Inputs.push_back(i);
- else if (Mask[i] >= 8 && Mask[i] < 12)
- LoV2Inputs.push_back(i);
- else if (Mask[i] >= 12)
- HiV2Inputs.push_back(i);
-
- int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
- int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
- (void)NumV1Inputs;
- (void)NumV2Inputs;
- assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
- assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
- assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
-
- bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
- HiV1Inputs.size() + HiV2Inputs.size();
-
- auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
- ArrayRef<int> HiInputs, bool MoveToLo,
- int MaskOffset) {
- ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
- ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
- if (BadInputs.empty())
- return V;
-
- int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int MoveOffset = MoveToLo ? 0 : 4;
+/// \brief Helper to form a PSHUFB-based shuffle+blend.
+static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, bool &V1InUse,
+ bool &V2InUse) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V1Mask[16];
+ SDValue V2Mask[16];
+ V1InUse = false;
+ V2InUse = false;
- if (GoodInputs.empty()) {
- for (int BadInput : BadInputs) {
- MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
- Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
- }
+ int Size = Mask.size();
+ int Scale = 16 / Size;
+ for (int i = 0; i < 16; ++i) {
+ if (Mask[i / Scale] == -1) {
+ V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
- if (GoodInputs.size() == 2) {
- // If the low inputs are spread across two dwords, pack them into
- // a single dword.
- MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
- MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
- Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
- Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
- } else {
- // Otherwise pin the good inputs.
- for (int GoodInput : GoodInputs)
- MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
- }
-
- if (BadInputs.size() == 2) {
- // If we have two bad inputs then there may be either one or two good
- // inputs fixed in place. Find a fixed input, and then find the *other*
- // two adjacent indices by using modular arithmetic.
- int GoodMaskIdx =
- std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
- [](int M) { return M >= 0; }) -
- std::begin(MoveMask);
- int MoveMaskIdx =
- ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
- assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
- assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
- MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
- MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
- Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
- Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
- } else {
- assert(BadInputs.size() == 1 && "All sizes handled");
- int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
- std::end(MoveMask), -1) -
- std::begin(MoveMask);
- MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
- Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
- }
- }
-
- return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
- MoveMask);
- };
- V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
- /*MaskOffset*/ 0);
- V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
- /*MaskOffset*/ 8);
-
- // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
- // cross-half traffic in the final shuffle.
-
- // Munge the mask to be a single-input mask after the unpack merges the
- // results.
- for (int &M : Mask)
- if (M != -1)
- M = 2 * (M % 4) + (M / 8);
+ const int ZeroMask = 0x80;
+ int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
+ : ZeroMask;
+ int V2Idx = Mask[i / Scale] < Size
+ ? ZeroMask
+ : (Mask[i / Scale] - Size) * Scale + i % Scale;
+ if (Zeroable[i / Scale])
+ V1Idx = V2Idx = ZeroMask;
+ V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
+ V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
+ V1InUse |= (ZeroMask != V1Idx);
+ V2InUse |= (ZeroMask != V2Idx);
+ }
+ }
+
+ if (V1InUse)
+ V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getBitcast(MVT::v16i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ if (V2InUse)
+ V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getBitcast(MVT::v16i8, V2),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+
+ // If we need shuffled inputs from both, blend the two.
+ SDValue V;
+ if (V1InUse && V2InUse)
+ V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ else
+ V = V1InUse ? V1 : V2;
- return DAG.getVectorShuffle(
- MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
- DL, MVT::v8i16, V1, V2),
- DAG.getUNDEF(MVT::v8i16), Mask);
+ // Cast the result back to the correct type.
+ return DAG.getBitcast(VT, V);
}
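
PSHUFB zeroes any destination byte whose control byte has the top bit set, which is what the 0x80 sentinel above relies on; shuffling each input against its own control vector and OR-ing the results implements the blend. A host-side model (pshufb here is a plain-array stand-in for the instruction):

#include <cstdint>
#include <cstdio>
#include <vector>

// Byte-wise model of PSHUFB: each control byte selects a source byte, and a
// control byte with bit 7 set produces zero.
static std::vector<uint8_t> pshufb(const std::vector<uint8_t> &V,
                                   const std::vector<uint8_t> &Ctl) {
  std::vector<uint8_t> R(16, 0);
  for (int i = 0; i < 16; ++i)
    if (!(Ctl[i] & 0x80))
      R[i] = V[Ctl[i] & 0x0f];
  return R;
}

int main() {
  // Interleave bytes of two inputs: even result bytes from A, odd from B,
  // mirroring the V1Mask/V2Mask construction (0x80 = "not from this input").
  std::vector<uint8_t> A(16), B(16), CtlA(16), CtlB(16);
  for (int i = 0; i < 16; ++i) {
    A[i] = (uint8_t)i;
    B[i] = (uint8_t)(100 + i);
    CtlA[i] = i % 2 == 0 ? (uint8_t)i : (uint8_t)0x80;
    CtlB[i] = i % 2 == 0 ? (uint8_t)0x80 : (uint8_t)i;
  }
  std::vector<uint8_t> RA = pshufb(A, CtlA), RB = pshufb(B, CtlB), R(16);
  for (int i = 0; i < 16; ++i)
    R[i] = (uint8_t)(RA[i] | RB[i]); // the final OR blends the two shuffles
  for (uint8_t X : R)
    std::printf("%d ", X);
  std::printf("\n");                 // 0 101 2 103 4 105 ...
}
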
/// \brief Generic lowering of 8-lane i16 shuffles.
@@ -9297,85 +8525,95 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return ZExt;
auto isV1 = [](int M) { return M >= 0 && M < 8; };
+ (void)isV1;
auto isV2 = [](int M) { return M >= 8; };
- int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
- if (NumV2Inputs == 0)
- return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+ if (NumV2Inputs == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
+ if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG);
+ }
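The single-input unpack checks above match masks such as {0, 0, 1, 1, 2, 2, 3, 3} directly onto UNPCKL with the same register used for both operands. A small sketch of the 8 x i16 unpack-low interleaving, assuming a hypothetical unpackloModel helper rather than the real instruction or any LLVM API:

    #include <array>
    #include <cstdio>

    // Scalar model of an 8 x i16 unpack-low: it interleaves the low four
    // lanes of its two inputs, so unpacking a register with itself duplicates
    // each of those lanes.
    static std::array<int, 8> unpackloModel(const std::array<int, 8> &A,
                                            const std::array<int, 8> &B) {
      std::array<int, 8> Out{};
      for (int i = 0; i < 4; ++i) {
        Out[2 * i] = A[i];
        Out[2 * i + 1] = B[i];
      }
      return Out;
    }

    int main() {
      std::array<int, 8> V1{10, 11, 12, 13, 14, 15, 16, 17};
      // Unpacking V1 with V1 duplicates each of the low four lanes, which is
      // exactly the shuffle mask {0, 0, 1, 1, 2, 2, 3, 3} matched above.
      for (int E : unpackloModel(V1, V1))
        std::printf("%d ", E);
      std::printf("\n");
      return 0;
    }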
- assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
- "to be V1-input shuffles.");
+ assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
+ "All single-input shuffles should be canonicalized to be V1-input "
+ "shuffles.");
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v8i16, V1, V2, Mask, DAG))
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return V;
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
- if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
-
- if (Subtarget->hasSSE41())
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
return Blend;
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
- if (NumV1Inputs + NumV2Inputs <= 4)
- return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
-
- // Check whether an interleaving lowering is likely to be more efficient.
- // This isn't perfect but it is a strong heuristic that tends to work well on
- // the kinds of shuffles that show up in practice.
- //
- // FIXME: Handle 1x, 2x, and 4x interleaving.
- if (shouldLowerAsInterleaving(Mask)) {
- // FIXME: Figure out whether we should pack these into the low or high
- // halves.
-
- int EMask[8], OMask[8];
- for (int i = 0; i < 4; ++i) {
- EMask[i] = Mask[2*i];
- OMask[i] = Mask[2*i + 1];
- EMask[i + 4] = -1;
- OMask[i + 4] = -1;
- }
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return BitBlend;
- SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
- SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Unpack;
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
+ // If we can't directly blend but can use PSHUFB, that will be better as it
+ // can both shuffle and set up the inefficient blend.
+ if (!IsBlendSupported && Subtarget->hasSSSE3()) {
+ bool V1InUse, V2InUse;
+ return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
+ V1InUse, V2InUse);
}
- int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
-
- for (int i = 0; i < 4; ++i) {
- LoBlendMask[i] = Mask[i];
- HiBlendMask[i] = Mask[i + 4];
- }
-
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
- LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
- HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
-
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+ // We can always bit-blend if we have to so the fallback strategy is to
+ // decompose into single-input permutes and blends.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ Mask, DAG);
}
/// \brief Check whether a compaction lowering can be done by dropping even
@@ -9461,40 +8699,31 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> OrigMask = SVOp->getMask();
- assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return ZExt;
- int MaskStorage[16] = {
- OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
- OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
- OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
- OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
- MutableArrayRef<int> Mask(MaskStorage);
- MutableArrayRef<int> LoMask = Mask.slice(0, 8);
- MutableArrayRef<int> HiMask = Mask.slice(8, 8);
-
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -9560,10 +8789,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Update the lane map based on the mapping we ended up with.
LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
}
- V1 = DAG.getNode(
- ISD::BITCAST, DL, MVT::v16i8,
- DAG.getVectorShuffle(MVT::v8i16, DL,
- DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ V1 = DAG.getBitcast(
+ MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
@@ -9581,46 +8809,26 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
"Conflicting entrties in the original shuffle!");
}
- return DAG.getNode(
- ISD::BITCAST, DL, MVT::v16i8,
- DAG.getVectorShuffle(MVT::v8i16, DL,
- DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+ return DAG.getBitcast(
+ MVT::v16i8,
+ DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};
if (SDValue V = tryToWidenViaDuplication())
return V;
}
- // Check whether an interleaving lowering is likely to be more efficient.
- // This isn't perfect but it is a strong heuristic that tends to work well on
- // the kinds of shuffles that show up in practice.
- //
- // FIXME: We need to handle other interleaving widths (i16, i32, ...).
- if (shouldLowerAsInterleaving(Mask)) {
- int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
- return (M >= 0 && M < 8) || (M >= 16 && M < 24);
- });
- int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
- return (M >= 8 && M < 16) || M >= 24;
- });
- int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1};
- int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1};
- bool UnpackLo = NumLoHalf >= NumHiHalf;
- MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
- MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
- for (int i = 0; i < 8; ++i) {
- TargetEMask[i] = Mask[2 * i];
- TargetOMask[i] = Mask[2 * i + 1];
- }
-
- SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
- SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
-
- return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
- MVT::v16i8, Evens, Odds);
- }
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // High half.
+ 4, 20, 5, 21, 6, 22, 7, 23}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+ 8, 24, 9, 25, 10, 26, 11, 27,
+ // High half.
+ 12, 28, 13, 29, 14, 30, 15, 31}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
@@ -9636,52 +8844,47 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget->hasSSSE3()) {
- SDValue V1Mask[16];
- SDValue V2Mask[16];
bool V1InUse = false;
bool V2InUse = false;
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- for (int i = 0; i < 16; ++i) {
- if (Mask[i] == -1) {
- V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
- } else {
- const int ZeroMask = 0x80;
- int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
- int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
- if (Zeroable[i])
- V1Idx = V2Idx = ZeroMask;
- V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
- V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
- V1InUse |= (ZeroMask != V1Idx);
- V2InUse |= (ZeroMask != V2Idx);
- }
- }
+ SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
+ DAG, V1InUse, V2InUse);
- if (V1InUse)
- V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
- if (V2InUse)
- V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+ // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+ // do so. This avoids using them to handle blends-with-zero which is
+ // important as a single pshufb is significantly faster for that.
+ if (V1InUse && V2InUse) {
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return Blend;
+
+ // We can use an unpack to do the blending rather than an or in some
+ // cases. Even though the or may be (very minorly) more efficient, we
+    // prefer this lowering because there are common cases where part of
+ // the complexity of the shuffles goes away when we do the final blend as
+ // an unpack.
+ // FIXME: It might be worth trying to detect if the unpack-feeding
+ // shuffles will both be pshufb, in which case we shouldn't bother with
+ // this.
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return Unpack;
+ }
- // If we need shuffled inputs from both, blend the two.
- if (V1InUse && V2InUse)
- return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
- if (V1InUse)
- return V1; // Single inputs are easy.
- if (V2InUse)
- return V2; // Single inputs are easy.
- // Shuffling to a zeroable vector.
- return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
+ return PSHUFB;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return V;
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return BitBlend;
+
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
@@ -9701,91 +8904,76 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// We use the mask type to pick which bytes are preserved based on how many
// elements are dropped.
MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
- SDValue ByteClearMask =
- DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
- DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
+ SDValue ByteClearMask = DAG.getBitcast(
+ MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
// Now pack things back together.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
- V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+ V1 = DAG.getBitcast(MVT::v8i16, V1);
+ V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
for (int i = 1; i < NumEvenDrops; ++i) {
- Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
+ Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
return Result;
}
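The compaction path above first zeroes everything but the low byte of each wider element (the AND with a 0xFF constant of the chosen mask type) and then folds the vector in half with PACKUS once per dropped factor. Below is a sketch of a single PACKUSWB-style step on plain arrays; packusModel is a hypothetical scalar model, and because the inputs are pre-masked into [0, 255] the saturation never actually clamps.

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of an unsigned-saturating pack of two 8 x i16 vectors into
    // one 16 x u8 vector. With the high bytes already cleared by the AND mask
    // above, each output byte is just the low byte of its source lane.
    static std::array<uint8_t, 16> packusModel(const std::array<int16_t, 8> &A,
                                               const std::array<int16_t, 8> &B) {
      auto Sat = [](int16_t V) -> uint8_t {
        return uint8_t(V < 0 ? 0 : (V > 255 ? 255 : V));
      };
      std::array<uint8_t, 16> Out{};
      for (int i = 0; i < 8; ++i) {
        Out[i] = Sat(A[i]);
        Out[i + 8] = Sat(B[i]);
      }
      return Out;
    }

    int main() {
      std::array<int16_t, 8> A{}, B{};
      for (int i = 0; i < 8; ++i) {
        A[i] = int16_t(i);        // already masked down to the low byte
        B[i] = int16_t(100 + i);
      }
      for (uint8_t V : packusModel(A, B))
        std::printf("%u ", unsigned(V));
      std::printf("\n");
      return 0;
    }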
- int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ // Handle multi-input cases by blending single-input shuffles.
+ if (NumV2Elements > 0)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
+ Mask, DAG);
- auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
- MutableArrayRef<int> V1HalfBlendMask,
- MutableArrayRef<int> V2HalfBlendMask) {
- for (int i = 0; i < 8; ++i)
- if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
- V1HalfBlendMask[i] = HalfMask[i];
- HalfMask[i] = i;
- } else if (HalfMask[i] >= 16) {
- V2HalfBlendMask[i] = HalfMask[i] - 16;
- HalfMask[i] = i + 8;
- }
- };
- buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
- buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+ // The fallback path for single-input shuffles widens this into two v8i16
+ // vectors with unpacks, shuffles those, and then pulls them back together
+ // with a pack.
+ SDValue V = V1;
- SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+ int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] >= 0)
+ (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
- auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
- MutableArrayRef<int> HiBlendMask) {
- SDValue V1, V2;
- // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
- // them out and avoid using UNPCK{L,H} to extract the elements of V as
- // i16s.
- if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
- [](int M) { return M >= 0 && M % 2 == 1; }) &&
- std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
- [](int M) { return M >= 0 && M % 2 == 1; })) {
- // Use a mask to drop the high bytes.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
- V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
- DAG.getConstant(0x00FF, MVT::v8i16));
-
- // This will be a single vector shuffle instead of a blend so nuke V2.
- V2 = DAG.getUNDEF(MVT::v8i16);
-
- // Squash the masks to point directly into V1.
- for (int &M : LoBlendMask)
- if (M >= 0)
- M /= 2;
- for (int &M : HiBlendMask)
- if (M >= 0)
- M /= 2;
- } else {
- // Otherwise just unpack the low half of V into V1 and the high half into
- // V2 so that we can blend them as i16s.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
- }
+ SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
- SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
- SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
- return std::make_pair(BlendedLo, BlendedHi);
- };
- SDValue V1Lo, V1Hi, V2Lo, V2Hi;
- std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
- std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+ SDValue VLoHalf, VHiHalf;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ VLoHalf = DAG.getBitcast(MVT::v8i16, V);
+ VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+ DAG.getConstant(0x00FF, DL, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+ VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into VLoHalf.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+ // Otherwise just unpack the low half of V into VLoHalf and the high half into
+ // VHiHalf so that we can blend them as i16s.
+ VLoHalf = DAG.getBitcast(
+ MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ VHiHalf = DAG.getBitcast(
+ MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
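The VLoHalf masking above exploits the fact that, when only the even bytes of the v16i8 are referenced, viewing the register as v8i16 and ANDing with 0x00FF leaves each 16-bit lane holding exactly one original byte, so the byte indices can simply be halved. A tiny standalone demonstration of that reinterpretation, assuming little-endian byte order:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Reinterpret sixteen bytes as eight little-endian u16 lanes and mask
    // with 0x00FF: each lane then holds exactly one of the even source bytes.
    int main() {
      std::array<uint8_t, 16> Bytes{};
      for (int i = 0; i < 16; ++i)
        Bytes[i] = uint8_t(i * 3);
      std::array<uint16_t, 8> Words{};
      for (int i = 0; i < 8; ++i) {
        uint16_t W = uint16_t(Bytes[2 * i] | (Bytes[2 * i + 1] << 8));
        Words[i] = W & 0x00FF;
      }
      for (int i = 0; i < 8; ++i)
        std::printf("byte[%d]=%u word[%d]=%u\n", 2 * i, unsigned(Bytes[2 * i]),
                    i, unsigned(Words[i]));
      return 0;
    }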
@@ -9871,7 +9059,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
return true;
}
-/// \brief Generic routine to split ector shuffle into half-sized shuffles.
+/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
@@ -9892,14 +9080,43 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
MVT ScalarVT = VT.getScalarType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
- SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
- DAG.getIntPtrConstant(SplitNumElements));
- SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
- DAG.getIntPtrConstant(0));
- SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
- DAG.getIntPtrConstant(SplitNumElements));
+ // Rather than splitting build-vectors, just build two narrower build
+ // vectors. This helps shuffling with splats and zeros.
+ auto SplitVector = [&](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V->getOperand(0);
+
+ MVT OrigVT = V.getSimpleValueType();
+ int OrigNumElements = OrigVT.getVectorNumElements();
+ int OrigSplitNumElements = OrigNumElements / 2;
+ MVT OrigScalarVT = OrigVT.getScalarType();
+ MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+ SDValue LoV, HiV;
+
+ auto *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV) {
+ LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(0, DL));
+ HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(OrigSplitNumElements, DL));
+ } else {
+
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (int i = 0; i < OrigSplitNumElements; ++i) {
+ LoOps.push_back(BV->getOperand(i));
+ HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+ }
+ LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
+ HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+ }
+ return std::make_pair(DAG.getBitcast(SplitVT, LoV),
+ DAG.getBitcast(SplitVT, HiV));
+ };
+
+ SDValue LoV1, HiV1, LoV2, HiV2;
+ std::tie(LoV1, HiV1) = SplitVector(V1);
+ std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
@@ -10046,7 +9263,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
int LaneSize = Mask.size() / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
- // less expensive. The flags track wether the given lane contains an element
+ // less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
bool LaneCrossing[2] = {false, false};
for (int i = 0, Size = Mask.size(); i < Size; ++i)
@@ -10071,7 +9288,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
// allow folding it into a memory operand.
unsigned PERMMask = 3 | 2 << 4;
SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
- V1, DAG.getConstant(PERMMask, MVT::i8));
+ V1, DAG.getConstant(PERMMask, DL, MVT::i8));
return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
@@ -10086,33 +9303,49 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+  // TODO: If minimizing size and one of the inputs is a zero vector and the
+  // zero vector has only one use, we could use a VPERM2X128 to save the
+ // instruction bytes needed to explicitly generate the zero vector.
+
// Blends are faster and handle all the non-lane-crossing cases.
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
- // Check for patterns which can be matched with a single insert of a 128-bit
- // subvector.
- bool OnlyUsesV1 = isShuffleEquivalent(Mask, 0, 1, 0, 1);
- if (OnlyUsesV1 || isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
- }
- if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
- DAG.getIntPtrConstant(2));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
- }
-
- // Otherwise form a 128-bit permutation.
- // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
+ bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // If either input operand is a zero vector, use VPERM2X128 because its mask
+ // allows us to replace the zero input with an implicit zero.
+ if (!IsV1Zero && !IsV2Zero) {
+ // Check for patterns which can be matched with a single insert of a 128-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
+ if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
+ }
+
+ // Otherwise form a 128-bit permutation. After accounting for undefs,
+ // convert the 64-bit shuffle mask selection values into 128-bit
+ // selection bits by dividing the indexes by 2 and shifting into positions
+ // defined by a vperm2*128 instruction's immediate control byte.
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
+
int MaskLO = Mask[0];
if (MaskLO == SM_SentinelUndef)
MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
@@ -10122,8 +9355,29 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
+
+ // If either input is a zero vector, replace it with an undef input.
+ // Shuffle mask values < 4 are selecting elements of V1.
+ // Shuffle mask values >= 4 are selecting elements of V2.
+ // Adjust each half of the permute mask by clearing the half that was
+ // selecting the zero vector and setting the zero mask bit.
+ if (IsV1Zero) {
+ V1 = DAG.getUNDEF(VT);
+ if (MaskLO < 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI < 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
+ }
+ if (IsV2Zero) {
+ V2 = DAG.getUNDEF(VT);
+ if (MaskLO >= 4)
+ PermMask = (PermMask & 0xf0) | 0x08;
+ if (MaskHI >= 4)
+ PermMask = (PermMask & 0x0f) | 0x80;
+ }
+
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
- DAG.getConstant(PermMask, MVT::i8));
+ DAG.getConstant(PermMask, DL, MVT::i8));
}
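To make the immediate encoding described in the comments above concrete, here is a standalone sketch that builds the same PermMask from a v4i64-style shuffle mask and applies it to plain arrays. vperm2Model is a hypothetical scalar model of the selection and zeroing bits, not an LLVM or intrinsic API.

    #include <array>
    #include <cstdio>

    // Bits [1:0]/[5:4] select a 128-bit source half for the low/high half of
    // the destination (0 = V1 low, 1 = V1 high, 2 = V2 low, 3 = V2 high), and
    // bits [3]/[7] zero that destination half instead, matching the comments
    // above.
    static std::array<long long, 4> vperm2Model(const std::array<long long, 4> &V1,
                                                const std::array<long long, 4> &V2,
                                                unsigned Imm) {
      auto Half = [&](unsigned Sel) -> std::array<long long, 2> {
        if (Sel & 0x8)
          return {0, 0};                      // zero this destination half
        const auto &Src = (Sel & 0x2) ? V2 : V1;
        unsigned Base = (Sel & 0x1) ? 2 : 0;  // low or high 128-bit half
        return {Src[Base], Src[Base + 1]};
      };
      auto Lo = Half(Imm & 0xF), Hi = Half((Imm >> 4) & 0xF);
      return {Lo[0], Lo[1], Hi[0], Hi[1]};
    }

    int main() {
      std::array<long long, 4> V1{10, 11, 12, 13}, V2{20, 21, 22, 23};
      // Mask {2, 3, 4, 5}: MaskLO = 2, MaskHI = 4, so
      // PermMask = (2 / 2) | (4 / 2) << 4 = 0x21.
      unsigned PermMask = (2 / 2) | (4 / 2) << 4;
      for (long long E : vperm2Model(V1, V2, PermMask))
        std::printf("%lld ", E);
      std::printf("\n");
      return 0;
    }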
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
@@ -10190,12 +9444,12 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
}
- V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
+ V1 = DAG.getBitcast(LaneVT, V1);
+ V2 = DAG.getBitcast(LaneVT, V2);
SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
// Cast it back to the type we actually want.
- LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
+ LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
// Now do a simple shuffle that isn't lane crossing.
SmallVector<int, 8> NewMask;
@@ -10224,6 +9478,37 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
return true;
}
+static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+
+ // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
+  // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
+ assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
+ int NumElts = VT.getVectorNumElements();
+ bool ShufpdMask = true;
+ bool CommutableMask = true;
+ unsigned Immediate = 0;
+ for (int i = 0; i < NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ int Val = (i & 6) + NumElts * (i & 1);
+ int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
+ if (Mask[i] < Val || Mask[i] > Val + 1)
+ ShufpdMask = false;
+ if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
+ CommutableMask = false;
+ Immediate |= (Mask[i] % 2) << i;
+ }
+ if (ShufpdMask)
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+ DAG.getConstant(Immediate, DL, MVT::i8));
+ if (CommutableMask)
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+ DAG.getConstant(Immediate, DL, MVT::i8));
+ return SDValue();
+}
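The Immediate accumulated above encodes, for each 64-bit destination lane, whether the low or high element of that lane's 128-bit pair is taken, with even lanes reading V1 and odd lanes reading V2. A short sketch of that decoding on plain arrays; shufpdModel is a hypothetical scalar model used only to illustrate the (Mask[i] % 2) << i encoding.

    #include <array>
    #include <cstdio>

    // Destination lane i reads from V1 when i is even and V2 when i is odd,
    // picking the low or high element of that lane's 128-bit pair according
    // to immediate bit i.
    static std::array<double, 4> shufpdModel(const std::array<double, 4> &V1,
                                             const std::array<double, 4> &V2,
                                             unsigned Imm) {
      std::array<double, 4> Out{};
      for (int i = 0; i < 4; ++i) {
        const auto &Src = (i % 2 == 0) ? V1 : V2;
        int PairBase = i & ~1;               // 0 for lanes 0/1, 2 for lanes 2/3
        Out[i] = Src[PairBase + ((Imm >> i) & 1)];
      }
      return Out;
    }

    int main() {
      std::array<double, 4> V1{0, 1, 2, 3}, V2{4, 5, 6, 7};
      // Shuffle mask {1, 4, 3, 6} passes the ShufpdMask check above and
      // yields Immediate 0b0101; the output reproduces that mask.
      for (double D : shufpdModel(V1, V2, 0b0101))
        std::printf("%g ", D);
      std::printf("\n");
      return 0;
    }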
+
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -10245,23 +9530,27 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
Mask, Subtarget, DAG))
return Broadcast;
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
- DAG.getConstant(VPERMILPMask, MVT::i8));
+ DAG.getConstant(VPERMILPMask, DL, MVT::i8));
}
// With AVX2 we have direct support for this permutation.
if (Subtarget->hasAVX2())
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
- getV4X86ShuffleImm8ForMask(Mask, DAG));
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
@@ -10270,43 +9559,23 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
-
- // If we have a single input to the zero element, insert that into V1 if we
- // can do so cheaply.
- int NumV2Elements =
- std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
- if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
- return Insertion;
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
- if ((Mask[0] == -1 || Mask[0] < 2) &&
- (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
- (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
- (Mask[3] == -1 || Mask[3] >= 6)) {
- unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
- ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
- return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
- DAG.getConstant(SHUFPDMask, MVT::i8));
- }
- if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
- (Mask[1] == -1 || Mask[1] < 2) &&
- (Mask[2] == -1 || Mask[2] >= 6) &&
- (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
- unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
- ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
- return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
- DAG.getConstant(SHUFPDMask, MVT::i8));
- }
+ if (SDValue Op =
+ lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ return Op;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
@@ -10353,7 +9622,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10368,25 +9637,34 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
}
- return DAG.getNode(
- ISD::BITCAST, DL, MVT::v4i64,
+ return DAG.getBitcast(
+ MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
- getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+ DAG.getBitcast(MVT::v8i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
}
// AVX2 provides a direct instruction for permuting a single input across
// lanes.
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
- getV4X86ShuffleImm8ForMask(Mask, DAG));
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
@@ -10422,7 +9700,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10432,15 +9710,26 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
- getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends. We also need to squash the
@@ -10457,18 +9746,18 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
SDValue VPermMask[8];
for (int i = 0; i < 8; ++i)
VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
- : DAG.getConstant(Mask[i], MVT::i32);
+ : DAG.getConstant(Mask[i], DL, MVT::i32);
if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
return DAG.getNode(
X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
if (Subtarget->hasAVX2())
- return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
- DAG.getNode(ISD::BUILD_VECTOR, DL,
+ return DAG.getNode(
+ X86ISD::VPERMV, DL, MVT::v8f32,
+ DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL,
MVT::v8i32, VPermMask)),
- V1);
+ V1);
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
@@ -10506,12 +9795,19 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10523,22 +9819,35 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
- getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
}
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return Shift;
+
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
if (isSingleInputShuffleMask(Mask)) {
SDValue VPermMask[8];
for (int i = 0; i < 8; ++i)
VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
- : DAG.getConstant(Mask[i], MVT::i32);
+ : DAG.getConstant(Mask[i], DL, MVT::i32);
return DAG.getNode(
X86ISD::VPERMV, DL, MVT::v8i32,
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
@@ -10570,8 +9879,15 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10580,19 +9896,29 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask,
- // First 128-bit lane:
- 0, 16, 1, 17, 2, 18, 3, 19,
- // Second 128-bit lane:
- 8, 24, 9, 25, 10, 26, 11, 27))
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane:
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // Second 128-bit lane:
+ 8, 24, 9, 25, 10, 26, 11, 27}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
- if (isShuffleEquivalent(Mask,
- // First 128-bit lane:
- 4, 20, 5, 21, 6, 22, 7, 23,
- // Second 128-bit lane:
- 12, 28, 13, 29, 14, 30, 15, 31))
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane:
+ 4, 20, 5, 21, 6, 22, 7, 23,
+ // Second 128-bit lane:
+ 12, 28, 13, 29, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
if (isSingleInputShuffleMask(Mask)) {
// There are no generalized cross-lane shuffle operations available on i16
// element types.
@@ -10600,6 +9926,15 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
Mask, DAG);
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v16 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ }
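The fall-through above relies on the repeated-lane-mask check: when both 128-bit lanes of a single-input v16i16 shuffle apply the same in-lane permutation, the extracted 8-element pattern is a valid v8i16 mask for the general single-input lowering. Below is a simplified standalone sketch of that check, assuming a single input and eight 16-bit lanes per 128-bit lane; laneRepeatedMask is a hypothetical helper, not the in-tree is128BitLaneRepeatedShuffleMask.

    #include <cstdio>
    #include <vector>

    // Succeeds only when no element crosses a 128-bit lane and both lanes
    // agree on the same local 8-element permutation.
    static bool laneRepeatedMask(const std::vector<int> &Mask,
                                 std::vector<int> &Repeated) {
      const int NumElts = (int)Mask.size();  // 16 for the v16i16 case
      const int LaneSize = 8;
      Repeated.assign(LaneSize, -1);
      for (int i = 0; i < NumElts; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        if (M / LaneSize != i / LaneSize)
          return false;                      // crosses a 128-bit lane
        int Local = M % LaneSize;
        if (Repeated[i % LaneSize] < 0)
          Repeated[i % LaneSize] = Local;
        else if (Repeated[i % LaneSize] != Local)
          return false;                      // lanes disagree
      }
      return true;
    }

    int main() {
      // Both 128-bit lanes reverse their low and high four elements.
      std::vector<int> Mask = {3, 2, 1, 0, 7, 6, 5, 4,
                               11, 10, 9, 8, 15, 14, 13, 12};
      std::vector<int> Repeated;
      if (laneRepeatedMask(Mask, Repeated)) {
        for (int M : Repeated)
          std::printf("%d ", M);
        std::printf("\n");
      }
      return 0;
    }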
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
@@ -10609,15 +9944,14 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
int M = i < 8 ? Mask[i] : Mask[i] - 8;
assert(M >= 0 && M < 8 && "Invalid single-input mask!");
- PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
- PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
+ PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8);
+ PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8);
}
- return DAG.getNode(
- ISD::BITCAST, DL, MVT::v16i16,
- DAG.getNode(
- X86ISD::PSHUFB, DL, MVT::v32i8,
- DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
+ return DAG.getBitcast(MVT::v16i16,
+ DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
+ DAG.getBitcast(MVT::v32i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ MVT::v32i8, PSHUFBMask)));
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -10645,8 +9979,15 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
@@ -10658,20 +9999,30 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// Note that these are repeated 128-bit lane unpacks, not unpacks across all
// 256-bit lanes.
if (isShuffleEquivalent(
- Mask,
- // First 128-bit lane:
- 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
- // Second 128-bit lane:
- 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
+ V1, V2, Mask,
+ {// First 128-bit lane:
+ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+ // Second 128-bit lane:
+ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
if (isShuffleEquivalent(
- Mask,
- // First 128-bit lane:
- 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
- // Second 128-bit lane:
- 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
+ V1, V2, Mask,
+ {// First 128-bit lane:
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+ // Second 128-bit lane:
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
if (isSingleInputShuffleMask(Mask)) {
// There are no generalized cross-lane shuffle operations available on i8
// element types.
@@ -10684,7 +10035,8 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
PSHUFBMask[i] =
Mask[i] < 0
? DAG.getUNDEF(MVT::i8)
- : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
+ : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL,
+ MVT::i8);
return DAG.getNode(
X86ISD::PSHUFB, DL, MVT::v32i8, V1,
@@ -10713,6 +10065,18 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumElts = VT.getVectorNumElements();
+ int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) {
+ return M >= NumElts;
+ });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+
// There is a really nice hard cut-over between AVX1 and AVX2 that means we can
// check for those subtargets here and avoid much of the subtarget querying in
// the per-vector-type lowering routines. With AVX1 we have essentially *zero*
@@ -10727,10 +10091,9 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
VT.getVectorNumElements());
- V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
+ V1 = DAG.getBitcast(FpVT, V1);
+ V2 = DAG.getBitcast(FpVT, V2);
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
}
switch (VT.SimpleTy) {
@@ -10752,98 +10115,144 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
}
}
-/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
- assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
- assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
-
- // X86 has dedicated unpack instructions that can handle specific blend
- // operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
+ assert(VT.getScalarSizeInBits() >= 32 && "Unexpected data type for VALIGN");
+ // VALIGN pattern 2, 3, 4, 5, .. (sequential, shifted right)
+ int AlignVal = -1;
+ for (int i = 0; i < (signed)VT.getVectorNumElements(); ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if (Mask[i] < i)
+ return SDValue();
+ if (AlignVal == -1)
+ AlignVal = Mask[i] - i;
+ else if (Mask[i] - i != AlignVal)
+ return SDValue();
+ }
+ // Vector source operands should be swapped
+ return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
+ DAG.getConstant(AlignVal, DL, MVT::i8));
}
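The VALIGN matcher above accepts only masks of the form {K, K+1, K+2, ...}, i.e. a read of K .. K+N-1 out of the concatenation of the two sources, which is why a single AlignVal survives the loop and why the source operands are swapped when the node is built, as the comment above notes. A standalone sketch of that pattern for eight elements; valignModel is a hypothetical scalar model of the mask semantics, not the instruction itself.

    #include <array>
    #include <cstdio>

    // Reads elements AlignVal .. AlignVal+7 out of the 16-element
    // concatenation {Lo, Hi}, i.e. a right shift of the concatenated vector
    // by AlignVal elements.
    static std::array<int, 8> valignModel(const std::array<int, 8> &Lo,
                                          const std::array<int, 8> &Hi,
                                          int AlignVal) {
      std::array<int, 8> Out{};
      for (int i = 0; i < 8; ++i) {
        int Idx = i + AlignVal;              // sequential, shifted right
        Out[i] = Idx < 8 ? Lo[Idx] : Hi[Idx - 8];
      }
      return Out;
    }

    int main() {
      std::array<int, 8> V1{0, 1, 2, 3, 4, 5, 6, 7};
      std::array<int, 8> V2{8, 9, 10, 11, 12, 13, 14, 15};
      // Mask {2, 3, 4, 5, 6, 7, 8, 9} gives AlignVal == 2.
      for (int E : valignModel(V1, V2, 2))
        std::printf("%d ", E);
      std::printf("\n");
      return 0;
    }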
-/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
- assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
- assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask,
- 0, 16, 1, 17, 4, 20, 5, 21,
- 8, 24, 9, 25, 12, 28, 13, 29))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
- if (isShuffleEquivalent(Mask,
- 2, 18, 3, 19, 6, 22, 7, 23,
- 10, 26, 11, 27, 14, 30, 15, 31))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+ assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
+ MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
+ MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
+
+ SmallVector<SDValue, 32> VPermMask;
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
+ VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) :
+ DAG.getConstant(Mask[i], DL,MaskEltVT));
+ SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT,
+ VPermMask);
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+
+ return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2);
}
-/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
-static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+
+/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
+static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
- assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
- assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+ MVT VT = Op.getSimpleValueType();
+ assert((V1.getSimpleValueType() == MVT::v8f64 ||
+ V1.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
+ assert((V2.getSimpleValueType() == MVT::v8f64 ||
+ V2.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
+ if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
+ return Op;
+
+ if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG))
+ return Op;
+
+ // PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7
+ if (isSingleInputShuffleMask(Mask)) {
+ if (!is128BitLaneCrossingShuffleMask(VT, Mask))
+ return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
+ get1bitLaneShuffleImm8ForMask(Mask, DL, DAG));
+
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+ return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+ }
+ return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
-static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
- assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
- assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+ assert((V1.getSimpleValueType() == MVT::v16i32 ||
+ V1.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
+ assert((V2.getSimpleValueType() == MVT::v16i32 ||
+ V2.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask,
- 0, 16, 1, 17, 4, 20, 5, 21,
- 8, 24, 9, 25, 12, 28, 13, 29))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
- if (isShuffleEquivalent(Mask,
- 2, 18, 3, 19, 6, 22, 7, 23,
- 10, 26, 11, 27, 14, 30, 15, 31))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane.
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ // Second 128-bit lane.
+ 8, 24, 9, 25, 12, 28, 13, 29}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane.
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ // Second 128-bit lane.
+ 10, 26, 11, 27, 14, 30, 15, 31}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+ 12, 12, 14, 14}))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11,
+ 13, 13, 15, 15}))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
- // FIXME: Implement direct support for this type!
- return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
+ if (isSingleInputShuffleMask(Mask)) {
+ unsigned Opc = VT.isInteger() ? X86ISD::PSHUFD : X86ISD::VPERMILPI;
+ return DAG.getNode(Opc, DL, VT, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+ }
+
+ for (int i = 0; i < 4; ++i)
+ if (RepeatedMask[i] >= 16)
+ RepeatedMask[i] -= 12;
+ return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
+ }
+
+ if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
+ return Op;
+
+ return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
}
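// [Editor's sketch -- not part of the diff] The "RepeatedMask[i] -= 12" step
// above can look opaque: for a v16 shuffle, entries >= 16 refer to V2, but the
// SHUFPS-style lowering of the 4-element repeated mask expects V2 lanes to be
// numbered 4-7. A hedged illustration of that remapping (name is hypothetical):
static void remapRepeatedMaskForShufps(int RepeatedMask[4]) {
  for (int i = 0; i != 4; ++i)
    if (RepeatedMask[i] >= 16)
      RepeatedMask[i] -= 12; // 16 -> 4, 17 -> 5, 18 -> 6, 19 -> 7
}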
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
@@ -10893,8 +10302,8 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
"Cannot lower 512-bit vectors w/ basic ISA!");
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
@@ -10903,13 +10312,11 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
- case MVT::v16f32:
- return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8X64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v16f32:
case MVT::v16i32:
- return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16X32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v32i16:
if (Subtarget->hasBWI())
return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
@@ -10970,6 +10377,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
}
+ // We actually see shuffles that are entirely re-arrangements of a set of
+ // zero inputs. This mostly happens while decomposing complex shuffles into
+ // simple ones. Directly lower these as a buildvector of zeros.
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.all())
+ return getZeroVector(VT, Subtarget, DAG, dl);
+
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
@@ -10984,10 +10398,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
// Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on SSE1.
if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
- V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+ V1 = DAG.getBitcast(NewVT, V1);
+ V2 = DAG.getBitcast(NewVT, V2);
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
}
}
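// [Editor's sketch -- not part of the diff] The getBitcast calls above express
// the usual "shuffle at a wider element type" idiom: if every aligned pair of
// mask entries selects an aligned pair of source lanes, the shuffle can be
// performed with half as many, twice-as-wide elements. A simplified check for
// the pair case, ignoring the undef handling the real code performs (name and
// signature are illustrative only):
static bool widenMaskByTwo(const int Mask[4], int Widened[2]) {
  for (int i = 0; i != 2; ++i) {
    int Lo = Mask[2 * i], Hi = Mask[2 * i + 1];
    if (Lo < 0 || Hi != Lo + 1 || (Lo & 1))
      return false;
    Widened[i] = Lo / 2; // e.g. v4i32 <0, 1, 6, 7> widens to v2i64 <0, 3>
  }
  return true;
}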
@@ -11057,1586 +10471,6 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
llvm_unreachable("Unimplemented!");
}
-
-//===----------------------------------------------------------------------===//
-// Legacy vector shuffle lowering
-//
-// This code is the legacy code handling vector shuffles until the above
-// replaces its functionality and performance.
-//===----------------------------------------------------------------------===//
-
-static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
- bool hasInt256, unsigned *MaskOut = nullptr) {
- MVT EltVT = VT.getVectorElementType();
-
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return false;
-
- if (!hasSSE41 || EltVT == MVT::i8)
- return false;
- if (!hasInt256 && VT == MVT::v16i16)
- return false;
-
- unsigned MaskValue = 0;
- unsigned NumElems = VT.getVectorNumElements();
- // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
- unsigned NumLanes = (NumElems - 1) / 8 + 1;
- unsigned NumElemsInLane = NumElems / NumLanes;
-
- // Blend for v16i16 should be symmetric for both lanes.
- for (unsigned i = 0; i < NumElemsInLane; ++i) {
-
- int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
- int EltIdx = MaskVals[i];
-
- if ((EltIdx < 0 || EltIdx == (int)i) &&
- (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
- continue;
-
- if (((unsigned)EltIdx == (i + NumElems)) &&
- (SndLaneEltIdx < 0 ||
- (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
- MaskValue |= (1 << i);
- else
- return false;
- }
-
- if (MaskOut)
- *MaskOut = MaskValue;
- return true;
-}
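// [Editor's sketch -- not part of the diff] The legacy isBlendMask above builds
// an immediate whose bit i is set when result lane i is taken from V2. A
// minimal standalone version for a 4-lane, single-128-bit-lane mask, assuming
// the convention that entries >= 4 select from V2 (name is hypothetical):
static bool computeBlendImm4(const int Mask[4], unsigned &Imm) {
  Imm = 0;
  for (int i = 0; i != 4; ++i) {
    if (Mask[i] < 0 || Mask[i] == i)
      continue;         // undef or identity: leave the bit clear
    if (Mask[i] == i + 4)
      Imm |= 1u << i;   // lane i comes from V2
    else
      return false;     // not expressible as an immediate blend
  }
  return true;
}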
-
-// Try to lower a shuffle node into a simple blend instruction.
- // This function assumes isBlendMask returns true for this
- // ShuffleVectorSDNode
-static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
- unsigned MaskValue,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- MVT VT = SVOp->getSimpleValueType(0);
- MVT EltVT = VT.getVectorElementType();
- assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
- Subtarget->hasInt256() && "Trying to lower a "
- "VECTOR_SHUFFLE to a Blend but "
- "with the wrong mask"));
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- unsigned NumElems = VT.getVectorNumElements();
-
- // Convert i32 vectors to floating point if it is not AVX2.
- // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
- MVT BlendVT = VT;
- if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
- BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
- NumElems);
- V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
- }
-
- SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
- DAG.getConstant(MaskValue, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
-}
-
-/// In vector type \p VT, return true if the element at index \p InputIdx
-/// falls on a different 128-bit lane than \p OutputIdx.
-static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
- unsigned OutputIdx) {
- unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
-}
-
-/// Generate a PSHUFB if possible. Selects elements from \p V1 according to
-/// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to
-/// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p
-/// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
-/// zero.
-static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
- SelectionDAG &DAG) {
- MVT VT = V1.getSimpleValueType();
- assert(VT.is128BitVector() || VT.is256BitVector());
-
- MVT EltVT = VT.getVectorElementType();
- unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
- unsigned NumElts = VT.getVectorNumElements();
-
- SmallVector<SDValue, 32> PshufbMask;
- for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
- int InputIdx = MaskVals[OutputIdx];
- unsigned InputByteIdx;
-
- if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
- InputByteIdx = 0x80;
- else {
- // Cross lane is not allowed.
- if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
- return SDValue();
- InputByteIdx = InputIdx * EltSizeInBytes;
- // Index is a byte offset within the 128-bit lane.
- InputByteIdx &= 0xf;
- }
-
- for (unsigned j = 0; j < EltSizeInBytes; ++j) {
- PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
- if (InputByteIdx != 0x80)
- ++InputByteIdx;
- }
- }
-
- MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
- if (ShufVT != VT)
- V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
- return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
-}
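// [Editor's sketch -- not part of the diff] getPSHUFB above scales element
// indices to byte offsets within a 128-bit lane and uses 0x80 to force a zero
// byte. A reduced illustration for a single-input 16 x i8 shuffle (name is
// hypothetical; the real helper also rejects lane-crossing indices):
static void buildPshufbByteMask(const int Mask[16], unsigned char Bytes[16]) {
  for (int i = 0; i != 16; ++i)
    Bytes[i] = (Mask[i] < 0 || Mask[i] >= 16)
                   ? 0x80                            // undef / out of range: zero the byte
                   : (unsigned char)(Mask[i] & 0xf); // byte offset inside the lane
}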
-
-// v8i16 shuffles - Prefer shuffles in the following order:
-// 1. [all] pshuflw, pshufhw, optional move
-// 2. [ssse3] 1 x pshufb
-// 3. [ssse3] 2 x pshufb + 1 x por
-// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
-static SDValue
-LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- SmallVector<int, 8> MaskVals;
-
- // Determine if more than 1 of the words in each of the low and high quadwords
- // of the result come from the same quadword of one of the two inputs. Undef
- // mask values count as coming from any quadword, for better codegen.
- //
- // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
- // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
- unsigned LoQuad[] = { 0, 0, 0, 0 };
- unsigned HiQuad[] = { 0, 0, 0, 0 };
- // Indices of quads used.
- std::bitset<4> InputQuads;
- for (unsigned i = 0; i < 8; ++i) {
- unsigned *Quad = i < 4 ? LoQuad : HiQuad;
- int EltIdx = SVOp->getMaskElt(i);
- MaskVals.push_back(EltIdx);
- if (EltIdx < 0) {
- ++Quad[0];
- ++Quad[1];
- ++Quad[2];
- ++Quad[3];
- continue;
- }
- ++Quad[EltIdx / 4];
- InputQuads.set(EltIdx / 4);
- }
-
- int BestLoQuad = -1;
- unsigned MaxQuad = 1;
- for (unsigned i = 0; i < 4; ++i) {
- if (LoQuad[i] > MaxQuad) {
- BestLoQuad = i;
- MaxQuad = LoQuad[i];
- }
- }
-
- int BestHiQuad = -1;
- MaxQuad = 1;
- for (unsigned i = 0; i < 4; ++i) {
- if (HiQuad[i] > MaxQuad) {
- BestHiQuad = i;
- MaxQuad = HiQuad[i];
- }
- }
-
- // For SSSE3, if all 8 words of the result come from only 1 quadword of each
- // of the two input vectors, shuffle them into one input vector so only a
- // single pshufb instruction is necessary. If there are more than 2 input
- // quads, disable the next transformation since it does not help SSSE3.
- bool V1Used = InputQuads[0] || InputQuads[1];
- bool V2Used = InputQuads[2] || InputQuads[3];
- if (Subtarget->hasSSSE3()) {
- if (InputQuads.count() == 2 && V1Used && V2Used) {
- BestLoQuad = InputQuads[0] ? 0 : 1;
- BestHiQuad = InputQuads[2] ? 2 : 3;
- }
- if (InputQuads.count() > 2) {
- BestLoQuad = -1;
- BestHiQuad = -1;
- }
- }
-
- // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
- // the shuffle mask. If a quad is scored as -1, that means that it contains
- // words from all 4 input quadwords.
- SDValue NewV;
- if (BestLoQuad >= 0 || BestHiQuad >= 0) {
- int MaskV[] = {
- BestLoQuad < 0 ? 0 : BestLoQuad,
- BestHiQuad < 0 ? 1 : BestHiQuad
- };
- NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
- NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
-
- // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
- // source words for the shuffle, to aid later transformations.
- bool AllWordsInNewV = true;
- bool InOrder[2] = { true, true };
- for (unsigned i = 0; i != 8; ++i) {
- int idx = MaskVals[i];
- if (idx != (int)i)
- InOrder[i/4] = false;
- if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
- continue;
- AllWordsInNewV = false;
- break;
- }
-
- bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
- if (AllWordsInNewV) {
- for (int i = 0; i != 8; ++i) {
- int idx = MaskVals[i];
- if (idx < 0)
- continue;
- idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
- if ((idx != i) && idx < 4)
- pshufhw = false;
- if ((idx != i) && idx > 3)
- pshuflw = false;
- }
- V1 = NewV;
- V2Used = false;
- BestLoQuad = 0;
- BestHiQuad = 1;
- }
-
- // If we've eliminated the use of V2, and the new mask is a pshuflw or
- // pshufhw, that's as cheap as it gets. Return the new shuffle.
- if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
- unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
- unsigned TargetMask = 0;
- NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
- DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
- TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
- getShufflePSHUFLWImmediate(SVOp);
- V1 = NewV.getOperand(0);
- return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
- }
- }
-
- // Promote splats to a larger type which usually leads to more efficient code.
- // FIXME: Is this true if pshufb is available?
- if (SVOp->isSplat())
- return PromoteSplat(SVOp, DAG);
-
- // If we have SSSE3, and all words of the result are from 1 input vector,
- // case 2 is generated, otherwise case 3 is generated. If no SSSE3
- // is present, fall back to case 4.
- if (Subtarget->hasSSSE3()) {
- SmallVector<SDValue,16> pshufbMask;
-
- // If we have elements from both input vectors, set the high bit of the
- // shuffle mask element to zero out elements that come from V2 in the V1
- // mask, and elements that come from V1 in the V2 mask, so that the two
- // results can be OR'd together.
- bool TwoInputs = V1Used && V2Used;
- V1 = getPSHUFB(MaskVals, V1, dl, DAG);
- if (!TwoInputs)
- return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
-
- // Calculate the shuffle mask for the second input, shuffle it, and
- // OR it with the first shuffled input.
- CommuteVectorShuffleMask(MaskVals, 8);
- V2 = getPSHUFB(MaskVals, V2, dl, DAG);
- V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
- return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
- }
-
- // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
- // and update MaskVals with new element order.
- std::bitset<8> InOrder;
- if (BestLoQuad >= 0) {
- int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
- for (int i = 0; i != 4; ++i) {
- int idx = MaskVals[i];
- if (idx < 0) {
- InOrder.set(i);
- } else if ((idx / 4) == BestLoQuad) {
- MaskV[i] = idx & 3;
- InOrder.set(i);
- }
- }
- NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
- &MaskV[0]);
-
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
- NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
- NewV.getOperand(0),
- getShufflePSHUFLWImmediate(SVOp), DAG);
- }
- }
-
- // If BestHi >= 0, generate a pshufhw to put the high elements in order,
- // and update MaskVals with the new element order.
- if (BestHiQuad >= 0) {
- int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
- for (unsigned i = 4; i != 8; ++i) {
- int idx = MaskVals[i];
- if (idx < 0) {
- InOrder.set(i);
- } else if ((idx / 4) == BestHiQuad) {
- MaskV[i] = (idx & 3) + 4;
- InOrder.set(i);
- }
- }
- NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
- &MaskV[0]);
-
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
- NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
- NewV.getOperand(0),
- getShufflePSHUFHWImmediate(SVOp), DAG);
- }
- }
-
- // In case BestHi & BestLo were both -1, which means each quadword has a word
- // from each of the four input quadwords, calculate the InOrder bitvector now
- // before falling through to the insert/extract cleanup.
- if (BestLoQuad == -1 && BestHiQuad == -1) {
- NewV = V1;
- for (int i = 0; i != 8; ++i)
- if (MaskVals[i] < 0 || MaskVals[i] == i)
- InOrder.set(i);
- }
-
- // The other elements are put in the right place using pextrw and pinsrw.
- for (unsigned i = 0; i != 8; ++i) {
- if (InOrder[i])
- continue;
- int EltIdx = MaskVals[i];
- if (EltIdx < 0)
- continue;
- SDValue ExtOp = (EltIdx < 8) ?
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
- DAG.getIntPtrConstant(EltIdx)) :
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
- DAG.getIntPtrConstant(EltIdx - 8));
- NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
- DAG.getIntPtrConstant(i));
- }
- return NewV;
-}
-
-/// \brief v16i16 shuffles
-///
-/// FIXME: We only support generation of a single pshufb currently. We can
-/// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
-/// well (e.g 2 x pshufb + 1 x por).
-static SDValue
-LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
-
- if (V2.getOpcode() != ISD::UNDEF)
- return SDValue();
-
- SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
- return getPSHUFB(MaskVals, V1, dl, DAG);
-}
-
-// v16i8 shuffles - Prefer shuffles in the following order:
-// 1. [ssse3] 1 x pshufb
-// 2. [ssse3] 2 x pshufb + 1 x por
-// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
-static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
- const X86Subtarget* Subtarget,
- SelectionDAG &DAG) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- ArrayRef<int> MaskVals = SVOp->getMask();
-
- // Promote splats to a larger type which usually leads to more efficient code.
- // FIXME: Is this true if pshufb is available?
- if (SVOp->isSplat())
- return PromoteSplat(SVOp, DAG);
-
- // If we have SSSE3, case 1 is generated when all result bytes come from
- // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
- // present, fall back to case 3.
-
- // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
- if (Subtarget->hasSSSE3()) {
- SmallVector<SDValue,16> pshufbMask;
-
- // If all result elements are from one input vector, then only translate
- // undef mask values to 0x80 (zero out result) in the pshufb mask.
- //
- // Otherwise, we have elements from both input vectors, and must zero out
- // elements that come from V2 in the first mask, and V1 in the second mask
- // so that we can OR them together.
- for (unsigned i = 0; i != 16; ++i) {
- int EltIdx = MaskVals[i];
- if (EltIdx < 0 || EltIdx >= 16)
- EltIdx = 0x80;
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- }
- V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, pshufbMask));
-
- // As PSHUFB will zero elements with negative indices, it's safe to ignore
- // the 2nd operand if it's undefined or zero.
- if (V2.getOpcode() == ISD::UNDEF ||
- ISD::isBuildVectorAllZeros(V2.getNode()))
- return V1;
-
- // Calculate the shuffle mask for the second input, shuffle it, and
- // OR it with the first shuffled input.
- pshufbMask.clear();
- for (unsigned i = 0; i != 16; ++i) {
- int EltIdx = MaskVals[i];
- EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- }
- V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, dl,
- MVT::v16i8, pshufbMask));
- return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
- }
-
- // No SSSE3 - Calculate in-place words and then fix all out-of-place words
- // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from
- // the 16 different words that comprise the two doublequadword input vectors.
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
- SDValue NewV = V1;
- for (int i = 0; i != 8; ++i) {
- int Elt0 = MaskVals[i*2];
- int Elt1 = MaskVals[i*2+1];
-
- // This word of the result is all undef, skip it.
- if (Elt0 < 0 && Elt1 < 0)
- continue;
-
- // This word of the result is already in the correct place, skip it.
- if ((Elt0 == i*2) && (Elt1 == i*2+1))
- continue;
-
- SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
- SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
- SDValue InsElt;
-
- // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
- // together using a single extract, load it and store it.
- if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
- InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
- DAG.getIntPtrConstant(Elt1 / 2));
- NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
- DAG.getIntPtrConstant(i));
- continue;
- }
-
- // If Elt1 is defined, extract it from the appropriate source. If the
- // source byte is not also odd, shift the extracted word left 8 bits;
- // otherwise clear the bottom 8 bits if we need to do an or.
- if (Elt1 >= 0) {
- InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
- DAG.getIntPtrConstant(Elt1 / 2));
- if ((Elt1 & 1) == 0)
- InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
- DAG.getConstant(8,
- TLI.getShiftAmountTy(InsElt.getValueType())));
- else if (Elt0 >= 0)
- InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
- DAG.getConstant(0xFF00, MVT::i16));
- }
- // If Elt0 is defined, extract it from the appropriate source. If the
- // source byte is not also even, shift the extracted word right 8 bits. If
- // Elt1 was also defined, OR the extracted values together before
- // inserting them in the result.
- if (Elt0 >= 0) {
- SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
- Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
- if ((Elt0 & 1) != 0)
- InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
- DAG.getConstant(8,
- TLI.getShiftAmountTy(InsElt0.getValueType())));
- else if (Elt1 >= 0)
- InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
- DAG.getConstant(0x00FF, MVT::i16));
- InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
- : InsElt0;
- }
- NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
- DAG.getIntPtrConstant(i));
- }
- return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
-}
-
-// v32i8 shuffles - Translate to VPSHUFB if possible.
-static
-SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- MVT VT = SVOp->getSimpleValueType(0);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
-
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
- bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
-
- // VPSHUFB may be generated if
- // (1) one of input vector is undefined or zeroinitializer.
- // The mask value 0x80 puts 0 in the corresponding slot of the vector.
- // And (2) the mask indexes don't cross the 128-bit lane.
- if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
- (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
- return SDValue();
-
- if (V1IsAllZero && !V2IsAllZero) {
- CommuteVectorShuffleMask(MaskVals, 32);
- V1 = V2;
- }
- return getPSHUFB(MaskVals, V1, dl, DAG);
-}
-
-/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
-/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
-/// done when every pair / quad of shuffle mask elements point to elements in
-/// the right sequence. e.g.
-/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
-static
-SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
- SelectionDAG &DAG) {
- MVT VT = SVOp->getSimpleValueType(0);
- SDLoc dl(SVOp);
- unsigned NumElems = VT.getVectorNumElements();
- MVT NewVT;
- unsigned Scale;
- switch (VT.SimpleTy) {
- default: llvm_unreachable("Unexpected!");
- case MVT::v2i64:
- case MVT::v2f64:
- return SDValue(SVOp, 0);
- case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
- case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
- case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
- case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
- case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
- case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
- }
-
- SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i != NumElems; i += Scale) {
- int StartIdx = -1;
- for (unsigned j = 0; j != Scale; ++j) {
- int EltIdx = SVOp->getMaskElt(i+j);
- if (EltIdx < 0)
- continue;
- if (StartIdx < 0)
- StartIdx = (EltIdx / Scale);
- if (EltIdx != (int)(StartIdx*Scale + j))
- return SDValue();
- }
- MaskVec.push_back(StartIdx);
- }
-
- SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
- SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
- return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
-}
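// [Editor's sketch -- not part of the diff] RewriteAsNarrowerShuffle succeeds
// when every group of Scale mask entries selects Scale consecutive,
// group-aligned source elements. A hedged sketch of that test for Scale == 2
// on an 8-element mask (name is hypothetical):
static bool narrowMaskByTwo(const int Mask[8], int Narrow[4]) {
  for (int i = 0; i != 4; ++i) {
    int A = Mask[2 * i], B = Mask[2 * i + 1];
    int Start = A >= 0 ? A / 2 : (B >= 0 ? B / 2 : -1);
    if (A >= 0 && A != Start * 2)
      return false;
    if (B >= 0 && B != Start * 2 + 1)
      return false;
    Narrow[i] = Start; // e.g. v8i16 <2,3,10,11,0,1,14,15> -> v4i32 <1,5,0,7>
  }
  return true;
}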
-
-/// getVZextMovL - Return a zero-extending vector move low node.
-///
-static SDValue getVZextMovL(MVT VT, MVT OpVT,
- SDValue SrcOp, SelectionDAG &DAG,
- const X86Subtarget *Subtarget, SDLoc dl) {
- if (VT == MVT::v2f64 || VT == MVT::v4f32) {
- LoadSDNode *LD = nullptr;
- if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
- LD = dyn_cast<LoadSDNode>(SrcOp);
- if (!LD) {
- // movssrr and movsdrr do not clear top bits. Try to use movd, movq
- // instead.
- MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
- if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
- SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
- SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
- SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
- // PR2108
- OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- OpVT,
- SrcOp.getOperand(0)
- .getOperand(0))));
- }
- }
- }
-
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
- DAG.getNode(ISD::BITCAST, dl,
- OpVT, SrcOp)));
-}
-
-/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
-/// which could not be matched by any known target specific shuffle
-static SDValue
-LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
-
- SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
- if (NewOp.getNode())
- return NewOp;
-
- MVT VT = SVOp->getSimpleValueType(0);
-
- unsigned NumElems = VT.getVectorNumElements();
- unsigned NumLaneElems = NumElems / 2;
-
- SDLoc dl(SVOp);
- MVT EltVT = VT.getVectorElementType();
- MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
- SDValue Output[2];
-
- SmallVector<int, 16> Mask;
- for (unsigned l = 0; l < 2; ++l) {
- // Build a shuffle mask for the output, discovering on the fly which
- // input vectors to use as shuffle operands (recorded in InputUsed).
- // If building a suitable shuffle vector proves too hard, then bail
- // out with UseBuildVector set.
- bool UseBuildVector = false;
- int InputUsed[2] = { -1, -1 }; // Not yet discovered.
- unsigned LaneStart = l * NumLaneElems;
- for (unsigned i = 0; i != NumLaneElems; ++i) {
- // The mask element. This indexes into the input.
- int Idx = SVOp->getMaskElt(i+LaneStart);
- if (Idx < 0) {
- // the mask element does not index into any input vector.
- Mask.push_back(-1);
- continue;
- }
-
- // The input vector this mask element indexes into.
- int Input = Idx / NumLaneElems;
-
- // Turn the index into an offset from the start of the input vector.
- Idx -= Input * NumLaneElems;
-
- // Find or create a shuffle vector operand to hold this input.
- unsigned OpNo;
- for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
- if (InputUsed[OpNo] == Input)
- // This input vector is already an operand.
- break;
- if (InputUsed[OpNo] < 0) {
- // Create a new operand for this input vector.
- InputUsed[OpNo] = Input;
- break;
- }
- }
-
- if (OpNo >= array_lengthof(InputUsed)) {
- // More than two input vectors used! Give up on trying to create a
- // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
- UseBuildVector = true;
- break;
- }
-
- // Add the mask index for the new shuffle vector.
- Mask.push_back(Idx + OpNo * NumLaneElems);
- }
-
- if (UseBuildVector) {
- SmallVector<SDValue, 16> SVOps;
- for (unsigned i = 0; i != NumLaneElems; ++i) {
- // The mask element. This indexes into the input.
- int Idx = SVOp->getMaskElt(i+LaneStart);
- if (Idx < 0) {
- SVOps.push_back(DAG.getUNDEF(EltVT));
- continue;
- }
-
- // The input vector this mask element indexes into.
- int Input = Idx / NumElems;
-
- // Turn the index into an offset from the start of the input vector.
- Idx -= Input * NumElems;
-
- // Extract the vector element by hand.
- SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
- SVOp->getOperand(Input),
- DAG.getIntPtrConstant(Idx)));
- }
-
- // Construct the output using a BUILD_VECTOR.
- Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
- } else if (InputUsed[0] < 0) {
- // No input vectors were used! The result is undefined.
- Output[l] = DAG.getUNDEF(NVT);
- } else {
- SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
- (InputUsed[0] % 2) * NumLaneElems,
- DAG, dl);
- // If only one input was used, use an undefined vector for the other.
- SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
- Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
- (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
- // At least one input vector was used. Create a new shuffle vector.
- Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
- }
-
- Mask.clear();
- }
-
- // Concatenate the result back
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
-}
-
-/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
-/// 4 elements, and match them with several different shuffle types.
-static SDValue
-LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- SDLoc dl(SVOp);
- MVT VT = SVOp->getSimpleValueType(0);
-
- assert(VT.is128BitVector() && "Unsupported vector size");
-
- std::pair<int, int> Locs[4];
- int Mask1[] = { -1, -1, -1, -1 };
- SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
-
- unsigned NumHi = 0;
- unsigned NumLo = 0;
- for (unsigned i = 0; i != 4; ++i) {
- int Idx = PermMask[i];
- if (Idx < 0) {
- Locs[i] = std::make_pair(-1, -1);
- } else {
- assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
- if (Idx < 4) {
- Locs[i] = std::make_pair(0, NumLo);
- Mask1[NumLo] = Idx;
- NumLo++;
- } else {
- Locs[i] = std::make_pair(1, NumHi);
- if (2+NumHi < 4)
- Mask1[2+NumHi] = Idx;
- NumHi++;
- }
- }
- }
-
- if (NumLo <= 2 && NumHi <= 2) {
- // No more than two elements come from either vector. This can be
- // implemented with two shuffles. The first shuffle gathers the elements.
- // The second shuffle, which takes the first shuffle as both of its
- // vector operands, puts the elements into the right order.
- V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
-
- int Mask2[] = { -1, -1, -1, -1 };
-
- for (unsigned i = 0; i != 4; ++i)
- if (Locs[i].first != -1) {
- unsigned Idx = (i < 2) ? 0 : 4;
- Idx += Locs[i].first * 2 + Locs[i].second;
- Mask2[i] = Idx;
- }
-
- return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
- }
-
- if (NumLo == 3 || NumHi == 3) {
- // Otherwise, we must have three elements from one vector, call it X, and
- // one element from the other, call it Y. First, use a shufps to build an
- // intermediate vector with the one element from Y and the element from X
- // that will be in the same half in the final destination (the indexes don't
- // matter). Then, use a shufps to build the final vector, taking the half
- // containing the element from Y from the intermediate, and the other half
- // from X.
- if (NumHi == 3) {
- // Normalize it so the 3 elements come from V1.
- CommuteVectorShuffleMask(PermMask, 4);
- std::swap(V1, V2);
- }
-
- // Find the element from V2.
- unsigned HiIndex;
- for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
- int Val = PermMask[HiIndex];
- if (Val < 0)
- continue;
- if (Val >= 4)
- break;
- }
-
- Mask1[0] = PermMask[HiIndex];
- Mask1[1] = -1;
- Mask1[2] = PermMask[HiIndex^1];
- Mask1[3] = -1;
- V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
-
- if (HiIndex >= 2) {
- Mask1[0] = PermMask[0];
- Mask1[1] = PermMask[1];
- Mask1[2] = HiIndex & 1 ? 6 : 4;
- Mask1[3] = HiIndex & 1 ? 4 : 6;
- return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
- }
-
- Mask1[0] = HiIndex & 1 ? 2 : 0;
- Mask1[1] = HiIndex & 1 ? 0 : 2;
- Mask1[2] = PermMask[2];
- Mask1[3] = PermMask[3];
- if (Mask1[2] >= 0)
- Mask1[2] += 4;
- if (Mask1[3] >= 0)
- Mask1[3] += 4;
- return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
- }
-
- // Break it into (shuffle shuffle_hi, shuffle_lo).
- int LoMask[] = { -1, -1, -1, -1 };
- int HiMask[] = { -1, -1, -1, -1 };
-
- int *MaskPtr = LoMask;
- unsigned MaskIdx = 0;
- unsigned LoIdx = 0;
- unsigned HiIdx = 2;
- for (unsigned i = 0; i != 4; ++i) {
- if (i == 2) {
- MaskPtr = HiMask;
- MaskIdx = 1;
- LoIdx = 0;
- HiIdx = 2;
- }
- int Idx = PermMask[i];
- if (Idx < 0) {
- Locs[i] = std::make_pair(-1, -1);
- } else if (Idx < 4) {
- Locs[i] = std::make_pair(MaskIdx, LoIdx);
- MaskPtr[LoIdx] = Idx;
- LoIdx++;
- } else {
- Locs[i] = std::make_pair(MaskIdx, HiIdx);
- MaskPtr[HiIdx] = Idx;
- HiIdx++;
- }
- }
-
- SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
- SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
- int MaskOps[] = { -1, -1, -1, -1 };
- for (unsigned i = 0; i != 4; ++i)
- if (Locs[i].first != -1)
- MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
- return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
-}
-
-static bool MayFoldVectorLoad(SDValue V) {
- while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
- V = V.getOperand(0);
-
- if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
- V = V.getOperand(0);
- if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
- V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
- // BUILD_VECTOR (load), undef
- V = V.getOperand(0);
-
- return MayFoldLoad(V);
-}
-
-static
-SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
-
- // Canonicalize to v2f64.
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
- return DAG.getNode(ISD::BITCAST, dl, VT,
- getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
- V1, DAG));
-}
-
-static
-SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
- bool HasSSE2) {
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getSimpleValueType();
-
- assert(VT != MVT::v2i64 && "unsupported shuffle type");
-
- if (HasSSE2 && VT == MVT::v2f64)
- return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
-
- // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
- return DAG.getNode(ISD::BITCAST, dl, VT,
- getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
- DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
- DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
-}
-
-static
-SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getSimpleValueType();
-
- assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
- "unsupported shuffle type");
-
- if (V2.getOpcode() == ISD::UNDEF)
- V2 = V1;
-
- // v4i32 or v4f32
- return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
-}
-
-static
-SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getSimpleValueType();
- unsigned NumElems = VT.getVectorNumElements();
-
- // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
- // operand of these instructions is only memory, so check if there's a
- // potential load folding here, otherwise use SHUFPS or MOVSD to match the
- // same masks.
- bool CanFoldLoad = false;
-
- // Trivial case, when V2 comes from a load.
- if (MayFoldVectorLoad(V2))
- CanFoldLoad = true;
-
- // When V1 is a load, it can be folded later into a store in isel, example:
- // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
- // turns into:
- // (MOVLPSmr addr:$src1, VR128:$src2)
- // So, recognize this potential and also use MOVLPS or MOVLPD
- else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
- CanFoldLoad = true;
-
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- if (CanFoldLoad) {
- if (HasSSE2 && NumElems == 2)
- return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
-
- if (NumElems == 4)
- // If we don't care about the second element, proceed to use movss.
- if (SVOp->getMaskElt(1) != -1)
- return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
- }
-
- // movl and movlp will both match v2i64, but v2i64 is never matched by
- // movl earlier because we make it strict to avoid messing with the movlp load
- // folding logic (see the code above getMOVLP call). Match it here then,
- // this is horrible, but will stay like this until we move all shuffle
- // matching to x86 specific nodes. Note that for the 1st condition all
- // types are matched with movsd.
- if (HasSSE2) {
- // FIXME: isMOVLMask should be checked and matched before getMOVLP,
- // as to remove this logic from here, as much as possible
- if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
- return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
- return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
- }
-
- assert(VT != MVT::v4i32 && "unsupported shuffle type");
-
- // Invert the operand order and use SHUFPS to match it.
- return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
- getShuffleSHUFImmediate(SVOp), DAG);
-}
-
-static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
- SelectionDAG &DAG) {
- SDLoc dl(Load);
- MVT VT = Load->getSimpleValueType(0);
- MVT EVT = VT.getVectorElementType();
- SDValue Addr = Load->getOperand(1);
- SDValue NewAddr = DAG.getNode(
- ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
- DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
-
- SDValue NewLoad =
- DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
- DAG.getMachineFunction().getMachineMemOperand(
- Load->getMemOperand(), 0, EVT.getStoreSize()));
- return NewLoad;
-}
-
-// It is only safe to call this function if isINSERTPSMask is true for
-// this shufflevector mask.
-static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
- SelectionDAG &DAG) {
- // Generate an insertps instruction when inserting an f32 from memory onto a
- // v4f32 or when copying a member from one v4f32 to another.
- // We also use it for transferring i32 from one register to another,
- // since it simply copies the same bits.
- // If we're transferring an i32 from memory to a specific element in a
- // register, we output a generic DAG that will match the PINSRD
- // instruction.
- MVT VT = SVOp->getSimpleValueType(0);
- MVT EVT = VT.getVectorElementType();
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
- auto Mask = SVOp->getMask();
- assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
- "unsupported vector type for insertps/pinsrd");
-
- auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
- auto FromV2Predicate = [](const int &i) { return i >= 4; };
- int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
-
- SDValue From;
- SDValue To;
- unsigned DestIndex;
- if (FromV1 == 1) {
- From = V1;
- To = V2;
- DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
- Mask.begin();
-
- // If we have 1 element from each vector, we have to check if we're
- // changing V1's element's place. If so, we're done. Otherwise, we
- // should assume we're changing V2's element's place and behave
- // accordingly.
- int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
- assert(DestIndex <= INT32_MAX && "truncated destination index");
- if (FromV1 == FromV2 &&
- static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
- From = V2;
- To = V1;
- DestIndex =
- std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
- }
- } else {
- assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
- "More than one element from V1 and from V2, or no elements from one "
- "of the vectors. This case should not have returned true from "
- "isINSERTPSMask");
- From = V2;
- To = V1;
- DestIndex =
- std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
- }
-
- // Get an index into the source vector in the range [0,4) (the mask is
- // in the range [0,8) because it can address V1 and V2)
- unsigned SrcIndex = Mask[DestIndex] % 4;
- if (MayFoldLoad(From)) {
- // Trivial case, when From comes from a load and is only used by the
- // shuffle. Make it use insertps from the vector that we need from that
- // load.
- SDValue NewLoad =
- NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
- if (!NewLoad.getNode())
- return SDValue();
-
- if (EVT == MVT::f32) {
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
- SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
- InsertpsMask);
- } else { // EVT == MVT::i32
- // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
- // instruction, to match the PINSRD instruction, which loads an i32 to a
- // certain vector element.
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
- DAG.getConstant(DestIndex, MVT::i32));
- }
- }
-
- // Vector-element-to-vector
- SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
-}
-
-// Reduce a vector shuffle to zext.
-static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- // PMOVZX is only available from SSE41.
- if (!Subtarget->hasSSE41())
- return SDValue();
-
- MVT VT = Op.getSimpleValueType();
-
- // Only AVX2 supports 256-bit vector integer extension.
- if (!Subtarget->hasInt256() && VT.is256BitVector())
- return SDValue();
-
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- SDLoc DL(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extending is a unary operation and the element type of the source vector
- // won't be equal to or larger than i64.
- if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
- VT.getVectorElementType() == MVT::i64)
- return SDValue();
-
- // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
- unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
- while ((1U << Shift) < NumElems) {
- if (SVOp->getMaskElt(1U << Shift) == 1)
- break;
- Shift += 1;
- // The maximal ratio is 8, i.e. from i8 to i64.
- if (Shift > 3)
- return SDValue();
- }
-
- // Check the shuffle mask.
- unsigned Mask = (1U << Shift) - 1;
- for (unsigned i = 0; i != NumElems; ++i) {
- int EltIdx = SVOp->getMaskElt(i);
- if ((i & Mask) != 0 && EltIdx != -1)
- return SDValue();
- if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
- return SDValue();
- }
-
- unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
- MVT NeVT = MVT::getIntegerVT(NBits);
- MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
-
- if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
- return SDValue();
-
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
-}
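// [Editor's sketch -- not part of the diff] The mask shape LowerVectorIntExtend
// accepts, written out standalone: after choosing an expansion factor of
// (1 << Shift), entry i must equal i >> Shift whenever i is a multiple of the
// factor, and must be undef everywhere else (name is hypothetical):
static bool isZExtShuffleMask(const int *Mask, unsigned NumElems,
                              unsigned Shift) {
  unsigned LowBits = (1u << Shift) - 1;
  for (unsigned i = 0; i != NumElems; ++i) {
    if ((i & LowBits) != 0) {
      if (Mask[i] != -1)
        return false; // positions that become zero-extension bits must be undef
    } else if (Mask[i] != (int)(i >> Shift)) {
      return false;   // defined positions must walk the source in order
    }
  }
  return true;
}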
-
-static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- MVT VT = Op.getSimpleValueType();
- SDLoc dl(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
-
- if (isZeroShuffle(SVOp))
- return getZeroVector(VT, Subtarget, DAG, dl);
-
- // Handle splat operations
- if (SVOp->isSplat()) {
- // Use vbroadcast whenever the splat comes from a foldable load
- SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
- if (Broadcast.getNode())
- return Broadcast;
- }
-
- // Check integer expanding shuffles.
- SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
-
- // If the shuffle can be profitably rewritten as a narrower shuffle, then
- // do it!
- if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
- VT == MVT::v32i8) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
- if (NewOp.getNode())
- return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
- } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
- // FIXME: Figure out a cleaner way to do this.
- if (ISD::isBuildVectorAllZeros(V2.getNode())) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
- if (NewOp.getNode()) {
- MVT NewVT = NewOp.getSimpleValueType();
- if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
- NewVT, true, false))
- return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
- dl);
- }
- } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
- if (NewOp.getNode()) {
- MVT NewVT = NewOp.getSimpleValueType();
- if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
- return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
- dl);
- }
- }
- }
- return SDValue();
-}
-
-SDValue
-X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getSimpleValueType();
- SDLoc dl(Op);
- unsigned NumElems = VT.getVectorNumElements();
- bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
- bool V1IsSplat = false;
- bool V2IsSplat = false;
- bool HasSSE2 = Subtarget->hasSSE2();
- bool HasFp256 = Subtarget->hasFp256();
- bool HasInt256 = Subtarget->hasInt256();
- MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
-
- // Check if we should use the experimental vector shuffle lowering. If so,
- // delegate completely to that code path.
- if (ExperimentalVectorShuffleLowering)
- return lowerVectorShuffle(Op, Subtarget, DAG);
-
- assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
-
- if (V1IsUndef && V2IsUndef)
- return DAG.getUNDEF(VT);
-
- // When we create a shuffle node we put the UNDEF node as the second operand,
- // but in some cases the first operand may be transformed to UNDEF.
- // In this case we should just commute the node.
- if (V1IsUndef)
- return DAG.getCommutedVectorShuffle(*SVOp);
-
- // Vector shuffle lowering takes 3 steps:
- //
- // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
- // narrowing and commutation of operands should be handled.
- // 2) Matching of shuffles with known shuffle masks to x86 target specific
- // shuffle nodes.
- // 3) Rewriting of unmatched masks into new generic shuffle operations,
- // so the shuffle can be broken into other shuffles and the legalizer can
- // try the lowering again.
- //
- // The general idea is that no vector_shuffle operation should be left to
- // be matched during isel, all of them must be converted to a target specific
- // node here.
-
- // Normalize the input vectors. Here splats, zeroed vectors, profitable
- // narrowing and commutation of operands should be handled. The actual code
- // doesn't include all of those, work in progress...
- SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
-
- SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
-
- // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
- // unpckh_undef). Only use pshufd if speed is more important than size.
- if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
- if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
- if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
- V2IsUndef && MayFoldVectorLoad(V1))
- return getMOVDDup(Op, dl, V1, DAG);
-
- if (isMOVHLPS_v_undef_Mask(M, VT))
- return getMOVHighToLow(Op, dl, DAG);
-
- // Use to match splats
- if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
- (VT == MVT::v2f64 || VT == MVT::v2i64))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
- if (isPSHUFDMask(M, VT)) {
- // The actual implementation will match the mask in the if above and then
- // during isel it can match several different instructions, not only pshufd
- // as its name says, sad but true, emulate the behavior for now...
- if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
- return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
-
- unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
-
- if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
- return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
-
- if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
- DAG);
-
- return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
- TargetMask, DAG);
- }
-
- if (isPALIGNRMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
- getShufflePALIGNRImmediate(SVOp),
- DAG);
-
- if (isVALIGNMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
- getShuffleVALIGNImmediate(SVOp),
- DAG);
-
- // Check if this can be converted into a logical shift.
- bool isLeft = false;
- unsigned ShAmt = 0;
- SDValue ShVal;
- bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
- if (isShift && ShVal.hasOneUse()) {
- // If the shifted value has multiple uses, it may be cheaper to use
- // v_set0 + movlhps or movhlps, etc.
- MVT EltVT = VT.getVectorElementType();
- ShAmt *= EltVT.getSizeInBits();
- return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
- }
-
- if (isMOVLMask(M, VT)) {
- if (ISD::isBuildVectorAllZeros(V1.getNode()))
- return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
- if (!isMOVLPMask(M, VT)) {
- if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
-
- if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
- }
- }
-
- // FIXME: fold these into legal mask.
- if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
- return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
-
- if (isMOVHLPSMask(M, VT))
- return getMOVHighToLow(Op, dl, DAG);
-
- if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
-
- if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
-
- if (isMOVLPMask(M, VT))
- return getMOVLP(Op, dl, DAG, HasSSE2);
-
- if (ShouldXformToMOVHLPS(M, VT) ||
- ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
- return DAG.getCommutedVectorShuffle(*SVOp);
-
- if (isShift) {
- // No better options. Use a vshldq / vsrldq.
- MVT EltVT = VT.getVectorElementType();
- ShAmt *= EltVT.getSizeInBits();
- return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
- }
-
- bool Commuted = false;
- // FIXME: This should also accept a bitcast of a splat? Be careful, not
- // 1,1,1,1 -> v8i16 though.
- BitVector UndefElements;
- if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
- if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
- V1IsSplat = true;
- if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
- if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
- V2IsSplat = true;
-
- // Canonicalize the splat or undef, if present, to be on the RHS.
- if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
- CommuteVectorShuffleMask(M, NumElems);
- std::swap(V1, V2);
- std::swap(V1IsSplat, V2IsSplat);
- Commuted = true;
- }
-
- if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
- // Shuffling low element of v1 into undef, just return v1.
- if (V2IsUndef)
- return V1;
- // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
- // the instruction selector will not match, so get a canonical MOVL with
- // swapped operands to undo the commute.
- return getMOVL(DAG, dl, VT, V2, V1);
- }
-
- if (isUNPCKLMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-
- if (isUNPCKHMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
-
- if (V2IsSplat) {
- // Normalize the mask so all entries that point to V2 point to its first
- // element, then try to match unpck{h|l} again. If it matches, return a
- // new vector_shuffle with the corrected mask.
- SmallVector<int, 8> NewMask(M.begin(), M.end());
- NormalizeMask(NewMask, NumElems);
- if (isUNPCKLMask(NewMask, VT, HasInt256, true))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
- if (isUNPCKHMask(NewMask, VT, HasInt256, true))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
- }
-
- if (Commuted) {
- // Commute it back and try unpck* again.
- // FIXME: this seems wrong.
- CommuteVectorShuffleMask(M, NumElems);
- std::swap(V1, V2);
- std::swap(V1IsSplat, V2IsSplat);
-
- if (isUNPCKLMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-
- if (isUNPCKHMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
- }
-
- // Normalize the node to match x86 shuffle ops if needed
- if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
- return DAG.getCommutedVectorShuffle(*SVOp);
-
- // The checks below are all present in isShuffleMaskLegal, but they are
- // inlined here right now to enable us to directly emit target specific
- // nodes, and remove one by one until they don't return Op anymore.
-
- if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
- SVOp->getSplatIndex() == 0 && V2IsUndef) {
- if (VT == MVT::v2f64 || VT == MVT::v2i64)
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
- }
-
- if (isPSHUFHWMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
- getShufflePSHUFHWImmediate(SVOp),
- DAG);
-
- if (isPSHUFLWMask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
- getShufflePSHUFLWImmediate(SVOp),
- DAG);
-
- unsigned MaskValue;
- if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
- &MaskValue))
- return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
-
- if (isSHUFPMask(M, VT))
- return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
- getShuffleSHUFImmediate(SVOp), DAG);
-
- if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
- if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
- return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
-
- //===--------------------------------------------------------------------===//
- // Generate target specific nodes for 128 or 256-bit shuffles only
- // supported in the AVX instruction set.
- //
-
- // Handle VMOVDDUPY permutations
- if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
- return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
-
- // Handle VPERMILPS/D* permutations
- if (isVPERMILPMask(M, VT)) {
- if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
- return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
- getShuffleSHUFImmediate(SVOp), DAG);
- return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
- getShuffleSHUFImmediate(SVOp), DAG);
- }
-
- unsigned Idx;
- if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
- return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
- Idx*(NumElems/2), DAG, dl);
-
- // Handle VPERM2F128/VPERM2I128 permutations
- if (isVPERM2X128Mask(M, VT, HasFp256))
- return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
- V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
-
- if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
- return getINSERTPS(SVOp, dl, DAG);
-
- unsigned Imm8;
- if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
- return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
-
- if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
- VT.is512BitVector()) {
- MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
- MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
- SmallVector<SDValue, 16> permclMask;
- for (unsigned i = 0; i != NumElems; ++i) {
- permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
- }
-
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
- if (V2IsUndef)
- // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
- return DAG.getNode(X86ISD::VPERMV, dl, VT,
- DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
- return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
- DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
- }
-
- //===--------------------------------------------------------------------===//
- // Since no target specific shuffle was selected for this generic one,
- // lower it into other known shuffles. FIXME: this isn't true yet, but
- // this is the plan.
- //
-
- // Handle v8i16 specifically since SSE can do byte extraction and insertion.
- if (VT == MVT::v8i16) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
- }
-
- if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
- if (NewOp.getNode())
- return NewOp;
- }
-
- if (VT == MVT::v16i8) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
- }
-
- if (VT == MVT::v32i8) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
- if (NewOp.getNode())
- return NewOp;
- }
-
- // Handle all 128-bit wide vectors with 4 elements, and match them with
- // several different shuffle types.
- if (NumElems == 4 && VT.is128BitVector())
- return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
-
- // Handle general 256-bit shuffles
- if (VT.is256BitVector())
- return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
-
- return SDValue();
-}
-
// This function assumes its argument is a BUILD_VECTOR of constants or
// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
// true.
@@ -12674,48 +10508,29 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
return true;
}
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- MVT EltVT = VT.getVectorElementType();
- unsigned NumElems = VT.getVectorNumElements();
-
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return SDValue();
-
- if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
- return SDValue();
- if (!Subtarget->hasInt256() && VT == MVT::v16i16)
- return SDValue();
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
+ auto *CondBV = cast<BuildVectorSDNode>(Cond);
- // Check the mask for BLEND and build the value.
- unsigned MaskValue = 0;
- if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
- return SDValue();
-
- // Convert i32 vectors to floating point if it is not AVX2.
- // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
- MVT BlendVT = VT;
- if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
- BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
- NumElems);
- LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
- RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+ // Only non-legal VSELECTs reach this lowering; convert those into generic
+ // shuffles and reuse the shuffle lowering path for blends.
+ SmallVector<int, 32> Mask;
+ for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
+ SDValue CondElt = CondBV->getOperand(i);
+ Mask.push_back(
+ isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
}
-
- SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
- DAG.getConstant(MaskValue, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+ return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
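[Editorial aside, not part of the patch] The mask construction in lowerVSELECTtoVectorShuffle above is easy to model outside of SelectionDAG: lane i keeps LHS (index i) when the constant condition lane is non-zero and takes RHS (index i + NumElts) when it is zero; a non-constant lane becomes undef (-1). The standalone C++ sketch below reproduces that rule; the "negative value means non-constant lane" encoding in the input is an assumption made only for this example.

// Standalone illustration of the blend-to-shuffle mask construction above.
#include <cstdio>
#include <vector>

static std::vector<int> buildBlendShuffleMask(const std::vector<int> &Cond) {
  int Size = (int)Cond.size();
  std::vector<int> Mask;
  for (int i = 0; i < Size; ++i) {
    if (Cond[i] < 0)             // hypothetical "not a constant" lane
      Mask.push_back(-1);        // becomes undef in the shuffle mask
    else
      Mask.push_back(Cond[i] ? i : i + Size);
  }
  return Mask;
}

int main() {
  // v4i32 select with condition <true, false, true, false>.
  std::vector<int> Cond = {1, 0, 1, 0};
  for (int M : buildBlendShuffleMask(Cond))
    std::printf("%d ", M);       // prints: 0 5 2 7
  std::printf("\n");
  return 0;
}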
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -12726,28 +10541,40 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
- SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
- if (BlendOp.getNode())
+ // Try to lower this to a blend-style vector shuffle. This can handle all
+ // constant condition cases.
+ if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
- // Some types for vselect were previously set to Expand, not Legal or
- // Custom. Return an empty SDValue so we fall-through to Expand, after
- // the Custom lowering phase.
- MVT VT = Op.getSimpleValueType();
- switch (VT.SimpleTy) {
+ // Variable blends are only legal from SSE4.1 onward.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ // Only some types will be legal on some subtargets. If we can emit a legal
+ // VSELECT-matching blend, return Op, but if we need to expand, return
+ // a null value.
+ switch (Op.getSimpleValueType().SimpleTy) {
default:
- break;
+ // Most of the vector types have blends past SSE4.1.
+ return Op;
+
+ case MVT::v32i8:
+ // The byte blends for AVX vectors were introduced only in AVX2.
+ if (Subtarget->hasAVX2())
+ return Op;
+
+ return SDValue();
+
case MVT::v8i16:
case MVT::v16i16:
+ // AVX-512 BWI and VLX features support VSELECT with i16 elements.
if (Subtarget->hasBWI() && Subtarget->hasVLX())
- break;
+ return Op;
+
+ // FIXME: We should custom lower this by fixing the condition and using i8
+ // blends.
return SDValue();
}
-
- // We couldn't create a "Blend with immediate" node.
- // This node should still be legal, but we'll have to emit a blendv*
- // instruction.
- return Op;
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
@@ -12769,12 +10596,11 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
// If Idx is 0, it's cheaper to do a move instead of a pextrw.
if (Idx == 0)
- return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getNode(ISD::BITCAST, dl,
- MVT::v4i32,
- Op.getOperand(0)),
- Op.getOperand(1)));
+ return DAG.getNode(
+ ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+ Op.getOperand(1)));
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
@@ -12798,10 +10624,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
User->getValueType(0) != MVT::i32))
return SDValue();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
- Op.getOperand(0)),
- Op.getOperand(1));
- return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
+ DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
+ Op.getOperand(1));
+ return DAG.getBitcast(MVT::f32, Extract);
}
if (VT == MVT::i32 || VT == MVT::i64) {
@@ -12823,6 +10648,8 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
MVT EltVT = Op.getSimpleValueType();
assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+ "Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
// extend vector to VR512
@@ -12836,13 +10663,15 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+ rc = getRegClassFor(MVT::v16i1);
unsigned MaxSift = rc->getSize()*8 - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
- DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+ DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
- DAG.getConstant(MaxSift, MVT::i8));
+ DAG.getConstant(MaxSift, dl, MVT::i8));
return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, dl));
}
SDValue
@@ -12869,10 +10698,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
getZeroVector(MaskVT, Subtarget, DAG, dl),
- Idx, DAG.getConstant(0, getPointerTy()));
+ Idx, DAG.getConstant(0, dl, getPointerTy()));
SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
- Perm, DAG.getConstant(0, getPointerTy()));
+ Perm, DAG.getConstant(0, dl, getPointerTy()));
}
return SDValue();
}
@@ -12892,7 +10721,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// IdxVal -= NumElems/2;
IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- DAG.getConstant(IdxVal, MVT::i32));
+ DAG.getConstant(IdxVal, dl, MVT::i32));
}
assert(VecVT.is128BitVector() && "Unexpected vector length");
@@ -12911,8 +10740,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (Idx == 0)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getNode(ISD::BITCAST, dl,
- MVT::v4i32, Vec),
+ DAG.getBitcast(MVT::v4i32, Vec),
Op.getOperand(1)));
// Transform it so it matches pextrw, which produces a 32-bit result.
MVT EltVT = MVT::i32;
@@ -12934,7 +10762,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() == 64) {
@@ -12953,7 +10781,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, dl));
}
return SDValue();
@@ -12982,15 +10810,11 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+ if (IdxVal)
+ EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
if (Vec.getOpcode() == ISD::UNDEF)
- return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
- DAG.getConstant(IdxVal, MVT::i8));
- const TargetRegisterClass* rc = getRegClassFor(VecVT);
- unsigned MaxSift = rc->getSize()*8 - 1;
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
- DAG.getConstant(MaxSift, MVT::i8));
- EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
- DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+ return EltInVec;
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
@@ -13014,17 +10838,31 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
// into that, and then insert the subvector back into the result.
if (VT.is256BitVector() || VT.is512BitVector()) {
- // Get the desired 128-bit vector half.
+ // With a 256-bit vector, we can insert into the zero element efficiently
+ // using a blend if we have AVX or AVX2 and the right data type.
+ if (VT.is256BitVector() && IdxVal == 0) {
+ // TODO: It is worthwhile to cast integer to floating point and back
+ // and incur a domain crossing penalty if that's what we'll end up
+ // doing anyway after extracting to a 128-bit vector.
+ if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+ (Subtarget->hasAVX2() && EltVT == MVT::i32)) {
+ SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ N2 = DAG.getIntPtrConstant(1, dl);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
+ }
+ }
+
+ // Get the desired 128-bit vector chunk.
SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
- // Insert the element into the desired half.
+ // Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
- DAG.getConstant(IdxIn128, MVT::i32));
+ DAG.getConstant(IdxIn128, dl, MVT::i32));
- // Insert the changed part back to the 256-bit vector
+ // Insert the changed part back into the bigger vector
return Insert128BitVector(N0, V, IdxVal, DAG, dl);
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
@@ -13044,22 +10882,35 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
if (N1.getValueType() != MVT::i32)
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal);
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
}
if (EltVT == MVT::f32) {
- // Bits [7:6] of the constant are the source select. This will always be
- // zero here. The DAG Combiner may combine an extract_elt index into
- // these
- // bits. For example (insert (extract, 3), 2) could be matched by
- // putting
- // the '3' into bits [7:6] of X86ISD::INSERTPS.
- // Bits [5:4] of the constant are the destination select. This is the
- // value of the incoming immediate.
- // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into
+ // these bits. For example (insert (extract, 3), 2) could be matched by
+ // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
- N2 = DAG.getIntPtrConstant(IdxVal << 4);
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+ if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ // If this is an insertion of 32-bits into the low 32-bits of
+ // a vector, we prefer to generate a blend with immediate rather
+ // than an insertps. Blends are simpler operations in hardware and so
+ // will always have equal or better performance than insertps.
+ // But if optimizing for size and there's a load folding opportunity,
+ // generate insertps because blendps does not have a 32-bit memory
+ // operand form.
+ N2 = DAG.getIntPtrConstant(1, dl);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ }
+ N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
// Create this as a scalar to vector.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
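[Editorial aside, not part of the patch] The comment above documents the INSERTPS immediate layout; the lowering only ever varies the destination lane (IdxVal << 4), and the IdxVal == 0 fast path switches to BLENDPS with immediate 1 instead. A standalone sketch of the packing, purely for illustration:

// Pack the INSERTPS immediate: source lane in bits [7:6], destination lane
// in bits [5:4], zero mask in bits [3:0].
#include <cstdio>

static unsigned insertpsImm(unsigned SrcLane, unsigned DstLane,
                            unsigned ZeroMask) {
  return ((SrcLane & 3) << 6) | ((DstLane & 3) << 4) | (ZeroMask & 0xf);
}

int main() {
  // Inserting a scalar into lane 2 of a v4f32 uses immediate 2 << 4 = 0x20.
  std::printf("0x%02x\n", insertpsImm(/*SrcLane=*/0, /*DstLane=*/2,
                                      /*ZeroMask=*/0));  // prints 0x20
  // Lane 0 without a foldable load prefers BLENDPS with immediate 0x1
  // (one bit per lane), which is what the IdxVal == 0 branch above emits.
  return 0;
}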
@@ -13080,7 +10931,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
if (N1.getValueType() != MVT::i32)
N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal);
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
}
return SDValue();
@@ -13110,8 +10961,8 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
assert(OpVT.is128BitVector() && "Expected an SSE type!");
- return DAG.getNode(ISD::BITCAST, dl, OpVT,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
+ return DAG.getBitcast(
+ OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
@@ -13145,25 +10996,76 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256()) {
- SDLoc dl(Op.getNode());
- SDValue Vec = Op.getNode()->getOperand(0);
- SDValue SubVec = Op.getNode()->getOperand(1);
- SDValue Idx = Op.getNode()->getOperand(2);
-
- if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
- Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
- SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // Fold two 16-byte subvector loads into one 32-byte load:
+ // (insert_subvector (insert_subvector undef, (load addr), 0),
+ // (load addr + 16), Elts/2)
+ // --> load32 addr
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+ !Subtarget->isUnalignedMem32Slow()) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+ if (Idx2->getZExtValue() == 0) {
+ SDValue Ops[] = { SubVec2, SubVec };
+ SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+ if (LD.getNode())
+ return LD;
+ }
}
+ }
- if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
- SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+ SubVecVT.is128BitVector())
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
+ return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (OpVT.getVectorElementType() == MVT::i1) {
+ if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal
+ return Op;
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+ SDValue Undef = DAG.getUNDEF(OpVT);
+ unsigned NumElems = OpVT.getVectorNumElements();
+ SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8);
+
+ if (IdxVal == OpVT.getVectorNumElements() / 2) {
+ // Zero upper bits of the Vec
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+
+ SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+ SubVec, ZeroIdx);
+ Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
+ }
+ if (IdxVal == 0) {
+ SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
+ SubVec, ZeroIdx);
+ // Zero upper bits of the Vec2
+ Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits);
+ Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits);
+ // Zero lower bits of the Vec
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits);
+ // Merge them together
+ return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2);
}
}
return SDValue();
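[Editorial aside, not part of the patch] The i1-element path above merges a mask-register subvector using pairs of shifts plus an OR. The same bit manipulation can be modeled on a plain integer; in the sketch below a uint16_t stands in for a v16i1 mask register (bit i = element i) and the 8-bit subvector insertion mirrors the two IdxVal cases handled above.

#include <cstdint>
#include <cstdio>

static uint16_t insertUpperHalf(uint16_t Vec, uint16_t Sub) {
  const unsigned Shift = 8;
  // Zero the upper bits of Vec (the VSHLI then VSRLI pair in the lowering).
  uint16_t Lo = (uint16_t)((uint16_t)(Vec << Shift) >> Shift);
  // Move the subvector into the upper half and merge with OR.
  return (uint16_t)(Lo | (uint16_t)(Sub << Shift));
}

static uint16_t insertLowerHalf(uint16_t Vec, uint16_t Sub) {
  const unsigned Shift = 8;
  // Keep only the low 8 bits of Sub and only the high 8 bits of Vec.
  uint16_t Lo = (uint16_t)((uint16_t)(Sub << Shift) >> Shift);
  uint16_t Hi = (uint16_t)((uint16_t)(Vec >> Shift) << Shift);
  return (uint16_t)(Hi | Lo);
}

int main() {
  std::printf("%04x\n", insertUpperHalf(0xABCD, 0x12)); // prints 12cd
  std::printf("%04x\n", insertLowerHalf(0xABCD, 0x12)); // prints ab12
  return 0;
}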
@@ -13356,7 +11258,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
// addition for it.
if (Offset != 0)
Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
- DAG.getConstant(Offset, getPointerTy()));
+ DAG.getConstant(Offset, dl, getPointerTy()));
return Result;
}
@@ -13471,7 +11373,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
is64Bit ? 257 : 256));
SDValue ThreadPointer =
- DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
MachinePointerInfo(Ptr), false, false, false, 0);
unsigned char OperandFlags = 0;
@@ -13523,7 +11425,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
-
switch (model) {
case TLSModel::GeneralDynamic:
if (Subtarget->is64Bit())
@@ -13613,30 +11514,36 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue TlsArray =
Subtarget->is64Bit()
- ? DAG.getIntPtrConstant(0x58)
+ ? DAG.getIntPtrConstant(0x58, dl)
: (Subtarget->isTargetWindowsGNU()
- ? DAG.getIntPtrConstant(0x2C)
+ ? DAG.getIntPtrConstant(0x2C, dl)
: DAG.getExternalSymbol("_tls_array", getPointerTy()));
SDValue ThreadPointer =
DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
MachinePointerInfo(Ptr), false, false, false, 0);
- // Load the _tls_index variable
- SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
- if (Subtarget->is64Bit())
- IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
- IDX, MachinePointerInfo(), MVT::i32,
- false, false, false, 0);
- else
- IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
- false, false, false, 0);
+ SDValue res;
+ if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
+ res = ThreadPointer;
+ } else {
+ // Load the _tls_index variable
+ SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+ if (Subtarget->is64Bit())
+ IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX,
+ MachinePointerInfo(), MVT::i32, false, false,
+ false, 0);
+ else
+ IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
+ false, false, false, 0);
- SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
- getPointerTy());
- IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+ SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), dl,
+ getPointerTy());
+ IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+
+ res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+ }
- SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
false, false, false, 0);
@@ -13669,10 +11576,10 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
// generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
// during isel.
SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
- DAG.getConstant(VTBits - 1, MVT::i8));
+ DAG.getConstant(VTBits - 1, dl, MVT::i8));
SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
- DAG.getConstant(VTBits - 1, MVT::i8))
- : DAG.getConstant(0, VT);
+ DAG.getConstant(VTBits - 1, dl, MVT::i8))
+ : DAG.getConstant(0, dl, VT);
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
@@ -13687,12 +11594,12 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
// rely on the results of shld/shrd. Insert a test and select the appropriate
// values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
- DAG.getConstant(VTBits, MVT::i8));
+ DAG.getConstant(VTBits, dl, MVT::i8));
SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- AndNode, DAG.getConstant(0, MVT::i8));
+ AndNode, DAG.getConstant(0, dl, MVT::i8));
SDValue Hi, Lo;
- SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
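[Editorial aside, not part of the patch] The double-width shift assembled above can be modeled on scalars. The sketch below mirrors the SHL_PARTS case for a 64-bit shift split into 32-bit halves: the masking with 31 matches the AND with VTBits-1, the funnel expression plays the role of SHLD, and the (Amt & 32) branch stands in for the test-and-select emitted for shift amounts of VTBits or more.

#include <cstdint>
#include <cstdio>

static void shl64parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                       uint32_t &OutLo, uint32_t &OutHi) {
  unsigned Safe = Amt & 31;
  // What SHLD produces: high half shifted left, filled from the low half.
  uint32_t Tmp2 = (Safe == 0) ? Hi : (Hi << Safe) | (Lo >> (32 - Safe));
  uint32_t Tmp3 = Lo << Safe;
  if (Amt & 32) {        // large shift: low result half becomes zero
    OutHi = Tmp3;
    OutLo = 0;
  } else {
    OutHi = Tmp2;
    OutLo = Tmp3;
  }
}

int main() {
  uint32_t Lo, Hi;
  shl64parts(0x89ABCDEF, 0x01234567, 8, Lo, Hi);
  std::printf("%08x%08x\n", (unsigned)Hi, (unsigned)Lo); // 23456789abcdef00
  return 0;
}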
@@ -13847,14 +11754,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
- SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
- CLod0);
+ SDValue Unpck1 =
+ getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
- SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
+ SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
@@ -13862,16 +11768,15 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
// FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
- SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+ SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
S2F, 0x4E, DAG);
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
- DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
- Sub);
+ DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
}
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, dl));
}
// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
@@ -13879,7 +11784,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
// FP constant to bias correct the final result.
- SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
// Load the 32-bit value into an XMM register.
@@ -13890,20 +11795,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
- DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
- DAG.getIntPtrConstant(0));
+ DAG.getBitcast(MVT::v2f64, Load),
+ DAG.getIntPtrConstant(0, dl));
// Or the load with the bias.
- SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- MVT::v2f64, Load)),
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- MVT::v2f64, Bias)));
- Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
- DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
- DAG.getIntPtrConstant(0));
+ SDValue Or = DAG.getNode(
+ ISD::OR, dl, MVT::v2i64,
+ DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
+ DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
+ Or =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
// Subtract the bias.
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
@@ -13913,7 +11817,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
if (DestVT.bitsLT(MVT::f64))
return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, dl));
if (DestVT.bitsGT(MVT::f64))
return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
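[Editorial aside, not part of the patch] The bias constant 0x4330000000000000 above is the bit pattern of 2^52, which is what makes the OR-and-subtract sequence work: the 32-bit payload lands in the low mantissa bits of that double, and subtracting 2^52 leaves exactly (double)u. A scalar model of the arithmetic (the actual lowering performs the OR and FSUB in XMM registers):

#include <cstdint>
#include <cstdio>
#include <cstring>

static double uint32ToDouble(uint32_t U) {
  uint64_t Bits = 0x4330000000000000ULL | U;  // the OR done in vector regs
  double D;
  std::memcpy(&D, &Bits, sizeof(D));          // bitcast, like v2i64 -> v2f64
  return D - 4503599627370496.0;              // subtract 2^52 (the Bias)
}

int main() {
  std::printf("%.1f\n", uint32ToDouble(0xFFFFFFFFu)); // prints 4294967295.0
  return 0;
}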
@@ -13958,20 +11862,20 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// -- v >> 16
// Create the splat vector for 0x4b000000.
- SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
+ SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32);
SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
CstLow, CstLow, CstLow, CstLow};
SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
makeArrayRef(&CstLowArray[0], NumElts));
// Create the splat vector for 0x53000000.
- SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
+ SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32);
SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
CstHigh, CstHigh, CstHigh, CstHigh};
SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
makeArrayRef(&CstHighArray[0], NumElts));
// Create the right shift.
- SDValue CstShift = DAG.getConstant(16, MVT::i32);
+ SDValue CstShift = DAG.getConstant(16, DL, MVT::i32);
SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
CstShift, CstShift, CstShift, CstShift};
SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
@@ -13982,25 +11886,22 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
if (Subtarget.hasSSE41()) {
EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
- SDValue VecCstLowBitcast =
- DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
- SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
+ SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
+ SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
- VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
+ VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
- SDValue VecCstHighBitcast =
- DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
- SDValue VecShiftBitcast =
- DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
+ SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
+ SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
- VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
+ VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
} else {
- SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
+ SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32);
SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
CstMask, CstMask, CstMask);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
@@ -14013,18 +11914,18 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
SDValue CstFAdd = DAG.getConstantFP(
- APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
+ APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32);
SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
CstFAdd, CstFAdd, CstFAdd, CstFAdd};
SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
makeArrayRef(&CstFAddArray[0], NumElts));
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
- SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
+ SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
SDValue FHigh =
DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
// return (float4) lo + fhi;
- SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
+ SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
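[Editorial aside, not part of the patch] The constants in lowerUINT_TO_FP_vXi32 are the bit patterns of 2^23 (0x4b000000) and 2^39 (0x53000000): each 16-bit half of the input is grafted onto a known exponent, and the combined bias is folded back out with one FP add. A scalar model of the per-lane arithmetic:

#include <cstdint>
#include <cstdio>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static float uint32ToFloat(uint32_t V) {
  float Lo = bitsToFloat((V & 0xffffu) | 0x4b000000u); // 2^23 + low half
  float Hi = bitsToFloat((V >> 16) | 0x53000000u);     // 2^39 + 2^16 * high half
  float FHi = Hi - (549755813888.0f + 8388608.0f);     // remove 2^39 + 2^23
  return FHi + Lo;                                     // rounds once at most
}

int main() {
  std::printf("%.1f\n", uint32ToFloat(0xFFFFFFFFu)); // 4294967296.0 (rounded)
  std::printf("%.1f\n", uint32ToFloat(1000000u));    // 1000000.0 (exact)
  return 0;
}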
@@ -14048,6 +11949,11 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
case MVT::v4i32:
case MVT::v8i32:
return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
+ case MVT::v16i8:
+ case MVT::v16i16:
+ if (Subtarget->hasAVX512())
+ return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
}
llvm_unreachable(nullptr);
}
@@ -14078,13 +11984,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Make a 64-bit buffer, and use it to build an FILD.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
- SDValue WordOff = DAG.getConstant(4, getPointerTy());
+ SDValue WordOff = DAG.getConstant(4, dl, getPointerTy());
SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
getPointerTy(), StackSlot, WordOff);
SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
StackSlot, MachinePointerInfo(),
false, false, 0);
- SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+ SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MachinePointerInfo(),
false, false, 0);
SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
@@ -14116,8 +12022,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(dl,
getSetCCResultType(*DAG.getContext(), MVT::i64),
- Op.getOperand(0), DAG.getConstant(0, MVT::i64),
- ISD::SETLT);
+ Op.getOperand(0),
+ DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
// Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
SDValue FudgePtr = DAG.getConstantPool(
@@ -14125,8 +12031,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
getPointerTy());
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
- SDValue Zero = DAG.getIntPtrConstant(0);
- SDValue Four = DAG.getIntPtrConstant(4);
+ SDValue Zero = DAG.getIntPtrConstant(0, dl);
+ SDValue Four = DAG.getIntPtrConstant(4, dl);
SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
Zero, Four);
FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
@@ -14138,7 +12044,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MVT::f32, false, false, false, 4);
// Extend everything to 80 bits to force it to be done on x87.
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
- return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
+ DAG.getIntPtrConstant(0, dl));
}
std::pair<SDValue,SDValue>
@@ -14241,6 +12148,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
+ if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1)
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
+
// Optimize vectors in AVX mode:
//
// v8i16 -> v8i32
@@ -14271,41 +12181,36 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements()/2);
- OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+ OpLo = DAG.getBitcast(HVT, OpLo);
+ OpHi = DAG.getBitcast(HVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
- SelectionDAG &DAG) {
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
SDLoc DL(Op);
unsigned int NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16)
+ if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
return SDValue();
if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
- EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // Now we have only mask extension
assert(InVT.getVectorElementType() == MVT::i1);
- SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
- const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
- SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
-
- SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+ MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+ SDValue One =
+ DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
+ SDValue Zero =
+ DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
+
+ SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
if (VT.is512BitVector())
- return Brcst;
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+ return V;
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, V);
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
@@ -14327,7 +12232,7 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
MVT SVT = In.getSimpleValueType();
if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
- return LowerZERO_EXTEND_AVX512(Op, DAG);
+ return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
if (Subtarget->hasFp256()) {
SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
@@ -14357,6 +12262,23 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
+ // move vector to mask - truncate solution for SKX
+ if (VT.getVectorElementType() == MVT::i1) {
+ if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 &&
+ Subtarget->hasBWI())
+ return Op; // legal, will go to VPMOVB2M, VPMOVW2M
+ if ((InVT.is256BitVector() || InVT.is128BitVector())
+ && InVT.getScalarSizeInBits() <= 16 &&
+ Subtarget->hasBWI() && Subtarget->hasVLX())
+ return Op; // legal, will go to VPMOVB2M, VPMOVW2M
+ if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 &&
+ Subtarget->hasDQI())
+ return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
+ if ((InVT.is256BitVector() || InVT.is128BitVector())
+ && InVT.getScalarSizeInBits() >= 32 &&
+ Subtarget->hasDQI() && Subtarget->hasVLX())
+ return Op; // legal, will go to VPMOVD2M, VPMOVQ2M
+ }
if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
if (VT.getVectorElementType().getSizeInBits() >= 8)
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
@@ -14370,14 +12292,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
InVT = ExtVT;
}
- SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
- const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
- SDValue CP = DAG.getConstantPool(C, getPointerTy());
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
- SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+ SDValue OneV =
+ DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT);
SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
}
@@ -14386,19 +12302,19 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget->hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
- In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+ In = DAG.getBitcast(MVT::v8i32, In);
In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, DL));
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
- DAG.getIntPtrConstant(2));
- OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+ DAG.getIntPtrConstant(2, DL));
+ OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+ OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
static const int ShufMask[] = {0, 2, 4, 6};
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
}
@@ -14406,41 +12322,41 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget->hasInt256()) {
- In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+ In = DAG.getBitcast(MVT::v32i8, In);
SmallVector<SDValue,32> pshufbMask;
for (unsigned i = 0; i < 2; ++i) {
- pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
for (unsigned j = 0; j < 8; ++j)
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
}
SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
- In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+ In = DAG.getBitcast(MVT::v4i64, In);
static const int ShufMask[] = {0, 2, -1, -1};
In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
&ShufMask[0]);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
- DAG.getIntPtrConstant(0));
- return DAG.getNode(ISD::BITCAST, DL, VT, In);
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getBitcast(VT, In);
}
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, DL));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
- DAG.getIntPtrConstant(4));
+ DAG.getIntPtrConstant(4, DL));
- OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+ OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
+ OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
// The PSHUFB mask:
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
@@ -14450,13 +12366,13 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
- OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+ OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
+ OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
// The MOVLHPS Mask:
static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+ return DAG.getBitcast(MVT::v8i16, res);
}
// Handle truncation of V256 to V128 using shuffles.
@@ -14472,11 +12388,10 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// Prepare truncation shuffle mask
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
- SDValue V = DAG.getVectorShuffle(NVT, DL,
- DAG.getNode(ISD::BITCAST, DL, NVT, In),
+ SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
DAG.getUNDEF(NVT), &MaskVec[0]);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
@@ -14582,13 +12497,12 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
// For a vector, cast operands to a vector type, perform the logic op,
// and cast the result back to the original value type.
MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
- SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
- SDValue Operand = IsFNABS ?
- DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
- DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
+ SDValue MaskCasted = DAG.getBitcast(VecVT, Mask);
+ SDValue Operand = IsFNABS ? DAG.getBitcast(VecVT, Op0.getOperand(0))
+ : DAG.getBitcast(VecVT, Op0);
unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
+ return DAG.getBitcast(VT,
+ DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
}
// If not vector, then scalar.
@@ -14613,7 +12527,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
}
// And if it is bigger, shrink it first.
if (SrcVT.bitsGT(VT)) {
- Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+ Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
SrcVT = VT;
}
@@ -14672,8 +12586,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
// Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
- DAG.getConstant(1, VT));
- return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+ DAG.getConstant(1, dl, VT));
+ return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT));
}
// Check whether an OR'd tree is PTEST-able.
@@ -14753,7 +12667,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
// Cast all vectors into TestVT for PTEST.
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
- VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+ VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
// If more than one full vectors are evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
@@ -14791,11 +12705,11 @@ static bool hasNonFlagsUse(SDValue Op) {
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- // KORTEST instruction should be selected
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
- DAG.getConstant(0, Op.getValueType()));
-
+ if (Op.getValueType() == MVT::i1) {
+ SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+ DAG.getConstant(0, dl, MVT::i8));
+ }
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
@@ -14817,9 +12731,8 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
case ISD::SUB:
case ISD::MUL:
case ISD::SHL: {
- const BinaryWithFlagsSDNode *BinNode =
- cast<BinaryWithFlagsSDNode>(Op.getNode());
- if (BinNode->hasNoSignedWrap())
+ const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
+ if (BinNode->Flags.hasNoSignedWrap())
break;
}
default:
@@ -14838,7 +12751,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
// return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
// DAG.getConstant(0, MVT::i1));
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
- DAG.getConstant(0, Op.getValueType()));
+ DAG.getConstant(0, dl, Op.getValueType()));
}
unsigned Opcode = 0;
unsigned NumOperands = 0;
@@ -14926,7 +12839,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
if (!Mask.isSignedIntN(32)) // Avoid large immediates.
break;
SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
- DAG.getConstant(Mask, VT));
+ DAG.getConstant(Mask, dl, VT));
DAG.ReplaceAllUsesWith(Op, New);
Op = New;
}
@@ -15012,12 +12925,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
if (Opcode == 0)
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
- DAG.getConstant(0, Op.getValueType()));
+ DAG.getConstant(0, dl, Op.getValueType()));
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SmallVector<SDValue, 4> Ops;
- for (unsigned i = 0; i != NumOperands; ++i)
- Ops.push_back(Op.getOperand(i));
+ SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesWith(Op, New);
@@ -15043,8 +12954,8 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
// if we're optimizing for size, however, as that'll allow better folding
// of memory operations.
if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
- !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::MinSize) &&
+ !DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::MinSize) &&
!Subtarget->isAtom()) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
@@ -15079,7 +12990,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
- DAG.getConstant(8, MVT::i8));
+ DAG.getConstant(8, dl, MVT::i8));
SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
@@ -15090,29 +13001,31 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps,
bool &UseOneConstNR) const {
- // FIXME: We should use instruction latency models to calculate the cost of
- // each potential sequence, but this is very hard to do reliably because
- // at least Intel's Core* chips have variable timing based on the number of
- // significant digits in the divisor and/or sqrt operand.
- if (!Subtarget->useSqrtEst())
- return SDValue();
-
EVT VT = Op.getValueType();
+ const char *RecipOp;
- // SSE1 has rsqrtss and rsqrtps.
+ // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
- (Subtarget->hasAVX() && VT == MVT::v8f32)) {
- RefinementSteps = 1;
- UseOneConstNR = false;
- return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
- }
- return SDValue();
+ if (VT == MVT::f32 && Subtarget->hasSSE1())
+ RecipOp = "sqrtf";
+ else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget->hasAVX()))
+ RecipOp = "vec-sqrtf";
+ else
+ return SDValue();
+
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ RefinementSteps = Recips.getRefinementSteps(RecipOp);
+ UseOneConstNR = false;
+ return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
}
/// The minimum architected relative accuracy is 2^-12. We need one
@@ -15120,15 +13033,9 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
DAGCombinerInfo &DCI,
unsigned &RefinementSteps) const {
- // FIXME: We should use instruction latency models to calculate the cost of
- // each potential sequence, but this is very hard to do reliably because
- // at least Intel's Core* chips have variable timing based on the number of
- // significant digits in the divisor.
- if (!Subtarget->useReciprocalEst())
- return SDValue();
-
EVT VT = Op.getValueType();
-
+ const char *RecipOp;
+
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
// TODO: Add support for AVX512 (v16f32).
// It is likely not profitable to do this for f64 because a double-precision
@@ -15136,12 +13043,30 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
// 15 instructions: convert to single, rcpss, convert back to double, refine
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
- if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
- (Subtarget->hasAVX() && VT == MVT::v8f32)) {
- RefinementSteps = ReciprocalEstimateRefinementSteps;
- return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
- }
- return SDValue();
+ if (VT == MVT::f32 && Subtarget->hasSSE1())
+ RecipOp = "divf";
+ else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
+ (VT == MVT::v8f32 && Subtarget->hasAVX()))
+ RecipOp = "vec-divf";
+ else
+ return SDValue();
+
+ TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+ if (!Recips.isEnabled(RecipOp))
+ return SDValue();
+
+ RefinementSteps = Recips.getRefinementSteps(RecipOp);
+ return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+}
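[Editorial aside, not part of the patch] The RefinementSteps value returned by these hooks tells the generic combiner how many Newton-Raphson iterations to wrap around the FRCP / FRSQRT estimate. A standalone sketch of one iteration for each case; the starting estimates below are hand-picked stand-ins for the roughly 12-bit accurate hardware results, not real RCPPS/RSQRTPS outputs.

#include <cstdio>

static float refineRecip(float D, float X0) {
  // x1 = x0 * (2 - d * x0): roughly doubles the number of correct bits.
  return X0 * (2.0f - D * X0);
}

static float refineRsqrt(float D, float X0) {
  // x1 = x0 * (1.5 - 0.5 * d * x0 * x0)
  return X0 * (1.5f - 0.5f * D * X0 * X0);
}

int main() {
  float D = 3.0f;
  std::printf("%.7f\n", refineRecip(D, 0.33f));   // close to 0.3333333
  std::printf("%.7f\n", refineRsqrt(D, 0.577f));  // close to 0.5773503
  return 0;
}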
+
+/// If we have at least two divisions that use the same divisor, convert to
+/// multiplication by a reciprocal. This may need to be adjusted for a given
+/// CPU if a division's cost is not at least twice the cost of a multiplication.
+/// This is because we still need one division to calculate the reciprocal and
+/// then we need two multiplies by that reciprocal as replacements for the
+/// original divisions.
+bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+ return NumUsers > 1;
}
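[Editorial aside, not part of the patch] A minimal sketch of the rewrite this hook enables; the actual transform is done by the generic DAG combiner, and only under relaxed FP semantics, since reassociating through a reciprocal changes rounding.

#include <cstdio>

static void scaleBoth(double A, double B, double D, double &X, double &Y) {
  double R = 1.0 / D;  // the single remaining division
  X = A * R;           // was A / D
  Y = B * R;           // was B / D
}

int main() {
  double X, Y;
  scaleBoth(10.0, 20.0, 4.0, X, Y);
  std::printf("%g %g\n", X, Y);   // prints: 2.5 5
  return 0;
}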
static bool isAllOnes(SDValue V) {
@@ -15192,7 +13117,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
// Use BT if the immediate can't be encoded in a TEST instruction.
if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
LHS = AndLHS;
- RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
+ RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
}
}
@@ -15214,7 +13139,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(Cond, MVT::i8), BT);
+ DAG.getConstant(Cond, dl, MVT::i8), BT);
}
return SDValue();
@@ -15295,6 +13220,49 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
+static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getSimpleValueType();
+ SDLoc dl(Op);
+
+ assert(Op0.getValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected type for boolean compare operation");
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
+ DAG.getConstant(-1, dl, VT));
+ SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
+ DAG.getConstant(-1, dl, VT));
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE:
+ // (x != y) -> ~(x ^ y)
+ return DAG.getNode(ISD::XOR, dl, VT,
+ DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
+ DAG.getConstant(-1, dl, VT));
+ case ISD::SETEQ:
+ // (x == y) -> (x ^ y)
+ return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ // (x > y) -> (x & ~y)
+ return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
+ case ISD::SETULT:
+ case ISD::SETLT:
+ // (x < y) -> (~x & y)
+ return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
+ case ISD::SETULE:
+ case ISD::SETLE:
+ // (x <= y) -> (~x | y)
+ return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
+ case ISD::SETUGE:
+ case ISD::SETGE:
+ // (x >= y) -> (x | ~y)
+ return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
+ }
+}
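[Editorial aside, not part of the patch] The ordering rewrites used above hold for single-bit lanes and can be checked exhaustively; in the sketch below a plain bool models one i1 mask lane.

#include <cstdio>

int main() {
  bool AllOk = true;
  for (int Xi = 0; Xi <= 1; ++Xi)
    for (int Yi = 0; Yi <= 1; ++Yi) {
      bool X = Xi, Y = Yi;
      AllOk &= ((X > Y)  == (X && !Y));   // SETGT  -> x & ~y
      AllOk &= ((X < Y)  == (!X && Y));   // SETLT  -> ~x & y
      AllOk &= ((X <= Y) == (!X || Y));   // SETLE  -> ~x | y
      AllOk &= ((X >= Y) == (X || !Y));   // SETGE  -> x | ~y
    }
  std::printf("%s\n", AllOk ? "all identities hold" : "mismatch");
  return 0;
}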
+
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDValue Op0 = Op.getOperand(0);
@@ -15332,7 +13300,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(Opc, dl, VT, Op0, Op1);
Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, MVT::i8));
+ DAG.getConstant(SSECC, dl, MVT::i8));
}
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
@@ -15359,7 +13327,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
if (Val == 0)
return SDValue();
- ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
+ ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
}
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
@@ -15399,22 +13367,25 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(CC0, MVT::i8));
+ DAG.getConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(CC1, MVT::i8));
+ DAG.getConstant(CC1, dl, MVT::i8));
return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
}
// Handle all other FP comparisons here.
return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, MVT::i8));
+ DAG.getConstant(SSECC, dl, MVT::i8));
}
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget->hasInt256())
return Lower256IntVSETCC(Op, DAG);
- bool MaskResult = (VT.getVectorElementType() == MVT::i1);
EVT OpVT = Op1.getValueType();
+ if (OpVT.getVectorElementType() == MVT::i1)
+ return LowerBoolVSETCC_AVX512(Op, DAG);
+
+ bool MaskResult = (VT.getVectorElementType() == MVT::i1);
if (Subtarget->hasAVX512()) {
if (Op1.getValueType().is512BitVector() ||
(Subtarget->hasBWI() && Subtarget->hasVLX()) ||
@@ -15516,18 +13487,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
assert(Subtarget->hasSSE2() && "Don't know how to lower!");
// First cast everything to the right type.
- Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
- Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
SDValue SB;
if (FlipSigns) {
- SB = DAG.getConstant(0x80000000U, MVT::v4i32);
+ SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
} else {
- SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
- SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
+ SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
+ SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
Sign, Zero, Sign, Zero);
}
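
The sign-bit flip above exists because SSE2 only provides signed packed compares (PCMPGT*); XOR-ing both operands with 0x80000000 turns an unsigned comparison into an equivalent signed one. A standalone C++ illustration of that identity (my own sketch, not LLVM code; it assumes the usual two's-complement conversion from uint32_t to int32_t):

    #include <cassert>
    #include <cstdint>

    // SSE2 only has signed packed compares. Flipping the sign bit of both
    // operands turns an unsigned comparison into a signed one:
    //   a <u b   <=>   (int32_t)(a ^ 0x80000000) < (int32_t)(b ^ 0x80000000)
    static bool unsignedLess(uint32_t A, uint32_t B) { return A < B; }
    static bool signedLessAfterFlip(uint32_t A, uint32_t B) {
      return (int32_t)(A ^ 0x80000000u) < (int32_t)(B ^ 0x80000000u);
    }

    int main() {
      const uint32_t Vals[] = {0u, 1u, 0x7fffffffu, 0x80000000u, 0x80000001u,
                               0xfffffffeu, 0xffffffffu};
      for (uint32_t A : Vals)
        for (uint32_t B : Vals)
          assert(unsignedLess(A, B) == signedLessAfterFlip(A, B));
      return 0;
    }
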
@@ -15551,7 +13522,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
- return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ return DAG.getBitcast(VT, Result);
}
if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
@@ -15560,8 +13531,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
// First cast everything to the right type.
- Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
- Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
// Do the compare.
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
@@ -15574,7 +13545,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (Invert)
Result = DAG.getNOT(dl, Result, MVT::v4i32);
- return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ return DAG.getBitcast(VT, Result);
}
}
@@ -15582,7 +13553,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// bits of the inputs before performing those operations.
if (FlipSigns) {
EVT EltVT = VT.getVectorElementType();
- SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
+ SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
+ VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
}
@@ -15650,7 +13622,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
CCode = X86::GetOppositeBranchCondition(CCode);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(CCode, MVT::i8),
+ DAG.getConstant(CCode, dl, MVT::i8),
Op0.getOperand(1));
if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
@@ -15662,18 +13634,18 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
- return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
+ return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
}
bool isFP = Op1.getSimpleValueType().isFloatingPoint();
- unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+ unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
if (X86CC == X86::COND_INVALID)
return SDValue();
SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+ DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
return SetCC;
@@ -15724,9 +13696,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op1.getValueType();
SDValue CC;
- // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
- // are available. Otherwise fp cmovs get lowered into a less efficient branch
- // sequence later on.
+ // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+ // are available, or into a VBLENDV if AVX is available.
+ // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
if (Cond.getOpcode() == ISD::SETCC &&
((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
(Subtarget->hasSSE1() && VT == MVT::f32)) &&
@@ -15738,17 +13710,85 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SSECC != 8) {
if (Subtarget->hasAVX512()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
- DAG.getConstant(SSECC, MVT::i8));
+ DAG.getConstant(SSECC, DL, MVT::i8));
return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
}
+
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
- DAG.getConstant(SSECC, MVT::i8));
+ DAG.getConstant(SSECC, DL, MVT::i8));
+
+ // If we have AVX, we can use a variable vector select (VBLENDV) instead
+ // of 3 logic instructions for size savings and potentially speed.
+ // Unfortunately, there is no scalar form of VBLENDV.
+
+ // If either operand is a constant, don't try this. We can expect to
+ // optimize away at least one of the logic instructions later in that
+ // case, so that sequence would be faster than a variable blend.
+
+ // BLENDV was introduced with SSE 4.1, but the 2-register form implicitly
+ // uses XMM0 as the selection register. That may need just as many
+ // instructions as the AND/ANDN/OR sequence due to register moves, so
+ // don't bother.
+
+ if (Subtarget->hasAVX() &&
+ !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
+
+ // Convert to vectors, do a VSELECT, and convert back to scalar.
+ // All of the conversions should be optimized away.
+
+ EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
+ SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
+ SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
+ SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
+
+ EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
+ VCmp = DAG.getBitcast(VCmpVT, VCmp);
+
+ SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ VSel, DAG.getIntPtrConstant(0, DL));
+ }
SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
}
}
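
The FAND/FANDN/FOR sequence emitted just above is a branchless select: the compare yields an all-ones or all-zero bit pattern, and the result is (mask & op1) | (~mask & op2); the AVX VBLENDV path performs the same selection in a single instruction. A scalar C++ model of the three-instruction form (my own sketch, not LLVM code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Model of the FAND/FANDN/FOR select on a 32-bit float:
    // Mask is all-ones when the condition holds, all-zero otherwise.
    static float maskSelect(bool Cond, float A, float B) {
      uint32_t Mask = Cond ? 0xffffffffu : 0u;
      uint32_t ABits, BBits;
      std::memcpy(&ABits, &A, sizeof(float));
      std::memcpy(&BBits, &B, sizeof(float));
      uint32_t ResBits = (Mask & ABits) | (~Mask & BBits); // AND / ANDN / OR
      float Res;
      std::memcpy(&Res, &ResBits, sizeof(float));
      return Res;
    }

    int main() {
      assert(maskSelect(true, 1.5f, -2.25f) == 1.5f);
      assert(maskSelect(false, 1.5f, -2.25f) == -2.25f);
      return 0;
    }
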
+ if (VT.isVector() && VT.getScalarType() == MVT::i1) {
+ SDValue Op1Scalar;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
+ Op1Scalar = ConvertI1VectorToInterger(Op1, DAG);
+ else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
+ Op1Scalar = Op1.getOperand(0);
+ SDValue Op2Scalar;
+ if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
+ Op2Scalar = ConvertI1VectorToInterger(Op2, DAG);
+ else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
+ Op2Scalar = Op2.getOperand(0);
+ if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
+ SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
+ Op1Scalar.getValueType(),
+ Cond, Op1Scalar, Op2Scalar);
+ if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
+ return DAG.getBitcast(VT, newSelect);
+ SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+
+ if (VT == MVT::v4i1 || VT == MVT::v2i1) {
+ SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
+ Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
+ Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
+ SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
+ Cond, Op1, Op2);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
+ }
+
if (Cond.getOpcode() == ISD::SETCC) {
SDValue NewCond = LowerSETCC(Cond, DAG);
if (NewCond.getNode())
@@ -15779,21 +13819,22 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
(isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
- DAG.getConstant(0, CmpOp0.getValueType()),
+ DAG.getConstant(0, DL,
+ CmpOp0.getValueType()),
CmpOp0);
SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, MVT::i8),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
return Res;
}
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
- CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+ CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
if (isAllOnes(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
@@ -15865,7 +13906,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
else
Cond = X86Op.getValue(1);
- CC = DAG.getConstant(X86Cond, MVT::i8);
+ CC = DAG.getConstant(X86Cond, DL, MVT::i8);
addTest = false;
}
@@ -15887,7 +13928,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (addTest) {
- CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
}
@@ -15902,7 +13943,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, MVT::i8), Cond);
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ Cond);
if (isAllOnes(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
@@ -15931,7 +13973,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}
-static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
+ const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
@@ -15957,7 +14000,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
unsigned int NumElts = VT.getVectorNumElements();
- if (NumElts != 8 && NumElts != 16)
+ if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI())
return SDValue();
if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
@@ -15966,22 +14009,74 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
}
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+ MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
+ SDValue NegOne =
+ DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
+ ExtVT);
+ SDValue Zero =
+ DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+
+ SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ if (VT.is512BitVector())
+ return V;
+ return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
+}
+
+static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDValue In = Op->getOperand(0);
+ MVT VT = Op->getSimpleValueType(0);
+ MVT InVT = In.getSimpleValueType();
+ assert(VT.getSizeInBits() == InVT.getSizeInBits());
- MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
- Constant *C = ConstantInt::get(*DAG.getContext(),
- APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
+ MVT InSVT = InVT.getScalarType();
+ assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits());
- SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
- SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
- if (VT.is512BitVector())
- return Brcst;
- return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ return SDValue();
+ if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
+ return SDValue();
+
+ SDLoc dl(Op);
+
+ // SSE41 targets can use the pmovsx* instructions directly.
+ if (Subtarget->hasSSE41())
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+ // Pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
+ SDValue Curr = In;
+ MVT CurrVT = InVT;
+
+ // As SRAI is only available on i16/i32 types, we expand only up to i32
+ // and handle i64 separately.
+ while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) {
+ Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
+ MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
+ CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
+ Curr = DAG.getBitcast(CurrVT, Curr);
+ }
+
+ SDValue SignExt = Curr;
+ if (CurrVT != InVT) {
+ unsigned SignExtShift =
+ CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits();
+ SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+ DAG.getConstant(SignExtShift, dl, MVT::i8));
+ }
+
+ if (CurrVT == VT)
+ return SignExt;
+
+ if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
+ SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
+ DAG.getConstant(31, dl, MVT::i8));
+ SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
+ return DAG.getBitcast(VT, Ext);
+ }
+
+ return SDValue();
}
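
The pre-SSE4.1 path above sign-extends by unpacking each narrow lane into the high part of a wider lane and then shifting right arithmetically, so the sign bit is replicated into the new upper bits. The scalar version of the same trick, as a small C++ sketch (my own illustration, not LLVM code; it assumes two's complement and an arithmetic right shift, which is what VSRAI provides):

    #include <cassert>
    #include <cstdint>

    // Sign-extend an 8-bit value to 16 bits the way the UNPCKL + VSRAI
    // expansion does: place the byte in the high half, then shift it back
    // down with an arithmetic shift so the sign bit is replicated.
    static int16_t sextViaShift(uint8_t Byte) {
      uint16_t High = (uint16_t)(Byte) << 8; // byte now occupies bits 15..8
      return (int16_t)High >> 8;             // arithmetic shift fills the sign
    }

    int main() {
      for (int V = -128; V <= 127; ++V)
        assert(sextViaShift((uint8_t)V) == (int16_t)(int8_t)V);
      return 0;
    }
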
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
@@ -16039,6 +14134,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
@@ -16137,8 +14233,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
"Can only lower sext loads with a single scalar load!");
unsigned loadRegZize = RegSz;
- if (Ext == ISD::SEXTLOAD && RegSz == 256)
- loadRegZize /= 2;
+ if (Ext == ISD::SEXTLOAD && RegSz >= 256)
+ loadRegZize = 128;
// Represent our vector as a sequence of elements which are the
// largest scalar that we can load.
@@ -16161,7 +14257,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
SmallVector<SDValue, 8> Chains;
SDValue Ptr = Ld->getBasePtr();
SDValue Increment =
- DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
+ DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy());
SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
for (unsigned i = 0; i < NumLoads; ++i) {
@@ -16177,7 +14273,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
else
Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
- ScalarLoad, DAG.getIntPtrConstant(i));
+ ScalarLoad, DAG.getIntPtrConstant(i, dl));
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
}
@@ -16186,7 +14282,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
- SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
unsigned SizeRatio = RegSz / MemSz;
if (Ext == ISD::SEXTLOAD) {
@@ -16211,13 +14307,14 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
SDValue Shuff = DAG.getVectorShuffle(
WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
- Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+ Shuff = DAG.getBitcast(RegVT, Shuff);
// Build the arithmetic shift.
unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
MemVT.getVectorElementType().getSizeInBits();
Shuff =
- DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
+ DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
+ DAG.getConstant(Amt, dl, RegVT));
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Shuff;
@@ -16232,7 +14329,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
// Bitcast to the requested type.
- Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+ Shuff = DAG.getBitcast(RegVT, Shuff);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Shuff;
}
@@ -16384,7 +14481,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
else
Cond = X86Op.getValue(1);
- CC = DAG.getConstant(X86Cond, MVT::i8);
+ CC = DAG.getConstant(X86Cond, dl, MVT::i8);
addTest = false;
} else {
unsigned CondOpc;
@@ -16415,7 +14512,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, MVT::i8);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
@@ -16433,7 +14530,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, MVT::i8);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
@@ -16446,7 +14543,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, MVT::i8);
+ CC = DAG.getConstant(CCode, dl, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
} else if (Cond.getOpcode() == ISD::SETCC &&
@@ -16472,10 +14569,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
- CC = DAG.getConstant(X86::COND_P, MVT::i8);
+ CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
@@ -16502,10 +14599,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
- CC = DAG.getConstant(X86::COND_NP, MVT::i8);
+ CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
Cond = Cmp;
addTest = false;
Dest = FalseBB;
@@ -16533,7 +14630,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
- CC = DAG.getConstant(X86Cond, MVT::i8);
+ CC = DAG.getConstant(X86Cond, dl, MVT::i8);
Cond = EmitTest(Cond, X86Cond, dl, DAG);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
@@ -16570,23 +14667,23 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true),
SDLoc(Node));
SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
- DAG.getConstant(-(uint64_t)Align, VT));
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
- Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
- DAG.getIntPtrConstant(0, true), SDValue(),
+ Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(),
SDLoc(Node));
SDValue Ops[2] = { Tmp1, Tmp2 };
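
When the requested alignment exceeds the default stack alignment, the code above masks the decremented stack pointer with -Align, which for a power-of-two alignment rounds the address down to an aligned boundary. A minimal C++ check of that rounding (my own sketch, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // Round Value down to a multiple of the power-of-two Align, the same way
    // the dynamic-alloca lowering masks the stack pointer with -Align.
    static uint64_t alignDown(uint64_t Value, uint64_t Align) {
      return Value & -Align; // equivalent to Value & ~(Align - 1)
    }

    int main() {
      assert(alignDown(0x1003, 16) == 0x1000);
      assert(alignDown(0x1010, 16) == 0x1010);
      assert(alignDown(0x12345, 64) == 0x12340);
      return 0;
    }
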
@@ -16635,15 +14732,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
if (Align) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, VT));
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
@@ -16678,22 +14774,22 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
// Store gp_offset
SDValue Store = DAG.getStore(Op.getOperand(0), DL,
DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
- MVT::i32),
+ DL, MVT::i32),
FIN, MachinePointerInfo(SV), false, false, 0);
MemOps.push_back(Store);
// Store fp_offset
FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(4));
+ FIN, DAG.getIntPtrConstant(4, DL));
Store = DAG.getStore(Op.getOperand(0), DL,
- DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
+ DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
MVT::i32),
FIN, MachinePointerInfo(SV, 4), false, false, 0);
MemOps.push_back(Store);
// Store ptr to overflow_arg_area
FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(4));
+ FIN, DAG.getIntPtrConstant(4, DL));
SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
getPointerTy());
Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
@@ -16703,7 +14799,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
// Store ptr to reg_save_area.
FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
- FIN, DAG.getIntPtrConstant(8));
+ FIN, DAG.getIntPtrConstant(8, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
getPointerTy());
Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
@@ -16745,22 +14841,17 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
- assert(!DAG.getTarget().Options.UseSoftFloat &&
- !(DAG.getMachineFunction()
- .getFunction()->getAttributes()
- .hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat)) &&
+ assert(!Subtarget->useSoftFloat() &&
+ !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
// Insert VAARG_64 node into the DAG
// VAARG_64 returns two values: Variable Argument Address, Chain
- SmallVector<SDValue, 11> InstOps;
- InstOps.push_back(Chain);
- InstOps.push_back(SrcPtr);
- InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
- InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
- InstOps.push_back(DAG.getConstant(Align, MVT::i32));
+ SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
+ DAG.getConstant(ArgMode, dl, MVT::i8),
+ DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
VTs, InstOps, MVT::i64,
@@ -16791,8 +14882,8 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
SDLoc DL(Op);
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
- DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
- false,
+ DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
+ false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
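
LowerVASTART above writes the four fields of the System V x86-64 va_list at byte offsets 0, 4, 8 and 16, and LowerVACOPY copies the whole 24-byte object. The layout those offsets assume can be written down as a plain C++ struct (an illustration of the ABI structure, not an LLVM type; it assumes an LP64 x86-64 target):

    #include <cstddef>
    #include <cstdint>

    // System V x86-64 __va_list_tag layout assumed by LowerVASTART/LowerVACOPY.
    struct VaListTag {
      uint32_t gp_offset;      // offset 0:  next general-purpose register slot
      uint32_t fp_offset;      // offset 4:  next FP/XMM register slot
      void *overflow_arg_area; // offset 8:  stack area for spilled arguments
      void *reg_save_area;     // offset 16: register save area
    };

    static_assert(offsetof(VaListTag, gp_offset) == 0, "layout");
    static_assert(offsetof(VaListTag, fp_offset) == 4, "layout");
    static_assert(offsetof(VaListTag, overflow_arg_area) == 8, "layout");
    static_assert(offsetof(VaListTag, reg_save_area) == 16, "layout");
    static_assert(sizeof(VaListTag) == 24,
                  "matches the 24-byte memcpy in LowerVACOPY");

    int main() { return 0; }
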
@@ -16812,7 +14903,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
if (Opc == X86ISD::VSRAI)
ShiftAmt = ElementType.getSizeInBits() - 1;
else
- return DAG.getConstant(0, VT);
+ return DAG.getConstant(0, dl, VT);
}
assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
@@ -16837,7 +14928,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
}
ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
- Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
+ Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRLI:
@@ -16849,7 +14940,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
}
ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
- Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
+ Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
}
break;
case X86ISD::VSRAI:
@@ -16861,7 +14952,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
}
ND = cast<ConstantSDNode>(CurrentOp);
const APInt &C = ND->getAPIntValue();
- Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
+ Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
}
break;
}
@@ -16869,7 +14960,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
}
- return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
+ return DAG.getNode(Opc, dl, VT, SrcOp,
+ DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
// getTargetVShiftNode - Handle vector element shifts where the shift amount
@@ -16894,7 +14986,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
}
const X86Subtarget &Subtarget =
- DAG.getTarget().getSubtarget<X86Subtarget>();
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
// Let the shuffle legalizer expand this shift amount node.
@@ -16907,7 +14999,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
SmallVector<SDValue, 4> ShOps;
ShOps.push_back(ShAmt);
if (SVT == MVT::i32) {
- ShOps.push_back(DAG.getConstant(0, SVT));
+ ShOps.push_back(DAG.getConstant(0, dl, SVT));
ShOps.push_back(DAG.getUNDEF(SVT));
}
ShOps.push_back(DAG.getUNDEF(SVT));
@@ -16921,7 +15013,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
MVT EltVT = VT.getVectorElementType();
EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
- ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
+ ShAmt = DAG.getBitcast(ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
@@ -16947,8 +15039,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
- DAG.getIntPtrConstant(0));
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
switch (Op.getOpcode()) {
default: break;
@@ -16987,54 +15079,6 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
-static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- return X86ISD::FMADD;
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- return X86ISD::FMSUB;
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- return X86ISD::FNMADD;
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- return X86ISD::FNMSUB;
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- return X86ISD::FMADDSUB;
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
- return X86ISD::FMSUBADD;
- }
-}
-
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
@@ -17053,27 +15097,123 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Op.getOperand(2), Op.getOperand(3));
case INTR_TYPE_1OP_MASK_RM: {
SDValue Src = Op.getOperand(1);
- SDValue Src0 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
- SDValue RoundingMode = Op.getOperand(4);
+ SDValue RoundingMode;
+ if (Op.getNumOperands() == 4)
+ RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
+ else
+ RoundingMode = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(), Src, RoundingMode),
+ Mask, PassThru, Subtarget, DAG);
+ }
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
RoundingMode),
- Mask, Src0, Subtarget, DAG);
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_1OP_MASK: {
+ SDValue Src = Op.getOperand(1);
+ SDValue Passthru = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+ Mask, Passthru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src0 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- SDValue RoundingMode = Op.getOperand(5);
+ // There are 2 kinds of intrinsics in this group:
+ // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
+ // (2) With rounding mode and sae - 7 operands.
+ if (Op.getNumOperands() == 6) {
+ SDValue Sae = Op.getOperand(5);
+ unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
+ Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- RoundingMode),
+ RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
- Op.getOperand(2)),
- Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have a non-default rounding mode
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case INTR_TYPE_3OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have a non-default rounding mode
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(6);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have a non-default rounding mode
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, Src1, Subtarget, DAG);
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3),
+ Mask, Src1, Subtarget, DAG);
}
case CMP_MASK:
case CMP_MASK_CC: {
@@ -17094,30 +15234,46 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Mask.getValueType().getSizeInBits());
SDValue Cmp;
if (IntrData->Type == CMP_MASK_CC) {
- Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3));
+ SDValue CC = Op.getOperand(3);
+ CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have a non-default rounding mode
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC, Rnd);
+ }
+ // Default rounding mode.
+ if (!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), CC);
+
} else {
assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2));
+ Op.getOperand(2));
}
SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
- DAG.getTargetConstant(0, MaskVT),
+ DAG.getTargetConstant(0, dl,
+ MaskVT),
Subtarget, DAG);
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), CmpMask,
- DAG.getIntPtrConstant(0));
- return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+ unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG);
assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, MVT::i8), Cond);
+ DAG.getConstant(X86CC, dl, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case VSHIFT:
@@ -17143,8 +15299,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Mask.getValueType().getSizeInBits());
SDLoc dl(Op);
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
- DAG.getIntPtrConstant(0));
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
PassThru);
@@ -17158,21 +15314,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
Mask.getValueType().getSizeInBits());
SDLoc dl(Op);
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
- DAG.getIntPtrConstant(0));
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
Op.getOperand(2));
}
- case FMA_OP_MASK:
- {
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1),
- Subtarget, DAG);
- }
default:
break;
}
@@ -17188,16 +15334,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
- case Intrinsic::x86_avx512_mask_valign_q_512:
- case Intrinsic::x86_avx512_mask_valign_d_512:
- // Vector source operands are swapped.
- return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
- Op.getValueType(), Op.getOperand(2),
- Op.getOperand(1),
- Op.getOperand(3)),
- Op.getOperand(5), Op.getOperand(4),
- Subtarget, DAG);
-
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
@@ -17259,16 +15395,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDValue RHS = Op.getOperand(2);
unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
- SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+ SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
case Intrinsic::x86_avx512_kortestz_w:
case Intrinsic::x86_avx512_kortestc_w: {
unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
- SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
- SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
- SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
@@ -17333,7 +15469,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, MVT::i8),
+ DAG.getConstant(X86CC, dl, MVT::i8),
SDValue(PCMP.getNode(), 1));
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
@@ -17351,57 +15487,22 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
- auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
- if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
- return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
- dl, Op.getValueType(),
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1),
- Subtarget, DAG);
- else
- return SDValue();
- }
+ case Intrinsic::x86_seh_lsda: {
+ // Compute the symbol for the LSDA. We know it'll get emitted later.
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Op1 = Op.getOperand(1);
+ auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
+ MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
+ GlobalValue::getRealLinkageName(Fn->getName()));
+ StringRef Name = LSDASym->getName();
+ assert(Name.data()[Name.size()] == '\0' && "not null terminated");
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ // Generate a simple absolute symbol reference. This intrinsic is only
+ // supported on 32-bit Windows, which isn't PIC.
+ SDValue Result =
+ DAG.getTargetExternalSymbol(Name.data(), VT, X86II::MO_NOPREFIX);
+ return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
+ }
}
}
@@ -17412,17 +15513,17 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
- MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
else
- MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ MaskInReg = DAG.getBitcast(MaskVT, Mask);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
if (Src.getOpcode() == ISD::UNDEF)
Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
@@ -17438,17 +15539,17 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
- SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
- MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
else
- MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ MaskInReg = DAG.getBitcast(MaskVT, Mask);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -17461,17 +15562,17 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
- SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue MaskInReg;
ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
if (MaskC)
- MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT);
else
- MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ MaskInReg = DAG.getBitcast(MaskVT, Mask);
//SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
@@ -17510,7 +15611,7 @@ static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
// The EAX register is loaded with the low-order 32 bits. The EDX register
// is loaded with the supported high-order bits of the counter.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
- DAG.getConstant(32, MVT::i8));
+ DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return;
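
RDPMC (like RDTSC below) returns a 64-bit counter split across EDX and EAX, and the lowering above rebuilds it as (HI << 32) | LO. The same recombination in plain C++ (my own illustration, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // Reassemble a 64-bit counter from the EDX:EAX register pair the way the
    // RDTSC/RDPMC lowering does: shift the high half up and OR in the low half.
    static uint64_t combineHiLo(uint32_t Hi, uint32_t Lo) {
      return ((uint64_t)Hi << 32) | Lo;
    }

    int main() {
      assert(combineHiLo(0x00000001u, 0x00000000u) == 0x100000000ull);
      assert(combineHiLo(0xdeadbeefu, 0x12345678u) == 0xdeadbeef12345678ull);
      return 0;
    }
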
@@ -17564,7 +15665,7 @@ static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
// The EDX register is loaded with the high-order 32 bits of the MSR, and
// the EAX register is loaded with the low-order 32 bits.
SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
- DAG.getConstant(32, MVT::i8));
+ DAG.getConstant(32, DL, MVT::i8));
Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
Results.push_back(Chain);
return;
@@ -17609,8 +15710,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
- DAG.getConstant(1, Op->getValueType(1)),
- DAG.getConstant(X86::COND_B, MVT::i32),
+ DAG.getConstant(1, dl, Op->getValueType(1)),
+ DAG.getConstant(X86::COND_B, dl, MVT::i32),
SDValue(Result.getNode(), 1) };
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
DAG.getVTList(Op->getValueType(1), MVT::Glue),
@@ -17628,8 +15729,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Index = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
- Subtarget);
+ return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
+ Chain, Subtarget);
}
case SCATTER: {
//scatter(base, mask, index, v1, scale);
@@ -17639,14 +15740,13 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain);
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
- unsigned HintVal;
- if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
- (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
- llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+ unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
+ assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
@@ -17658,7 +15758,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
SmallVector<SDValue, 2> Results;
- getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
+ getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
+ Results);
return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
@@ -17672,7 +15773,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_NE, MVT::i8),
+ DAG.getConstant(X86::COND_NE, dl, MVT::i8),
InTrans);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
@@ -17684,14 +15785,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
- DAG.getConstant(-1, MVT::i8));
+ DAG.getConstant(-1, dl, MVT::i8));
SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
Op.getOperand(4), GenCF.getValue(1));
SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
Op.getOperand(5), MachinePointerInfo(),
false, false, 0);
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_B, MVT::i8),
+ DAG.getConstant(X86::COND_B, dl, MVT::i8),
Res.getValue(1));
Results.push_back(SetCC);
Results.push_back(Store);
@@ -17704,23 +15805,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Addr = Op.getOperand(2);
SDValue Chain = Op.getOperand(0);
+ EVT VT = DataToCompress.getValueType();
if (isAllOnes(Mask)) // return just a store
return DAG.getStore(Chain, dl, DataToCompress, Addr,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo(), false, false,
+ VT.getScalarSizeInBits()/8);
- EVT VT = DataToCompress.getValueType();
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorNumElements());
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Mask.getValueType().getSizeInBits());
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
- DAG.getIntPtrConstant(0));
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask,
DataToCompress, DAG.getUNDEF(VT));
return DAG.getStore(Chain, dl, Compressed, Addr,
- MachinePointerInfo(), false, false, 0);
+ MachinePointerInfo(), false, false,
+ VT.getScalarSizeInBits()/8);
}
case EXPAND_FROM_MEM: {
SDLoc dl(Op);
@@ -17732,22 +15835,22 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
if (isAllOnes(Mask)) // return just a load
return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
- false, 0);
+ false, VT.getScalarSizeInBits()/8);
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorNumElements());
EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Mask.getValueType().getSizeInBits());
SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
- DAG.getIntPtrConstant(0));
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
- false, false, false, 0);
+ false, false, false,
+ VT.getScalarSizeInBits()/8);
- SmallVector<SDValue, 2> Results;
- Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
- PathThru));
- Results.push_back(Chain);
+ SDValue Results[] = {
+ DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru),
+ Chain};
return DAG.getMergeValues(Results, dl);
}
}
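
COMPRESS_TO_MEM and EXPAND_FROM_MEM above fall back to the masked COMPRESS/EXPAND nodes plus an ordinary store or load when the mask is not all-ones. As a reference for what those operations compute, here is a scalar C++ model of the usual AVX-512 compress/expand semantics (my own sketch, not LLVM code):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Scalar model of vector COMPRESS: selected lanes are packed contiguously
    // into the low end of the result; remaining lanes keep the pass-through.
    static std::vector<int> compress(const std::vector<int> &Src,
                                     const std::vector<bool> &Mask,
                                     const std::vector<int> &PassThru) {
      std::vector<int> Res = PassThru;
      size_t Out = 0;
      for (size_t I = 0; I < Src.size(); ++I)
        if (Mask[I])
          Res[Out++] = Src[I];
      return Res;
    }

    // Scalar model of vector EXPAND: consecutive source lanes are scattered to
    // the mask-selected positions; unselected lanes keep the pass-through.
    static std::vector<int> expand(const std::vector<int> &Src,
                                   const std::vector<bool> &Mask,
                                   const std::vector<int> &PassThru) {
      std::vector<int> Res = PassThru;
      size_t In = 0;
      for (size_t I = 0; I < Res.size(); ++I)
        if (Mask[I])
          Res[I] = Src[In++];
      return Res;
    }

    int main() {
      std::vector<int> Src = {10, 20, 30, 40};
      std::vector<bool> Mask = {true, false, true, false};
      std::vector<int> Pass = {0, 0, 0, 0};
      assert((compress(Src, Mask, Pass) == std::vector<int>{10, 30, 0, 0}));
      assert((expand(Src, Mask, Pass) == std::vector<int>{10, 0, 20, 0}));
      return 0;
    }
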
@@ -17767,9 +15870,8 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
- SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
FrameAddr, Offset),
@@ -17783,16 +15885,33 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ EVT VT = Op.getValueType();
+
MFI->setFrameAddressIsTaken(true);
- EVT VT = Op.getValueType();
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+ // Depth > 0 makes no sense on targets which use Windows unwind codes. It
+ // is not possible to crawl up the stack without looking at the unwind codes
+ // simultaneously.
+ int FrameAddrIndex = FuncInfo->getFAIndex();
+ if (!FrameAddrIndex) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
+ SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
+ FuncInfo->setFAIndex(FrameAddrIndex);
+ }
+ return DAG.getFrameIndex(FrameAddrIndex, VT);
+ }
+
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
- unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
- DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
@@ -17819,9 +15938,8 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName,
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
- return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
@@ -17831,8 +15949,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl (Op);
EVT PtrVT = getPointerTy();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -17841,7 +15958,8 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
- DAG.getIntPtrConstant(RegInfo->getSlotSize()));
+ DAG.getIntPtrConstant(RegInfo->getSlotSize(),
+ dl));
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
@@ -17879,7 +15997,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
@@ -17896,12 +16014,12 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// Load the pointer to the nested function into R11.
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
SDValue Addr = Trmp;
- OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr),
false, false, 0);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
- DAG.getConstant(2, MVT::i64));
+ DAG.getConstant(2, dl, MVT::i64));
OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
MachinePointerInfo(TrmpAddr, 2),
false, false, 2);
@@ -17910,13 +16028,13 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// R10 is specified in X86CallingConv.td
OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
- DAG.getConstant(10, MVT::i64));
- OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ DAG.getConstant(10, dl, MVT::i64));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 10),
false, false, 0);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
- DAG.getConstant(12, MVT::i64));
+ DAG.getConstant(12, dl, MVT::i64));
OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
MachinePointerInfo(TrmpAddr, 12),
false, false, 2);
@@ -17924,16 +16042,16 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// Jump to the nested function.
OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
- DAG.getConstant(20, MVT::i64));
- OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ DAG.getConstant(20, dl, MVT::i64));
+ OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
Addr, MachinePointerInfo(TrmpAddr, 20),
false, false, 0);
unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
- DAG.getConstant(22, MVT::i64));
- OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
- MachinePointerInfo(TrmpAddr, 22),
+ DAG.getConstant(22, dl, MVT::i64));
+ OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
+ Addr, MachinePointerInfo(TrmpAddr, 22),
false, false, 0);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
@@ -17986,32 +16104,32 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDValue Addr, Disp;
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
- DAG.getConstant(10, MVT::i32));
+ DAG.getConstant(10, dl, MVT::i32));
Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] = DAG.getStore(Root, dl,
- DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
+ DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr),
false, false, 0);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
- DAG.getConstant(1, MVT::i32));
+ DAG.getConstant(1, dl, MVT::i32));
OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
MachinePointerInfo(TrmpAddr, 1),
false, false, 1);
const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
- DAG.getConstant(5, MVT::i32));
- OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
- MachinePointerInfo(TrmpAddr, 5),
+ DAG.getConstant(5, dl, MVT::i32));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
+ Addr, MachinePointerInfo(TrmpAddr, 5),
false, false, 1);
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
- DAG.getConstant(6, MVT::i32));
+ DAG.getConstant(6, dl, MVT::i32));
OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
MachinePointerInfo(TrmpAddr, 6),
false, false, 1);
@@ -18042,8 +16160,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
@@ -18069,20 +16186,20 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue CWD1 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0x800, MVT::i16)),
- DAG.getConstant(11, MVT::i8));
+ CWD, DAG.getConstant(0x800, DL, MVT::i16)),
+ DAG.getConstant(11, DL, MVT::i8));
SDValue CWD2 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0x400, MVT::i16)),
- DAG.getConstant(9, MVT::i8));
+ CWD, DAG.getConstant(0x400, DL, MVT::i16)),
+ DAG.getConstant(9, DL, MVT::i8));
SDValue RetVal =
DAG.getNode(ISD::AND, DL, MVT::i16,
DAG.getNode(ISD::ADD, DL, MVT::i16,
DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
- DAG.getConstant(1, MVT::i16)),
- DAG.getConstant(3, MVT::i16));
+ DAG.getConstant(1, DL, MVT::i16)),
+ DAG.getConstant(3, DL, MVT::i16));
return DAG.getNode((VT.getSizeInBits() < 16 ?
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
@@ -18108,14 +16225,15 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
- DAG.getConstant(NumBits+NumBits-1, OpVT),
- DAG.getConstant(X86::COND_E, MVT::i8),
+ DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+ DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
// Finally xor with NumBits-1.
- Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
+ DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
@@ -18140,7 +16258,8 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
// And xor with NumBits-1.
- Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
+ DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
@@ -18160,8 +16279,8 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
- DAG.getConstant(NumBits, VT),
- DAG.getConstant(X86::COND_E, MVT::i8),
+ DAG.getConstant(NumBits, dl, VT),
+ DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
@@ -18197,6 +16316,9 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
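+  // An i1 add is addition modulo 2, i.e. an xor (LowerSUB below is identical).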
+ if (Op.getValueType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
@@ -18204,6 +16326,9 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
+ if (Op.getValueType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
@@ -18215,6 +16340,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
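+  // An i1 multiply is the product modulo 2, i.e. a logical and.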
+ if (VT == MVT::i1)
+ return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
+
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget->hasInt256())
return Lower256IntArith(Op, DAG);
@@ -18222,6 +16350,79 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
+ // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
+ // pairs, multiply and truncate.
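+  // Illustrative example: for i8 lanes -3 and 5, sign extension gives the i16
+  // values 0xFFFD and 0x0005, the i16 product is 0xFFF1 (-15), and keeping only
+  // the low byte yields 0xF1, i.e. -15 as an i8, the correct wrapped result.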
+ if (VT == MVT::v16i8 || VT == MVT::v32i8) {
+ if (Subtarget->hasInt256()) {
+ if (VT == MVT::v32i8) {
+ MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2);
+ SDValue Lo = DAG.getIntPtrConstant(0, dl);
+ SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
+ SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo);
+ SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo);
+ SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi);
+ SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo),
+ DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi));
+ }
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+ return DAG.getNode(
+ ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::MUL, dl, ExVT,
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
+ }
+
+ assert(VT == MVT::v16i8 &&
+ "Pre-AVX2 support only supports v16i8 multiplication");
+ MVT ExVT = MVT::v8i16;
+
+ // Extract the lo parts and sign extend to i16
+ SDValue ALo, BLo;
+ if (Subtarget->hasSSE41()) {
+ ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
+ BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+ } else {
+ const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
+ -1, 4, -1, 5, -1, 6, -1, 7};
+ ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ ALo = DAG.getBitcast(ExVT, ALo);
+ BLo = DAG.getBitcast(ExVT, BLo);
+ ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
+ BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Extract the hi parts and sign extend to i16
+ SDValue AHi, BHi;
+ if (Subtarget->hasSSE41()) {
+ const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
+ BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+ } else {
+ const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
+ -1, 12, -1, 13, -1, 14, -1, 15};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getBitcast(ExVT, AHi);
+ BHi = DAG.getBitcast(ExVT, BHi);
+ AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+ BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+ }
+
+  // Multiply, mask the lower 8 bits of the lo/hi results and pack
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
+ RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ }
+
// Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
if (VT == MVT::v4i32) {
assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
@@ -18237,8 +16438,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Now multiply odd parts.
SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
- Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
- Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+ Evens = DAG.getBitcast(VT, Evens);
+ Odds = DAG.getBitcast(VT, Odds);
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
@@ -18266,10 +16467,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// Bit cast to 32-bit vectors for MULUDQ
EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
(VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
- A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
- B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
- Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
- Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
+ A = DAG.getBitcast(MulVT, A);
+ B = DAG.getBitcast(MulVT, B);
+ Ahi = DAG.getBitcast(MulVT, Ahi);
+ Bhi = DAG.getBitcast(MulVT, Bhi);
SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
@@ -18331,7 +16532,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
.setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
- return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+ return DAG.getBitcast(VT, CallInfo.first);
}
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
@@ -18369,12 +16570,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
(!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
// => <2 x i64> <ae|cg>
- SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+ SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
// => <2 x i64> <bf|dh>
- SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
+ SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
// Shuffle it back into the right order.
SDValue Highs, Lows;
@@ -18394,7 +16593,8 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
// unsigned multiply.
if (IsSigned && !Subtarget->hasSSE41()) {
SDValue ShAmt =
- DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+ DAG.getConstant(31, dl,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
@@ -18410,6 +16610,53 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getMergeValues(Ops, dl);
}
+// Return true if the required (according to Opcode) shift-imm form is natively
+// supported by the Subtarget.
+static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget,
+ unsigned Opcode) {
+ if (VT.getScalarSizeInBits() < 16)
+ return false;
+
+ if (VT.is512BitVector() &&
+ (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI()))
+ return true;
+
+ bool LShift = VT.is128BitVector() ||
+ (VT.is256BitVector() && Subtarget->hasInt256());
+
+ bool AShift = LShift && (Subtarget->hasVLX() ||
+ (VT != MVT::v2i64 && VT != MVT::v4i64));
+ return (Opcode == ISD::SRA) ? AShift : LShift;
+}
+
+// The shift amount is a variable, but it is the same for all vector lanes.
+// These instructions are defined together with shift-immediate.
+static
+bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget,
+ unsigned Opcode) {
+ return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
+}
+
+// Return true if the required (according to Opcode) variable-shift form is
+// natively supported by the Subtarget.
+static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget,
+ unsigned Opcode) {
+
+ if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16)
+ return false;
+
+ // vXi16 supported only on AVX-512, BWI
+ if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI())
+ return false;
+
+ if (VT.is512BitVector() || Subtarget->hasVLX())
+ return true;
+
+ bool LShift = VT.is128BitVector() || VT.is256BitVector();
+ bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
+ return (Opcode == ISD::SRA) ? AShift : LShift;
+}
+
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
MVT VT = Op.getSimpleValueType();
@@ -18417,97 +16664,44 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
+ unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
+ (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+
// Optimize shl/srl/sra with constant shift amount.
if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
uint64_t ShiftAmt = ShiftConst->getZExtValue();
- if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
- (Subtarget->hasInt256() &&
- (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
- (Subtarget->hasAVX512() &&
- (VT == MVT::v8i64 || VT == MVT::v16i32))) {
- if (Op.getOpcode() == ISD::SHL)
- return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
- DAG);
- if (Op.getOpcode() == ISD::SRL)
- return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
- DAG);
- if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
- return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
- DAG);
- }
-
- if (VT == MVT::v16i8) {
- if (Op.getOpcode() == ISD::SHL) {
- // Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
- // Zero out the rightmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRL) {
- // Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
- // Zero out the leftmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
- }
+ if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+ return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
- // R s>> a === ((R u>> a) ^ m) - m
- SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
- Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
- Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
- return Res;
- }
- llvm_unreachable("Unknown shift opcode.");
- }
+ if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
- if (Subtarget->hasInt256() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
+ // Simple i8 add case
+ if (ShiftAmt == 1)
+ return DAG.getNode(ISD::ADD, dl, VT, R, R);
+
// Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
- SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
+ SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
- SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
+ SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
@@ -18520,8 +16714,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// R s>> a === ((R u>> a) ^ m) - m
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(NumElts,
+ DAG.getConstant(128 >> ShiftAmt, dl,
+ MVT::i8));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
@@ -18563,19 +16758,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
if (ShAmt != ShiftAmt)
return SDValue();
}
- switch (Op.getOpcode()) {
- default:
- llvm_unreachable("Unknown shift opcode!");
- case ISD::SHL:
- return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
- DAG);
- case ISD::SRL:
- return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
- DAG);
- case ISD::SRA:
- return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
- DAG);
- }
+ return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
}
return SDValue();
@@ -18588,12 +16771,13 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
- if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
- VT == MVT::v4i32 || VT == MVT::v8i16 ||
- (Subtarget->hasInt256() &&
- ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
- VT == MVT::v8i32 || VT == MVT::v16i16)) ||
- (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
+ unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
+ (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
+
+ unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
+ (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
+
+ if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
SDValue BaseShAmt;
EVT EltVT = VT.getVectorElementType();
@@ -18626,7 +16810,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
if (!BaseShAmt)
// Avoid introducing an extract element from a shuffle.
BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
- DAG.getIntPtrConstant(SplatIdx));
+ DAG.getIntPtrConstant(SplatIdx, dl));
}
}
@@ -18637,54 +16821,12 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
else if (EltVT.bitsLT(MVT::i32))
BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
- switch (Op.getOpcode()) {
- default:
- llvm_unreachable("Unknown shift opcode!");
- case ISD::SHL:
- switch (VT.SimpleTy) {
- default: return SDValue();
- case MVT::v2i64:
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v4i64:
- case MVT::v8i32:
- case MVT::v16i16:
- case MVT::v16i32:
- case MVT::v8i64:
- return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
- }
- case ISD::SRA:
- switch (VT.SimpleTy) {
- default: return SDValue();
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v8i32:
- case MVT::v16i16:
- case MVT::v16i32:
- case MVT::v8i64:
- return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
- }
- case ISD::SRL:
- switch (VT.SimpleTy) {
- default: return SDValue();
- case MVT::v2i64:
- case MVT::v4i32:
- case MVT::v8i16:
- case MVT::v4i64:
- case MVT::v8i32:
- case MVT::v16i16:
- case MVT::v16i32:
- case MVT::v8i64:
- return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
- }
- }
+ return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
}
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
- if (!Subtarget->is64Bit() &&
- (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
- (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
+ if (!Subtarget->is64Bit() && VT == MVT::v2i64 &&
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
@@ -18698,18 +16840,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
if (Vals[j] != Amt.getOperand(i + j))
return SDValue();
}
- switch (Op.getOpcode()) {
- default:
- llvm_unreachable("Unknown shift opcode!");
- case ISD::SHL:
- return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
- case ISD::SRL:
- return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
- case ISD::SRA:
- return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
- }
+ return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
}
-
return SDValue();
}
@@ -18719,33 +16851,28 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
SDLoc dl(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
- SDValue V;
assert(VT.isVector() && "Custom lowering only for vector shifts!");
assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
- V = LowerScalarImmediateShift(Op, DAG, Subtarget);
- if (V.getNode())
+ if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;
- V = LowerScalarVariableShift(Op, DAG, Subtarget);
- if (V.getNode())
+ if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;
- if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+ if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
return Op;
- // AVX2 has VPSLLV/VPSRAV/VPSRLV.
- if (Subtarget->hasInt256()) {
- if (Op.getOpcode() == ISD::SRL &&
- (VT == MVT::v2i64 || VT == MVT::v4i32 ||
- VT == MVT::v4i64 || VT == MVT::v8i32))
- return Op;
- if (Op.getOpcode() == ISD::SHL &&
- (VT == MVT::v2i64 || VT == MVT::v4i32 ||
- VT == MVT::v4i64 || VT == MVT::v8i32))
- return Op;
- if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
- return Op;
+
+  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
+ // shifts per-lane and then shuffle the partial results back together.
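+  // For example, <a,b> >> <x,y> becomes two splat-amount shifts, R >> <x,x>
+  // and R >> <y,y>; shuffle mask {0,3} then keeps lane 0 of the first result
+  // and lane 1 of the second.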
+ if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+ // Splat the shift amounts so the scalar shifts above will catch it.
+ SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+ SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+ SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+ SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+ return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
}
// If possible, lower this packed shift into a vector multiply instead of
@@ -18775,7 +16902,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
Elts.push_back(DAG.getUNDEF(SVT));
continue;
}
- Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
+ Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
}
SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
return DAG.getNode(ISD::MUL, dl, VT, R, BV);
@@ -18783,10 +16910,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
- Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
+ Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
- Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
- Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
+ Op = DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getConstant(0x3f800000U, dl, VT));
+ Op = DAG.getBitcast(MVT::v4f32, Op);
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
@@ -18849,40 +16977,31 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// Replace this node with two shifts followed by a MOVSS/MOVSD.
EVT CastVT = MVT::v4i32;
SDValue Splat1 =
- DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
+ DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
SDValue Splat2 =
- DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
+ DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
if (TargetOpcode == X86ISD::MOVSD)
CastVT = MVT::v2i64;
- SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
- SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
+ SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
+ SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
BitCast1, DAG);
- return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ return DAG.getBitcast(VT, Result);
}
}
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
- assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
+ // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
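+    // Only the low 3 bits of each byte's shift amount are meaningful; shifting
+    // 'a' left by 5 moves amount bit 2 into the byte's sign bit, where the
+    // AND-with-0x80 + PCMPEQ below builds the select mask. r is conditionally
+    // shifted by 4, then by 2, with 'a += a' between steps exposing the next
+    // amount bit.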
+ Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, dl, VT));
- // a = a << 5;
- Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
- Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
-
- // Turn 'a' into a mask suitable for VSELECT
- SDValue VSelM = DAG.getConstant(0x80, VT);
+ SDValue VSelM = DAG.getConstant(0x80, dl, VT);
SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
- SDValue CM1 = DAG.getConstant(0x0f, VT);
- SDValue CM2 = DAG.getConstant(0x3f, VT);
-
- // r = VSELECT(r, psllw(r & (char16)15, 4), a);
- SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
- M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
- M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+ // r = VSELECT(r, shl(r, 4), a);
+ SDValue M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(4, dl, VT));
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
// a += a
@@ -18890,10 +17009,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
- // r = VSELECT(r, psllw(r & (char16)63, 2), a);
- M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
- M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
- M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+ // r = VSELECT(r, shl(r, 2), a);
+ M = DAG.getNode(ISD::SHL, dl, VT, R, DAG.getConstant(2, dl, VT));
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
// a += a
@@ -18911,14 +17028,32 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
// the extra overheads to get from v16i8 to v8i32 make the existing SSE
// solution better.
if (Subtarget->hasInt256() && VT == MVT::v8i16) {
- MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
+ MVT ExtVT = MVT::v8i32;
unsigned ExtOpc =
Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- R = DAG.getNode(ExtOpc, dl, NewVT, R);
- Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
+ R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+ Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
- }
+ DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+ }
+
+ if (Subtarget->hasInt256() && VT == MVT::v16i16) {
+ MVT ExtVT = MVT::v8i32;
+ SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
+ SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
+ SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
+ SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R);
+ SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R);
+ ALo = DAG.getBitcast(ExtVT, ALo);
+ AHi = DAG.getBitcast(ExtVT, AHi);
+ RLo = DAG.getBitcast(ExtVT, RLo);
+ RHi = DAG.getBitcast(ExtVT, RHi);
+ SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
+ SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
+ Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
+ Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
+ }
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector()) {
@@ -18934,12 +17069,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
SDValue Amt1, Amt2;
if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
// Constant shift amount
- SmallVector<SDValue, 4> Amt1Csts;
- SmallVector<SDValue, 4> Amt2Csts;
- for (unsigned i = 0; i != NumElems/2; ++i)
- Amt1Csts.push_back(Amt->getOperand(i));
- for (unsigned i = NumElems/2; i != NumElems; ++i)
- Amt2Csts.push_back(Amt->getOperand(i));
+ SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
+ ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
+ ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
@@ -19021,7 +17153,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC =
DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(X86::COND_O, MVT::i32),
+ DAG.getConstant(X86::COND_O, DL, MVT::i32),
SDValue(Sum.getNode(), 2));
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
@@ -19034,87 +17166,23 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC =
DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
- DAG.getConstant(Cond, MVT::i32),
+ DAG.getConstant(Cond, DL, MVT::i32),
SDValue(Sum.getNode(), 1));
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-// Sign extension of the low part of vector elements. This may be used either
-// when sign extend instructions are not available or if the vector element
-// sizes already match the sign-extended size. If the vector elements are in
-// their pre-extended size and sign extend instructions are available, that will
-// be handled by LowerSIGN_EXTEND.
-SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc dl(Op);
- EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- MVT VT = Op.getSimpleValueType();
-
- if (!Subtarget->hasSSE2() || !VT.isVector())
- return SDValue();
-
- unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
- ExtraVT.getScalarType().getSizeInBits();
-
- switch (VT.SimpleTy) {
- default: return SDValue();
- case MVT::v8i32:
- case MVT::v16i16:
- if (!Subtarget->hasFp256())
- return SDValue();
- if (!Subtarget->hasInt256()) {
- // needs to be split
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- EVT ExtraEltVT = ExtraVT.getVectorElementType();
- unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
- ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
- ExtraNumElems/2);
- SDValue Extra = DAG.getValueType(ExtraVT);
-
- LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
- LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
- }
- // fall through
- case MVT::v4i32:
- case MVT::v8i16: {
- SDValue Op0 = Op.getOperand(0);
-
- // This is a sign extension of some low part of vector elements without
- // changing the size of the vector elements themselves:
- // Shift-Left + Shift-Right-Algebraic.
- SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
- BitsDiff, DAG);
- return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
- DAG);
- }
- }
-}
-
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+ return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
else if (OpWidth == 128)
- return Subtarget.hasCmpxchg16b();
+ return Subtarget->hasCmpxchg16b();
else
return false;
}
@@ -19130,16 +17198,17 @@ bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return needsCmpXchgNb(PTy->getElementType());
}
-bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+TargetLoweringBase::AtomicRMWExpansionKind
+X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
// and default to library calls otherwise.
- if (MemType->getPrimitiveSizeInBits() > NativeWidth)
- return needsCmpXchgNb(MemType);
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
+ return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg
+ : AtomicRMWExpansionKind::None;
+ }
AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
@@ -19149,13 +17218,14 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:
// It's better to use xadd, xsub or xchg for these in all cases.
- return false;
+ return AtomicRMWExpansionKind::None;
case AtomicRMWInst::Or:
case AtomicRMWInst::And:
case AtomicRMWInst::Xor:
// If the atomicrmw's result isn't actually used, we can just add a "lock"
// prefix to a normal instruction for these operations.
- return !AI->use_empty();
+ return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg
+ : AtomicRMWExpansionKind::None;
case AtomicRMWInst::Nand:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
@@ -19163,7 +17233,7 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
case AtomicRMWInst::UMin:
// These always require a non-trivial set of data operations on x86. We must
// use a cmpxchg loop.
- return true;
+ return AtomicRMWExpansionKind::CmpXChg;
}
}
@@ -19176,9 +17246,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) {
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
@@ -19210,21 +17278,21 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  // otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
- if (SynchScope == SingleThread) {
+ if (SynchScope == SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
- } else if (hasMFENCE(Subtarget)) {
- Function *MFence = llvm::Intrinsic::getDeclaration(M,
- Intrinsic::x86_sse2_mfence);
- Builder.CreateCall(MFence);
- } else {
+
+ if (!hasMFENCE(*Subtarget))
// FIXME: it might make sense to use a locked operation here but on a
// different cache-line to prevent cache-line bouncing. In practice it
// is probably a small win, and x86 processors without mfence are rare
// enough that we do not bother.
return nullptr;
- }
+
+ Function *MFence =
+ llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
+ Builder.CreateCall(MFence, {});
// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
@@ -19250,13 +17318,13 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
- SDValue Zero = DAG.getConstant(0, MVT::i32);
+ SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
SDValue Ops[] = {
- DAG.getRegister(X86::ESP, MVT::i32), // Base
- DAG.getTargetConstant(1, MVT::i8), // Scale
- DAG.getRegister(0, MVT::i32), // Index
- DAG.getTargetConstant(0, MVT::i32), // Disp
- DAG.getRegister(0, MVT::i32), // Segment.
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, dl, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(0, dl, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i32), // Segment.
Zero,
Chain
};
@@ -19289,7 +17357,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SDValue Ops[] = { cpIn.getValue(0),
Op.getOperand(1),
Op.getOperand(3),
- DAG.getTargetConstant(size, MVT::i8),
+ DAG.getTargetConstant(size, DL, MVT::i8),
cpIn.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
@@ -19301,7 +17369,8 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
MVT::i32, cpOut.getValue(2));
SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
- DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+ DAG.getConstant(X86::COND_E, DL, MVT::i8),
+ EFLAGS);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
@@ -19330,18 +17399,16 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
SmallVector<SDValue, 16> Elts;
for (unsigned i = 0, e = NumElts; i != e; ++i)
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
- DAG.getIntPtrConstant(i)));
+ DAG.getIntPtrConstant(i, dl)));
// Explicitly mark the extra elements as Undef.
- SDValue Undef = DAG.getUNDEF(SVT);
- for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
- Elts.push_back(Undef);
+ Elts.append(NumElts, DAG.getUNDEF(SVT));
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
- SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
+ SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, dl));
}
assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
@@ -19361,137 +17428,241 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
return SDValue();
}
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
+/// Compute the horizontal sum of bytes in V for the elements of VT.
+///
+/// Requires V to be a byte vector and VT to be an integer vector type with
+/// wider elements than V's type. The width of the elements of VT determines
+/// how many bytes of V are summed horizontally to produce each element of the
+/// result.
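+/// For example, with a v16i8 input of per-byte pop counts and VT == v4i32,
+/// element i of the result is the sum of input bytes [4*i, 4*i+3].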
+static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(V);
+ MVT ByteVecVT = V.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ int NumElts = VT.getVectorNumElements();
+ assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
+ "Expected value to have byte element type.");
+ assert(EltVT != MVT::i8 &&
+ "Horizontal byte sum only makes sense for wider elements!");
+ unsigned VecSize = VT.getSizeInBits();
+ assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
+
+  // The PSADBW instruction horizontally adds all bytes and leaves the result
+  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
+ if (EltVT == MVT::i64) {
+ SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+ V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros);
+ return DAG.getBitcast(VT, V);
+ }
+
+ if (EltVT == MVT::i32) {
+ // We unpack the low half and high half into i32s interleaved with zeros so
+ // that we can use PSADBW to horizontally sum them. The most useful part of
+ // this is that it lines up the results of two PSADBW instructions to be
+ // two v2i64 vectors which concatenated are the 4 population counts. We can
+ // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+ SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
+ SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
+
+ // Do the horizontal sums into two v2i64s.
+ Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+ Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+ DAG.getBitcast(ByteVecVT, Low), Zeros);
+ High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+ DAG.getBitcast(ByteVecVT, High), Zeros);
+
+ // Merge them together.
+ MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+ V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+ DAG.getBitcast(ShortVecVT, Low),
+ DAG.getBitcast(ShortVecVT, High));
+
+ return DAG.getBitcast(VT, V);
+ }
+
+ // The only element type left is i16.
+ assert(EltVT == MVT::i16 && "Unknown how to handle type");
+
+ // To obtain pop count for each i16 element starting from the pop count for
+ // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
+ // right by 8. It is important to shift as i16s as i8 vector shift isn't
+ // directly supported.
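+  // For example, an i16 lane with byte counts (low, high) = (2, 3): shifting
+  // left by 8 gives (0, 2), the byte-wise add gives (2, 5), and shifting right
+  // by 8 leaves 2 + 3 = 5.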
+ SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT));
+ SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+ V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
+ DAG.getBitcast(ByteVecVT, V));
+ return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter);
+}
+
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned VecSize = VT.getSizeInBits();
- Op = Op.getOperand(0);
- EVT VT = Op.getValueType();
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "CTPOP lowering only implemented for 128/256-bit wide vector types");
+ // Implement a lookup table in register by using an algorithm based on:
+ // http://wm.ite.pl/articles/sse-popcount.html
+ //
+ // The general idea is that every lower byte nibble in the input vector is an
+  // index into an in-register pre-computed pop count table. We then split up
+  // the input vector into two new ones: (1) a vector with only the shifted-right
+  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
+  // masked out higher ones) for each byte. PSHUFB is used separately with both
+  // to index the in-register table. Next, both are added and the result is an
+  // i8 vector where each element contains the pop count for the input byte.
+ //
+ // To obtain the pop count for elements != i8, we follow up with the same
+ // approach and use additional tricks as described below.
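+  // For example, the byte 0xD6 (0b11010110) has high nibble 0xD and low nibble
+  // 0x6; LUT[0xD] == 3, LUT[0x6] == 2, and 3 + 2 == 5 == popcount(0xD6).
+  //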
+ //
+ const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+ /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+ /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+ int NumByteElts = VecSize / 8;
+ MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
+ SDValue In = DAG.getBitcast(ByteVecVT, Op);
+ SmallVector<SDValue, 16> LUTVec;
+ for (int i = 0; i < NumByteElts; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
+ SmallVector<SDValue, 16> Mask0F(NumByteElts,
+ DAG.getConstant(0x0F, DL, MVT::i8));
+ SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
+
+ // High nibbles
+ SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
+ SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
+ SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+
+ // Low nibbles
+ SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+
+ // The input vector is used as the shuffle mask that index elements into the
+ // LUT. After counting low and high nibbles, add the vector to obtain the
+ // final pop count per i8 element.
+ SDValue HighPopCnt =
+ DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
+ SDValue LowPopCnt =
+ DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
+ SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
- unsigned NumElts = VT.getVectorNumElements();
- EVT EltVT = VT.getVectorElementType();
- unsigned Len = EltVT.getSizeInBits();
+ if (EltVT == MVT::i8)
+ return PopCnt;
+
+ return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+}
+
+static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ assert(VT.is128BitVector() &&
+ "Only 128-bit vector bitmath lowering supported.");
+
+ int VecSize = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+ int Len = EltVT.getSizeInBits();
// This is the vectorized version of the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
// with a minor tweak to use a series of adds + shifts instead of vector
- // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
- //
- // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
- // v8i32 => Always profitable
- //
- // FIXME: There a couple of possible improvements:
- //
- // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
- // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
- //
- assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
- "CTPOP not implemented for this vector element type.");
+ // multiplications. Implemented for all integer vector types. We only use
+ // this when we don't have SSSE3 which allows a LUT-based lowering that is
+ // much faster, even faster than using native popcnt instructions.
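+  // Per-byte example: for v = 0xDA (pop count 5),
+  //   v - ((v >> 1) & 0x55)                 == 0x95
+  //   (0x95 & 0x33) + ((0x95 >> 2) & 0x33)  == 0x32
+  //   (0x32 + (0x32 >> 4)) & 0x0F           == 0x05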
+
+ auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
+ MVT VT = V.getSimpleValueType();
+ SmallVector<SDValue, 32> Shifters(
+ VT.getVectorNumElements(),
+ DAG.getConstant(Shifter, DL, VT.getVectorElementType()));
+ return DAG.getNode(OpCode, DL, VT, V,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters));
+ };
+ auto GetMask = [&](SDValue V, APInt Mask) {
+ MVT VT = V.getSimpleValueType();
+ SmallVector<SDValue, 32> Masks(
+ VT.getVectorNumElements(),
+ DAG.getConstant(Mask, DL, VT.getVectorElementType()));
+ return DAG.getNode(ISD::AND, DL, VT, V,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks));
+ };
- // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
- // extra legalization.
- bool NeedsBitcast = EltVT == MVT::i32;
- MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+ // We don't want to incur the implicit masks required to SRL vNi8 vectors on
+ // x86, so set the SRL type to have elements at least i16 wide. This is
+  // correct because all of our SRLs are immediately followed by a mask anyway
+ // that handles any bits that sneak into the high bits of the byte elements.
+ MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
- SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
- SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
- SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
+ SDValue V = Op;
// v = v - ((v >> 1) & 0x55555555...)
- SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
- SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
- SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
- if (NeedsBitcast)
- Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
-
- SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
- SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
- if (NeedsBitcast)
- M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
-
- SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
- if (VT != And.getValueType())
- And = DAG.getNode(ISD::BITCAST, dl, VT, And);
- SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+ SDValue Srl =
+ DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
+ SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+ V = DAG.getNode(ISD::SUB, DL, VT, V, And);
// v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
- SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
- SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
- SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
- SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
+ SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+ Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
+ SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+ V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
- Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
- if (NeedsBitcast) {
- Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
- M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
- Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
- }
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
+ V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
- SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
- SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
- if (VT != AndRHS.getValueType()) {
- AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
- AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
- }
- SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+ // At this point, V contains the byte-wise population count, and we are
+ // merely doing a horizontal sum if necessary to get the wider element
+ // counts.
+ if (EltVT == MVT::i8)
+ return V;
- // v = (v + (v >> 4)) & 0x0F0F0F0F...
- SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
- SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
- Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
- Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
-
- SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
- SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
- if (NeedsBitcast) {
- Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
- M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
- }
- And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
- if (VT != And.getValueType())
- And = DAG.getNode(ISD::BITCAST, dl, VT, And);
-
- // The algorithm mentioned above uses:
- // v = (v * 0x01010101...) >> (Len - 8)
- //
- // Change it to use vector adds + vector shifts which yield faster results on
- // Haswell than using vector integer multiplication.
- //
- // For i32 elements:
- // v = v + (v >> 8)
- // v = v + (v >> 16)
- //
- // For i64 elements:
- // v = v + (v >> 8)
- // v = v + (v >> 16)
- // v = v + (v >> 32)
- //
- Add = And;
- SmallVector<SDValue, 8> Csts;
- for (unsigned i = 8; i <= Len/2; i *= 2) {
- Csts.assign(NumElts, DAG.getConstant(i, EltVT));
- SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
- Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
- Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
- Csts.clear();
+ return LowerHorizontalByteSum(
+ DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
+ DAG);
+}
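// For reference, the bit-math sequence lowered above is the classic SWAR
// population count applied lane-wise. A scalar sketch of the same steps
// (illustrative only, not part of this lowering; uint32_t from <cstdint>):
//
//   unsigned PopCount32SWAR(uint32_t V) {
//     V = V - ((V >> 1) & 0x55555555u);                 // 2-bit sub-counts
//     V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 4-bit sub-counts
//     V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // per-byte counts
//     return (V * 0x01010101u) >> 24;                   // horizontal sum
//   }
//
// The vector code performs the same three mask/shift/add steps per lane and
// then defers the horizontal byte sum to LowerHorizontalByteSum.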
+
+static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ // FIXME: Need to add AVX-512 support here!
+ assert((VT.is256BitVector() || VT.is128BitVector()) &&
+ "Unknown CTPOP type to handle");
+ SDLoc DL(Op.getNode());
+ SDValue Op0 = Op.getOperand(0);
+
+ if (!Subtarget->hasSSSE3()) {
+ // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+ assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
+ return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
}
- // The result is on the least significant 6-bits on i32 and 7-bits on i64.
- SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
- SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
- SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
- if (NeedsBitcast) {
- Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
- M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
+ if (VT.is256BitVector() && !Subtarget->hasInt256()) {
+ unsigned NumElems = VT.getVectorNumElements();
+
+    // Extract each 128-bit vector, compute its popcount, and concatenate them.
+ SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL);
+ SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
+ LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
}
- And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
- if (VT != And.getValueType())
- And = DAG.getNode(ISD::BITCAST, dl, VT, And);
- return And;
+ return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
+}
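// The SSSE3 path used above (LowerVectorCTPOPInRegLUT) is the in-register
// nibble-LUT popcount built around PSHUFB. A rough intrinsics-level sketch of
// the idea (illustrative only, not the actual lowering; assumes SSSE3 and
// <tmmintrin.h>):
//
//   __m128i PopCountPerByte(__m128i V) {
//     const __m128i LUT = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
//                                       1, 2, 2, 3, 2, 3, 3, 4);
//     const __m128i NibbleMask = _mm_set1_epi8(0x0F);
//     __m128i Lo = _mm_and_si128(V, NibbleMask);
//     __m128i Hi = _mm_and_si128(_mm_srli_epi16(V, 4), NibbleMask);
//     return _mm_add_epi8(_mm_shuffle_epi8(LUT, Lo),
//                         _mm_shuffle_epi8(LUT, Hi));
//   }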
+
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Op.getValueType().isVector() &&
+ "We only do custom lowering for vector population count.");
+ return LowerVectorCTPOP(Op, Subtarget, DAG);
}
static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
@@ -19499,7 +17670,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Node);
EVT T = Node->getValueType(0);
SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
- DAG.getConstant(0, T), Node->getOperand(2));
+ DAG.getConstant(0, dl, T), Node->getOperand(2));
return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
cast<AtomicSDNode>(Node)->getMemoryVT(),
Node->getOperand(0),
@@ -19605,19 +17776,110 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
// Returned in bits 0:31 and 32:64 xmm0.
SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
- CallResult.first, DAG.getIntPtrConstant(0));
+ CallResult.first, DAG.getIntPtrConstant(0, dl));
SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
- CallResult.first, DAG.getIntPtrConstant(1));
+ CallResult.first, DAG.getIntPtrConstant(1, dl));
SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
+static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget->hasAVX512() &&
+ "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+ MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
+ EVT VT = N->getValue().getValueType();
+ assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
+ SDLoc dl(Op);
+
+  // X86 scatter kills the mask register, so its type should be added to
+  // the list of return values.
+ if (N->getNumValues() == 1) {
+ SDValue Index = N->getIndex();
+ if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ !Index.getValueType().is512BitVector())
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+ SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), Index };
+
+ SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand());
+ DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+ return SDValue(NewScatter.getNode(), 0);
+ }
+ return Op;
+}
+
+static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget->hasAVX512() &&
+ "MGATHER/MSCATTER are supported on AVX-512 arch only");
+
+ MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+ EVT VT = Op.getValueType();
+ assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
+ SDLoc dl(Op);
+
+ SDValue Index = N->getIndex();
+ if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+ !Index.getValueType().is512BitVector()) {
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), Index };
+ DAG.UpdateNodeOperands(N, Ops);
+ }
+ return Op;
+}
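// Example of the case handled above (illustrative): on an AVX-512F target
// without VLX, a masked gather of v8i32 through a v8i32 index has no legal
// 256-bit form, so the index is sign-extended to v8i64 and the operation is
// issued with a 512-bit index register instead.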
+
+SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
+ SelectionDAG &DAG) const {
+ // TODO: Eventually, the lowering of these nodes should be informed by or
+ // deferred to the GC strategy for the function in which they appear. For
+ // now, however, they must be lowered to something. Since they are logically
+ // no-ops in the case of a null GC strategy (or a GC strategy which does not
+ // require special handling for these nodes), lower them as literal NOOPs for
+ // the time being.
+ SmallVector<SDValue, 2> Ops;
+
+ Ops.push_back(Op.getOperand(0));
+ if (Op->getGluedNode())
+ Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+ SDLoc OpDL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
+
+ return NOOP;
+}
+
+SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
+ SelectionDAG &DAG) const {
+ // TODO: Eventually, the lowering of these nodes should be informed by or
+ // deferred to the GC strategy for the function in which they appear. For
+ // now, however, they must be lowered to something. Since they are logically
+ // no-ops in the case of a null GC strategy (or a GC strategy which does not
+ // require special handling for these nodes), lower them as literal NOOPs for
+ // the time being.
+ SmallVector<SDValue, 2> Ops;
+
+ Ops.push_back(Op.getOperand(0));
+ if (Op->getGluedNode())
+ Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+
+ SDLoc OpDL(Op);
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
+
+ return NOOP;
+}
+
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
- case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
@@ -19625,8 +17887,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
- case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
- case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
+ case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
@@ -19647,6 +17909,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
@@ -19700,6 +17964,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADD: return LowerADD(Op, DAG);
case ISD::SUB: return LowerSUB(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
+ case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
+ case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
+ case ISD::GC_TRANSITION_START:
+ return LowerGC_TRANSITION_START(Op, DAG);
+ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
}
}
@@ -19747,6 +18016,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FP_TO_SINT:
+ // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert
+ // (FP_TO_SINT (load f16)) to FP_TO_INT*.
+ if (N->getOperand(0).getValueType() == MVT::f16)
+ break;
+ // fallthrough
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
@@ -19775,12 +18049,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
N->getOperand(0));
- SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
- DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
- Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+ DAG.getBitcast(MVT::v2i64, VBias));
+ Or = DAG.getBitcast(MVT::v2f64, Or);
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
return;
@@ -19792,6 +18066,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(V);
return;
}
+ case ISD::FP_EXTEND: {
+ // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
+ // No other ValueType for FP_EXTEND should reach this point.
+ assert(N->getValueType(0) == MVT::v2f32 &&
+ "Do not know how to legalize this Node");
+ return;
+ }
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
@@ -19818,9 +18099,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
- DAG.getConstant(0, HalfT));
+ DAG.getConstant(0, dl, HalfT));
cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
- DAG.getConstant(1, HalfT));
+ DAG.getConstant(1, dl, HalfT));
cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
Regs64bit ? X86::RAX : X86::EAX,
cpInL, SDValue());
@@ -19829,9 +18110,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
- DAG.getConstant(0, HalfT));
+ DAG.getConstant(0, dl, HalfT));
swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
- DAG.getConstant(1, HalfT));
+ DAG.getConstant(1, dl, HalfT));
swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
Regs64bit ? X86::RBX : X86::EBX,
swapInL, cpInH.getValue(1));
@@ -19858,7 +18139,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
MVT::i32, cpOutH.getValue(2));
SDValue Success =
DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+ DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
@@ -19896,7 +18177,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
MVT::v2f64, N->getOperand(0));
- SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+ SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
if (ExperimentalVectorWideningLegalization) {
// If we are legalizing vectors by widening, we already have the desired
@@ -19908,7 +18189,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVector<SDValue, 8> Elts;
for (unsigned i = 0, e = NumElts; i != e; ++i)
Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
- ToVecInt, DAG.getIntPtrConstant(i)));
+ ToVecInt, DAG.getIntPtrConstant(i, dl)));
Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
}
@@ -19916,8 +18197,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch (Opcode) {
- default: return nullptr;
+ switch ((X86ISD::NodeType)Opcode) {
+ case X86ISD::FIRST_NUMBER: break;
case X86ISD::BSF: return "X86ISD::BSF";
case X86ISD::BSR: return "X86ISD::BSR";
case X86ISD::SHLD: return "X86ISD::SHLD";
@@ -19926,7 +18207,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FANDN: return "X86ISD::FANDN";
case X86ISD::FOR: return "X86ISD::FOR";
case X86ISD::FXOR: return "X86ISD::FXOR";
- case X86ISD::FSRL: return "X86ISD::FSRL";
case X86ISD::FILD: return "X86ISD::FILD";
case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
@@ -19944,9 +18224,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
case X86ISD::CMPMU: return "X86ISD::CMPMU";
+ case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCC: return "X86ISD::FSETCC";
+ case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86";
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
@@ -19955,16 +18237,21 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
case X86ISD::Wrapper: return "X86ISD::Wrapper";
case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
+ case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
+ case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
+ case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
+ case X86ISD::ADDUS: return "X86ISD::ADDUS";
case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
@@ -19975,7 +18262,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SMAX: return "X86ISD::SMAX";
case X86ISD::SMIN: return "X86ISD::SMIN";
case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
@@ -20044,6 +18333,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
case X86ISD::SHUFP: return "X86ISD::SHUFP";
+ case X86ISD::SHUF128: return "X86ISD::SHUF128";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
@@ -20057,20 +18347,27 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
- case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
+ case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
+ case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
+ case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
+ case X86ISD::VRANGE: return "X86ISD::VRANGE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
+ case X86ISD::PSADBW: return "X86ISD::PSADBW";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
+ case X86ISD::MFENCE: return "X86ISD::MFENCE";
+ case X86ISD::SFENCE: return "X86ISD::SFENCE";
+ case X86ISD::LFENCE: return "X86ISD::LFENCE";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
case X86ISD::SAHF: return "X86ISD::SAHF";
@@ -20082,19 +18379,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
+ case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
+ case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
+ case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
+ case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
+ case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
+ case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::RNDSCALE: return "X86ISD::RNDSCALE";
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
case X86ISD::SELECT: return "X86ISD::SELECT";
- }
+ case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
+ case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
+ case X86ISD::ADDS: return "X86ISD::ADDS";
+ case X86ISD::SUBS: return "X86ISD::SUBS";
+ }
+ return nullptr;
}
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty) const {
+ Type *Ty,
+ unsigned AS) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
Reloc::Model R = getTargetMachine().getRelocationModel();
@@ -20236,6 +18554,8 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
@@ -20272,85 +18592,24 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
if (!VT.isSimple())
return false;
- MVT SVT = VT.getSimpleVT();
+ // Not for i1 vectors
+ if (VT.getScalarType() == MVT::i1)
+ return false;
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSizeInBits() == 64)
return false;
- // This is an experimental legality test that is tailored to match the
- // legality test of the experimental lowering more closely. They are gated
- // separately to ease testing of performance differences.
- if (ExperimentalVectorShuffleLegality)
- // We only care that the types being shuffled are legal. The lowering can
- // handle any possible shuffle mask that results.
- return isTypeLegal(SVT);
-
- // If this is a single-input shuffle with no 128 bit lane crossings we can
- // lower it into pshufb.
- if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
- (SVT.is256BitVector() && Subtarget->hasInt256())) {
- bool isLegal = true;
- for (unsigned I = 0, E = M.size(); I != E; ++I) {
- if (M[I] >= (int)SVT.getVectorNumElements() ||
- ShuffleCrosses128bitLane(SVT, I, M[I])) {
- isLegal = false;
- break;
- }
- }
- if (isLegal)
- return true;
- }
-
- // FIXME: blends, shifts.
- return (SVT.getVectorNumElements() == 2 ||
- ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
- isMOVLMask(M, SVT) ||
- isCommutedMOVLMask(M, SVT) ||
- isMOVHLPSMask(M, SVT) ||
- isSHUFPMask(M, SVT) ||
- isSHUFPMask(M, SVT, /* Commuted */ true) ||
- isPSHUFDMask(M, SVT) ||
- isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
- isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
- isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
- isPALIGNRMask(M, SVT, Subtarget) ||
- isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
- isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
- isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
- isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
- isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
- (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
+ // We only care that the types being shuffled are legal. The lowering can
+ // handle any possible shuffle mask that results.
+ return isTypeLegal(VT.getSimpleVT());
}
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
EVT VT) const {
- if (!VT.isSimple())
- return false;
-
- MVT SVT = VT.getSimpleVT();
-
- // This is an experimental legality test that is tailored to match the
- // legality test of the experimental lowering more closely. They are gated
- // separately to ease testing of performance differences.
- if (ExperimentalVectorShuffleLegality)
- // The new vector shuffle lowering is very good at managing zero-inputs.
- return isShuffleMaskLegal(Mask, VT);
-
- unsigned NumElts = SVT.getVectorNumElements();
- // FIXME: This collection of masks seems suspect.
- if (NumElts == 2)
- return true;
- if (NumElts == 4 && SVT.is128BitVector()) {
- return (isMOVLMask(Mask, SVT) ||
- isCommutedMOVLMask(Mask, SVT, true) ||
- isSHUFPMask(Mask, SVT) ||
- isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
- isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
- Subtarget->hasInt256()));
- }
- return false;
+  // Just delegate to the generic legality check; clear masks aren't special.
+ return isShuffleMaskLegal(Mask, VT);
}
//===----------------------------------------------------------------------===//
@@ -20488,11 +18747,10 @@ static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
return BB;
}
-static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII,
- const X86Subtarget* Subtarget) {
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
DebugLoc dl = MI->getDebugLoc();
-
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
@@ -20514,9 +18772,8 @@ static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
}
MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(
- MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
@@ -20528,7 +18785,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
// 9 ) EFLAGS (implicit-def)
assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
- assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
+ static_assert(X86::AddrNumOperands == 5,
+ "VAARG_64 assumes 5 address operands");
unsigned DestReg = MI->getOperand(0).getReg();
MachineOperand &Base = MI->getOperand(1);
@@ -20546,7 +18804,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
@@ -20802,7 +19060,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned CountReg = MI->getOperand(0).getReg();
@@ -20885,7 +19143,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -20904,6 +19162,92 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// fallthrough --> copy0MBB
MachineBasicBlock *thisMBB = BB;
MachineFunction *F = BB->getParent();
+
+ // We also lower double CMOVs:
+ // (CMOV (CMOV F, T, cc1), T, cc2)
+  // to two successive branches. For that, we look for another CMOV as the
+ // following instruction.
+ //
+ // Without this, we would add a PHI between the two jumps, which ends up
+ // creating a few copies all around. For instance, for
+ //
+ // (sitofp (zext (fcmp une)))
+ //
+ // we would generate:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // movaps %xmm0, %xmm1
+ // jne .LBB5_2
+ // xorps %xmm1, %xmm1
+ // .LBB5_2:
+ // jp .LBB5_4
+ // movaps %xmm1, %xmm0
+ // .LBB5_4:
+ // retq
+ //
+ // because this custom-inserter would have generated:
+ //
+ // A
+ // | \
+ // | B
+ // | /
+ // C
+ // | \
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // B: empty
+ // C: Z = PHI [X, A], [Y, B]
+ // D: empty
+ // E: PHI [X, C], [Z, D]
+ //
+ // If we lower both CMOVs in a single step, we can instead generate:
+ //
+ // A
+ // | \
+ // | C
+ // | /|
+ // |/ |
+ // | |
+ // | D
+ // | /
+ // E
+ //
+ // A: X = ...; Y = ...
+ // D: empty
+ // E: PHI [X, A], [X, C], [Y, D]
+ //
+ // Which, in our sitofp/fcmp example, gives us something like:
+ //
+ // ucomiss %xmm1, %xmm0
+ // movss <1.0f>, %xmm0
+ // jne .LBB5_4
+ // jp .LBB5_4
+ // xorps %xmm0, %xmm0
+ // .LBB5_4:
+ // retq
+ //
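  // A source-level pattern that typically produces this nested CMOV is a
  // floating-point != compare feeding a select: != is "unordered or not
  // equal", so it needs both JNE and JP. Illustrative only; exact codegen
  // depends on the optimizer:
  //
  //   float SelectOnUNE(float a, float b) { return (a != b) ? 1.0f : 0.0f; }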
+ MachineInstr *NextCMOV = nullptr;
+ MachineBasicBlock::iterator NextMIIt =
+ std::next(MachineBasicBlock::iterator(MI));
+ if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() &&
+ NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() &&
+ NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg())
+ NextCMOV = &*NextMIIt;
+
+ MachineBasicBlock *jcc1MBB = nullptr;
+
+ // If we have a double CMOV, we lower it to two successive branches to
+ // the same block. EFLAGS is used by both, so mark it as live in the second.
+ if (NextCMOV) {
+ jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, jcc1MBB);
+ jcc1MBB->addLiveIn(X86::EFLAGS);
+ }
+
MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(It, copy0MBB);
@@ -20911,10 +19255,11 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo *TRI =
- BB->getParent()->getSubtarget().getRegisterInfo();
- if (!MI->killsRegister(X86::EFLAGS) &&
- !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+
+ MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI;
+ if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
sinkMBB->addLiveIn(X86::EFLAGS);
}
@@ -20925,7 +19270,19 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
// Add the true and fallthrough blocks as its successors.
- BB->addSuccessor(copy0MBB);
+ if (NextCMOV) {
+ // The fallthrough block may be jcc1MBB, if we have a double CMOV.
+ BB->addSuccessor(jcc1MBB);
+
+    // In that case, jcc1MBB will itself fall through to copy0MBB, or
+    // jump to the sinkMBB.
+ jcc1MBB->addSuccessor(copy0MBB);
+ jcc1MBB->addSuccessor(sinkMBB);
+ } else {
+ BB->addSuccessor(copy0MBB);
+ }
+
+ // The true block target of the first (or only) branch is always sinkMBB.
BB->addSuccessor(sinkMBB);
// Create the conditional branch instruction.
@@ -20933,6 +19290,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+ if (NextCMOV) {
+ unsigned Opc2 = X86::GetCondBranchFromCond(
+ (X86::CondCode)NextCMOV->getOperand(3).getImm());
+ BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
+ }
+
// copy0MBB:
// %FalseValue = ...
// # fallthrough to sinkMBB
@@ -20941,10 +19304,22 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// sinkMBB:
// %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
// ...
- BuildMI(*sinkMBB, sinkMBB->begin(), DL,
- TII->get(X86::PHI), MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
- .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+ MachineInstrBuilder MIB =
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ // If we have a double CMOV, the second Jcc provides the same incoming
+ // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
+ if (NextCMOV) {
+ MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB);
+ // Copy the PHI result to the register defined by the second CMOV.
+ BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
+ DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg())
+ .addReg(MI->getOperand(0).getReg());
+ NextCMOV->eraseFromParent();
+ }
MI->eraseFromParent(); // The pseudo instruction is gone now.
return sinkMBB;
@@ -20954,7 +19329,7 @@ MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -21027,10 +19402,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
- const uint32_t *RegMask = MF->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
@@ -21087,7 +19460,6 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(!Subtarget->isTargetMachO());
@@ -21106,8 +19478,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
@@ -21116,10 +19487,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
- const uint32_t *RegMask = F->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
@@ -21164,7 +19533,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
@@ -21271,8 +19640,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
@@ -21290,8 +19658,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
- const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
- const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+ const bool Uses64BitFramePtr =
+ Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
@@ -21314,7 +19682,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
@@ -21329,8 +19697,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
@@ -21449,7 +19816,7 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
default: llvm_unreachable("Unrecognized FMA variant.");
}
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
MachineInstrBuilder MIB =
BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
.addOperand(MI->getOperand(0))
@@ -21472,6 +19839,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
+ case X86::TAILJMPd64_REX:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
llvm_unreachable("TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
@@ -21502,6 +19872,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_V8I1:
+ case X86::CMOV_V16I1:
+ case X86::CMOV_V32I1:
+ case X86::CMOV_V64I1:
return EmitLoweredSelect(MI, BB);
case X86::FP32_TO_INT16_IN_MEM:
@@ -21514,7 +19888,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
MachineFunction *F = BB->getParent();
- const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// Change the floating point control register to use "round towards zero"
@@ -21598,7 +19972,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRM128MEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
@@ -21611,16 +19985,15 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::VPCMPESTRIMEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
- Subtarget);
+ return EmitMonitor(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -21874,7 +20247,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
SDValue(ResNode.getNode(), 1));
}
- return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+ return DAG.getBitcast(VT, ResNode);
}
}
@@ -21933,7 +20306,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// Just remove no-op shuffle masks.
if (Mask.size() == 1) {
- DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
/*AddTo*/ true);
return true;
}
@@ -21948,9 +20321,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// Note that even with AVX we prefer the PSHUFD form of shuffle for integer
// vectors because it can have a load folded into it that UNPCK cannot. This
// doesn't preclude something switching to the shorter encoding post-RA.
- if (FloatDomain) {
- if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
- bool Lo = Mask.equals(0, 0);
+ //
+ // FIXME: Should teach these routines about AVX vector widths.
+ if (FloatDomain && VT.getSizeInBits() == 128) {
+ if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
+ bool Lo = Mask.equals({0, 0});
unsigned Shuffle;
MVT ShuffleVT;
// Check if we have SSE3 which will let us use MOVDDUP. That instruction
@@ -21967,43 +20342,43 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
}
if (Depth == 1 && Root->getOpcode() == Shuffle)
return false; // Nothing to do!
- Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+ Op = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Op.getNode());
if (Shuffle == X86ISD::MOVDDUP)
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
else
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
/*AddTo*/ true);
return true;
}
if (Subtarget->hasSSE3() &&
- (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
- bool Lo = Mask.equals(0, 0, 2, 2);
+ (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) {
+ bool Lo = Mask.equals({0, 0, 2, 2});
unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
MVT ShuffleVT = MVT::v4f32;
if (Depth == 1 && Root->getOpcode() == Shuffle)
return false; // Nothing to do!
- Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+ Op = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Op.getNode());
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
/*AddTo*/ true);
return true;
}
- if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
- bool Lo = Mask.equals(0, 0, 1, 1);
+ if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) {
+ bool Lo = Mask.equals({0, 0, 1, 1});
unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
MVT ShuffleVT = MVT::v4f32;
if (Depth == 1 && Root->getOpcode() == Shuffle)
return false; // Nothing to do!
- Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+ Op = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Op.getNode());
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
/*AddTo*/ true);
return true;
}
@@ -22012,12 +20387,12 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
// variants as none of these have single-instruction variants that are
// superior to the UNPCK formulation.
- if (!FloatDomain &&
- (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
- Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
- Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
- Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
- 15))) {
+ if (!FloatDomain && VT.getSizeInBits() == 128 &&
+ (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
+ Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
+ Mask.equals(
+ {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) {
bool Lo = Mask[0] == 0;
unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
if (Depth == 1 && Root->getOpcode() == Shuffle)
@@ -22033,11 +20408,11 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
default:
llvm_unreachable("Impossible mask size!");
};
- Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+ Op = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Op.getNode());
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
/*AddTo*/ true);
return true;
}
@@ -22053,9 +20428,9 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// in practice PSHUFB tends to be *very* fast so we're more aggressive.
if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
SmallVector<SDValue, 16> PSHUFBMask;
- assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
- int Ratio = 16 / Mask.size();
- for (unsigned i = 0; i < 16; ++i) {
+ int NumBytes = VT.getSizeInBits() / 8;
+ int Ratio = NumBytes / Mask.size();
+ for (int i = 0; i < NumBytes; ++i) {
if (Mask[i / Ratio] == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
@@ -22063,16 +20438,17 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
int M = Mask[i / Ratio] != SM_SentinelZero
? Ratio * Mask[i / Ratio] + i % Ratio
: 255;
- PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
+ PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
- Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ Op = DAG.getBitcast(ByteVT, Input);
DCI.AddToWorklist(Op.getNode());
SDValue PSHUFBMaskOp =
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
+ DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
- Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
+ Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
DCI.AddToWorklist(Op.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op),
/*AddTo*/ true);
return true;
}
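  // Worked example (illustrative): a v4i32 mask {1, 0, 3, 2} on a 128-bit
  // input gives Ratio = 16 / 4 = 4 and the byte-wise PSHUFB mask
  // {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; SM_SentinelZero lanes map to
  // 255, whose set high bit makes PSHUFB write a zero byte.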
@@ -22128,10 +20504,6 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return false; // Bail if we hit a non-vector.
- // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
- // version should be added.
- if (VT.getSizeInBits() != 128)
- return false;
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
@@ -22234,12 +20606,26 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
+  // If we have more than 128 bits, only the low 128 bits of the shuffle mask
+  // matter. Check that the upper masks are repeats and remove them.
+ if (VT.getSizeInBits() > 128) {
+ int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+ for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+ for (int j = 0; j < LaneElts; ++j)
+ assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
+ "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+ Mask.resize(LaneElts);
+ }
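  // Worked example (illustrative): a 256-bit PSHUFD of v8i32 with the
  // lane-repeating mask {2,3,0,1, 6,7,4,5} reduces here to the single-lane
  // mask {2,3,0,1}, which is all the callers need to form the v4-style
  // shuffle immediate.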
+
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
@@ -22312,7 +20698,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
- if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+ if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
+ V.getSimpleValueType().getScalarType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
@@ -22357,14 +20744,14 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
for (int &M : Mask)
M = VMask[M];
V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
- getV4X86ShuffleImm8ForMask(Mask, DAG));
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Rebuild the chain around this new shuffle.
while (!Chain.empty()) {
SDValue W = Chain.pop_back_val();
if (V.getValueType() != W.getOperand(0).getValueType())
- V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
+ V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
switch (W.getOpcode()) {
default:
@@ -22383,7 +20770,7 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
}
}
if (V.getValueType() != N.getValueType())
- V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
+ V = DAG.getBitcast(N.getValueType(), V);
// Return the new chain to replace N.
return V;
@@ -22444,7 +20831,7 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
for (int &M : Mask)
M = VMask[M];
V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
- getV4X86ShuffleImm8ForMask(Mask, DAG));
+ getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
// Check that the shuffles didn't cancel each other out. If not, we need to
// combine to the new one.
@@ -22486,8 +20873,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
- assert(VT == MVT::v8i16);
- (void)VT;
+ assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
return SDValue(); // We combined away this shuffle, so we're done.
@@ -22495,17 +20881,18 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations. Note that it has to at least flip the
// dwords as otherwise it would have been removed as a no-op.
- if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
+ if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+ MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ V = DAG.getBitcast(DVT, V);
DCI.AddToWorklist(V.getNode());
- V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
- getV4X86ShuffleImm8ForMask(DMask, DAG));
+ V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
+ getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
DCI.AddToWorklist(V.getNode());
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ return DAG.getBitcast(VT, V);
}
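    // Worked example (illustrative): a PSHUFLW with per-lane word mask
    // {2,3,0,1} leaves words 4-7 untouched, so it is exactly a PSHUFD with
    // dword mask {1,0,2,3}, i.e. the DMask built above for DOffset == 0.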
// Look for shuffle patterns which can be implemented as a single unpack.
@@ -22533,18 +20920,14 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
int MappedMask[8];
for (int i = 0; i < 8; ++i)
MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
- const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
- const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
- if (std::equal(std::begin(MappedMask), std::end(MappedMask),
- std::begin(UnpackLoMask)) ||
- std::equal(std::begin(MappedMask), std::end(MappedMask),
- std::begin(UnpackHiMask))) {
+ if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
+ makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
// We can replace all three shuffles with an unpack.
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+ V = DAG.getBitcast(VT, D.getOperand(0));
DCI.AddToWorklist(V.getNode());
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
- DL, MVT::v8i16, V, V);
+ DL, VT, V, V);
}
}
}
@@ -22602,9 +20985,9 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
- if (!(isShuffleEquivalent(Mask, 0, 3) ||
- isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
- isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+ if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
return SDValue();
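  // For example (illustrative), the v4f32 mask {0, 5, 2, 7} takes the even
  // lanes from the first input and the odd lanes from the second, which is
  // precisely the subtract-even/add-odd lane pattern of ADDSUBPS.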
// Only specific types are legal at this point, assert so we notice if and
@@ -22684,18 +21067,14 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
CanFold = SVOp->getMaskElt(i) < 0;
if (CanFold) {
- SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
- SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+ SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
+ SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
}
}
}
- // Only handle 128 wide vector from here on.
- if (!VT.is128BitVector())
- return SDValue();
-
// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
@@ -22729,15 +21108,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformTruncateCombine - Converts truncate operation to
-/// a sequence of vector shuffle operations.
-/// It is possible when we truncate 256-bit vector to 128-bit vector
-static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
- return SDValue();
-}
-
/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
/// specific shuffle of a load can be folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
@@ -22760,7 +21130,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
if (!InVec.hasOneUse())
return SDValue();
EVT BCVT = InVec.getOperand(0).getValueType();
- if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+ if (!BCVT.isVector() ||
+ BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
return SDValue();
InVec = InVec.getOperand(0);
}
@@ -22788,7 +21159,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
: InVec.getOperand(1);
// If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+ unsigned AllowedUses = InVec.getNumOperands() > 1 &&
+ InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
@@ -22828,11 +21200,30 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
InVec.getOperand(0), Shuffle,
&ShuffleMask[0]);
- Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
+ Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
+/// \brief Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
+/// special and don't usually play with other vector types, it's better to
+/// handle them early to be sure we emit efficient code by avoiding
+/// store-load conversions.
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
+ if (N->getValueType(0) != MVT::x86mmx ||
+ N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
+ N->getOperand(0)->getValueType(0) != MVT::v2i32)
+ return SDValue();
+
+ SDValue V = N->getOperand(0);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
+ N->getValueType(0), V.getOperand(0));
+
+ return SDValue();
+}
+
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// into a somewhat faster sequence. For i686, the best sequence is apparently
@@ -22845,16 +21236,43 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return NewOp;
SDValue InputVector = N->getOperand(0);
+ SDLoc dl(InputVector);
+ // Detect mmx to i32 conversion through a v2i32 elt extract.
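+  // Illustrative shape handled here:
+  //   (i32 (extract_vector_elt (v2i32 (bitcast x86mmx:%m)), 0))
+  //     --> (X86ISD::MMX_MOVD2W %m)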
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ N->getValueType(0) == MVT::i32 &&
+ InputVector.getValueType() == MVT::v2i32) {
+
+ // The bitcast source is a direct mmx result.
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0);
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
+
+ // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
+ SDValue MMXSrcOp = MMXSrc.getOperand(0);
+ if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
+ MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
+ MMXSrcOp.getOpcode() == ISD::BITCAST &&
+ MMXSrcOp.getValueType() == MVT::v1i64 &&
+ MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ MMXSrcOp.getOperand(0));
+ }
- // Detect whether we are trying to convert from mmx to i32 and the bitcast
- // from mmx to v2i32 has a single usage.
- if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
- InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
- InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- InputVector.getNode()->getOperand(0));
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) &&
+ InputVector.getOpcode() == ISD::BITCAST &&
+ dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
+ uint64_t ExtractedElt =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ uint64_t InputValue =
+ cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ uint64_t Res = (InputValue >> ExtractedElt) & 1;
+ return DAG.getConstant(Res, dl, MVT::i1);
+ }
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
if (InputVector.getValueType() != MVT::v4i32)
@@ -22900,17 +21318,16 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
// otherwise bounce the vector off the cache.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vals[4];
- SDLoc dl(InputVector);
if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
- SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
+ SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
- DAG.getConstant(0, VecIdxTy));
+ DAG.getConstant(0, dl, VecIdxTy));
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
- DAG.getConstant(1, VecIdxTy));
+ DAG.getConstant(1, dl, VecIdxTy));
- SDValue ShAmt = DAG.getConstant(32,
+ SDValue ShAmt = DAG.getConstant(32, dl,
DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
@@ -22930,7 +21347,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
// Replace each use (extract) with a load of the appropriate element.
for (unsigned i = 0; i < 4; ++i) {
uint64_t Offset = EltSize * i;
- SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+ SDValue OffsetVal = DAG.getConstant(Offset, dl, TLI.getPointerTy());
SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
StackPtr, OffsetVal);
@@ -23013,16 +21430,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
default: break;
case ISD::SETULT:
case ISD::SETULE:
- Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
+ Opc = hasUnsigned ? X86ISD::UMIN : 0u; break;
case ISD::SETUGT:
case ISD::SETUGE:
- Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
+ Opc = hasUnsigned ? X86ISD::UMAX : 0u; break;
case ISD::SETLT:
case ISD::SETLE:
- Opc = hasSigned ? X86ISD::SMIN : 0; break;
+ Opc = hasSigned ? X86ISD::SMIN : 0u; break;
case ISD::SETGT:
case ISD::SETGE:
- Opc = hasSigned ? X86ISD::SMAX : 0; break;
+ Opc = hasSigned ? X86ISD::SMAX : 0u; break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
@@ -23031,16 +21448,16 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
default: break;
case ISD::SETULT:
case ISD::SETULE:
- Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
+ Opc = hasUnsigned ? X86ISD::UMAX : 0u; break;
case ISD::SETUGT:
case ISD::SETUGE:
- Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
+ Opc = hasUnsigned ? X86ISD::UMIN : 0u; break;
case ISD::SETLT:
case ISD::SETLE:
- Opc = hasSigned ? X86ISD::SMAX : 0; break;
+ Opc = hasSigned ? X86ISD::SMAX : 0u; break;
case ISD::SETGT:
case ISD::SETGE:
- Opc = hasSigned ? X86ISD::SMIN : 0; break;
+ Opc = hasSigned ? X86ISD::SMIN : 0u; break;
}
}
@@ -23291,21 +21708,21 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
TrueC->getAPIntValue().isPowerOf2()) {
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, Cond.getValueType()));
+ DAG.getConstant(1, DL, Cond.getValueType()));
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
- DAG.getConstant(ShAmt, MVT::i8));
+ DAG.getConstant(ShAmt, DL, MVT::i8));
}
// Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, Cond.getValueType()));
+ DAG.getConstant(1, DL, Cond.getValueType()));
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
@@ -23340,7 +21757,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, Cond.getValueType()));
+ DAG.getConstant(1, DL, Cond.getValueType()));
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
@@ -23348,7 +21765,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
- DAG.getConstant(Diff, Cond.getValueType()));
+ DAG.getConstant(Diff, DL,
+ Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
@@ -23436,7 +21854,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
(-OpRHSConst->getAPIntValue() - 1))
return DAG.getNode(
X86ISD::SUBUS, DL, VT, OpLHS,
- DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
+ DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
// Another special case: If C was a sign bit, the sub has been
// canonicalized into a xor.
@@ -23450,7 +21868,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// don't rely on particular values of undef lanes.
return DAG.getNode(
X86ISD::SUBUS, DL, VT, OpLHS,
- DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
+ DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
}
}
}
@@ -23518,31 +21936,41 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (TValIsAllOnes && FValIsAllZeros)
Ret = Cond;
else if (TValIsAllOnes)
- Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
- DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
+ Ret =
+ DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
else if (FValIsAllZeros)
Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
- DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
+ DAG.getBitcast(CondVT, LHS));
- return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
+ return DAG.getBitcast(VT, Ret);
}
}
- // If we know that this node is legal then we know that it is going to be
- // matched by one of the SSE/AVX BLEND instructions. These instructions only
- // depend on the highest bit in each word. Try to use SimplifyDemandedBits
- // to simplify previous instructions.
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+  // selector for a 512-bit vector on a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // Iff we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if ((N->getOpcode() == ISD::VSELECT ||
+ N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+ !DCI.isBeforeLegalize() && !VT.is512BitVector()) {
+ SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
+ // If this is a *dynamic* select (non-constant condition) and we can match
+ // this node with one of the variable blend instructions, restructure the
+ // condition so that the blends can use the high bit of each element and use
+ // SimplifyDemandedBits to simplify the condition operand.
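+  // Illustrative: when (vselect %cond, %a, %b) is matched as a variable blend
+  // such as BLENDVPS/PBLENDVB, only the sign bit of each condition element is
+  // consulted, so the remaining condition bits are don't-cares.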
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
- // We explicitly check against v8i16 and v16i16 because, although
- // they're marked as Custom, they might only be legal when Cond is a
- // build_vector of constants. This will be taken care in a later
- // condition.
- (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
- VT != MVT::v8i16) &&
- // Don't optimize vector of constants. Those are handled by
- // the generic code and all the bits must be properly set for
- // the generic optimizer.
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
@@ -23550,6 +21978,31 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (BitWidth == 1)
return SDValue();
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getScalarType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+ return SDValue();
+  // Byte blends are only available with AVX2.
+ if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
+ !Subtarget->hasAVX2())
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
@@ -23598,25 +22051,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
- // We should generate an X86ISD::BLENDI from a vselect if its argument
- // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
- // constants. This specific pattern gets generated when we split a
- // selector for a 512 bit vector in a machine without AVX512 (but with
- // 256-bit vectors), during legalization:
- //
- // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
- //
- // Iff we find this pattern and the build_vectors are built from
- // constants, we translate the vselect into a shuffle_vector that we
- // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if ((N->getOpcode() == ISD::VSELECT ||
- N->getOpcode() == X86ISD::SHRUNKBLEND) &&
- !DCI.isBeforeLegalize()) {
- SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
- if (Shuffle.getNode())
- return Shuffle;
- }
-
return SDValue();
}
@@ -23752,6 +22186,49 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
return SDValue();
}
+/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
+/// Match:
+/// (X86or (X86setcc) (X86setcc))
+/// (X86cmp (and (X86setcc) (X86setcc)), 0)
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+ X86::CondCode &CC1, SDValue &Flags,
+ bool &isAnd) {
+ if (Cond->getOpcode() == X86ISD::CMP) {
+ ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
+ if (!CondOp1C || !CondOp1C->isNullValue())
+ return false;
+
+ Cond = Cond->getOperand(0);
+ }
+
+ isAnd = false;
+
+ SDValue SetCC0, SetCC1;
+ switch (Cond->getOpcode()) {
+ default: return false;
+ case ISD::AND:
+ case X86ISD::AND:
+ isAnd = true;
+ // fallthru
+ case ISD::OR:
+ case X86ISD::OR:
+ SetCC0 = Cond->getOperand(0);
+ SetCC1 = Cond->getOperand(1);
+ break;
+  }
+
+ // Make sure we have SETCC nodes, using the same flags value.
+ if (SetCC0.getOpcode() != X86ISD::SETCC ||
+ SetCC1.getOpcode() != X86ISD::SETCC ||
+ SetCC0->getOperand(1) != SetCC1->getOperand(1))
+ return false;
+
+ CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
+ CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
+ Flags = SetCC0->getOperand(1);
+ return true;
+}
+
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -23785,7 +22262,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
// Extra check as FCMOV only supports a subset of X86 cond.
(FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
SDValue Ops[] = { FalseOp, TrueOp,
- DAG.getConstant(CC, MVT::i8), Flags };
+ DAG.getConstant(CC, DL, MVT::i8), Flags };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
}
@@ -23807,14 +22284,14 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
// shift amount.
if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(CC, MVT::i8), Cond);
+ DAG.getConstant(CC, DL, MVT::i8), Cond);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
- DAG.getConstant(ShAmt, MVT::i8));
+ DAG.getConstant(ShAmt, DL, MVT::i8));
if (N->getNumValues() == 2) // Dead flag value?
return DCI.CombineTo(N, Cond, SDValue());
return Cond;
@@ -23824,7 +22301,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
// for any integer data type, including i8/i16.
if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(CC, MVT::i8), Cond);
+ DAG.getConstant(CC, DL, MVT::i8), Cond);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
@@ -23862,14 +22339,14 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
if (isFastMultiplier) {
APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
- DAG.getConstant(CC, MVT::i8), Cond);
+ DAG.getConstant(CC, DL, MVT::i8), Cond);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
- DAG.getConstant(Diff, Cond.getValueType()));
+ DAG.getConstant(Diff, DL, Cond.getValueType()));
// Add the base if non-zero.
if (FalseC->getAPIntValue() != 0)
@@ -23915,12 +22392,50 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = { FalseOp, Cond.getOperand(0),
- DAG.getConstant(CC, MVT::i8), Cond };
+ DAG.getConstant(CC, DL, MVT::i8), Cond };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
}
}
}
+ // Fold and/or of setcc's to double CMOV:
+ // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
+ // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
+ //
+ // This combine lets us generate:
+ // cmovcc1 (jcc1 if we don't have CMOV)
+ // cmovcc2 (same)
+ // instead of:
+ // setcc1
+ // setcc2
+ // and/or
+ // cmovne (jne if we don't have CMOV)
+ // When we can't use the CMOV instruction, it might increase branch
+ // mispredicts.
+ // When we can use CMOV, or when there is no mispredict, this improves
+ // throughput and reduces register pressure.
+ //
+ if (CC == X86::COND_NE) {
+ SDValue Flags;
+ X86::CondCode CC0, CC1;
+ bool isAndSetCC;
+ if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
+ if (isAndSetCC) {
+ std::swap(FalseOp, TrueOp);
+ CC0 = X86::GetOppositeBranchCondition(CC0);
+ CC1 = X86::GetOppositeBranchCondition(CC1);
+ }
+
+ SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
+ Flags};
+ SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
+ SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
+ SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
+ return CMOV;
+ }
+ }
+
return SDValue();
}
@@ -23931,24 +22446,16 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
default: return SDValue();
// SSE/AVX/AVX2 blend intrinsics.
case Intrinsic::x86_avx2_pblendvb:
- case Intrinsic::x86_avx2_pblendw:
- case Intrinsic::x86_avx2_pblendd_128:
- case Intrinsic::x86_avx2_pblendd_256:
// Don't try to simplify this intrinsic if we don't have AVX2.
if (!Subtarget->hasAVX2())
return SDValue();
// FALL-THROUGH
- case Intrinsic::x86_avx_blend_pd_256:
- case Intrinsic::x86_avx_blend_ps_256:
case Intrinsic::x86_avx_blendv_pd_256:
case Intrinsic::x86_avx_blendv_ps_256:
// Don't try to simplify this intrinsic if we don't have AVX.
if (!Subtarget->hasAVX())
return SDValue();
// FALL-THROUGH
- case Intrinsic::x86_sse41_pblendw:
- case Intrinsic::x86_sse41_blendpd:
- case Intrinsic::x86_sse41_blendps:
case Intrinsic::x86_sse41_blendvps:
case Intrinsic::x86_sse41_blendvpd:
case Intrinsic::x86_sse41_pblendvb: {
@@ -24020,8 +22527,9 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
// Replace this packed shift intrinsic with a target independent
// shift dag node.
- SDValue Splat = DAG.getConstant(C, VT);
- return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
+ SDLoc DL(N);
+ SDValue Splat = DAG.getConstant(C, DL, VT);
+ return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat);
}
}
}
@@ -24035,7 +22543,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = N->getValueType(0);
- if (VT != MVT::i64)
+ if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -24071,17 +22579,17 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
SDValue NewMul;
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+ DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
- DAG.getConstant(MulAmt1, VT));
+ DAG.getConstant(MulAmt1, DL, VT));
if (isPowerOf2_64(MulAmt2))
NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
- DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+ DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
- DAG.getConstant(MulAmt2, VT));
+ DAG.getConstant(MulAmt2, DL, VT));
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, NewMul, false);
@@ -24108,9 +22616,11 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
APInt ShAmt = N1C->getAPIntValue();
Mask = Mask.shl(ShAmt);
- if (Mask != 0)
- return DAG.getNode(ISD::AND, SDLoc(N), VT,
- N00, DAG.getConstant(Mask, VT));
+ if (Mask != 0) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT,
+ N00, DAG.getConstant(Mask, DL, VT));
+ }
}
}
@@ -24240,7 +22750,8 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget->hasAVX512()) {
SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
- CMP01, DAG.getConstant(x86cc, MVT::i8));
+ CMP01,
+ DAG.getConstant(x86cc, DL, MVT::i8));
if (N->getValueType(0) != MVT::i1)
return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
FSetCC);
@@ -24248,7 +22759,8 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
- DAG.getConstant(x86cc, MVT::i8));
+ DAG.getConstant(x86cc, DL,
+ MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
@@ -24261,17 +22773,17 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
// and work with those going forward.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
OnesOrZeroesF);
- SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
- Vector64);
+ SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
- Vector32, DAG.getIntPtrConstant(0));
+ Vector32, DAG.getIntPtrConstant(0, DL));
IntVT = MVT::i32;
}
- SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
+ SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
- DAG.getConstant(1, IntVT));
- SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
+ DAG.getConstant(1, DL, IntVT));
+ SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+ ANDed);
return OneBitOfTruth;
}
}
@@ -24383,7 +22895,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
APInt Mask = APInt::getAllOnesValue(InBits);
Mask = Mask.zext(VT.getScalarType().getSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
- Op, DAG.getConstant(Mask, VT));
+ Op, DAG.getConstant(Mask, DL, VT));
}
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
@@ -24393,24 +22905,116 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
}
}
+static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ // A vector zext_in_reg may be represented as a shuffle,
+ // feeding into a bitcast (this represents anyext) feeding into
+ // an and with a mask.
+ // We'd like to try to combine that into a shuffle with zero
+ // plus a bitcast, removing the and.
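+  //
+  // A sketch with assumed types (v16i8 source shuffle, v4i32 result):
+  //   (and (v4i32 bitcast (shuffle %v, undef, <0,u,u,u,1,u,u,u,...>)),
+  //        splat(0xFF))
+  //   --> (v4i32 bitcast (shuffle %v, zero, <0,16,16,16,1,16,16,16,...>))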
+ if (N0.getOpcode() != ISD::BITCAST ||
+ N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+  // The other side of the AND should be a splat of 2^C - 1, where C
+ // is the number of bits in the source type.
+ if (N1.getOpcode() == ISD::BITCAST)
+ N1 = N1.getOperand(0);
+ if (N1.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+ BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
+
+ ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
+ EVT SrcType = Shuffle->getValueType(0);
+
+ // We expect a single-source shuffle
+ if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ unsigned SrcSize = SrcType.getScalarSizeInBits();
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!Vector->isConstantSplat(SplatValue, SplatUndef,
+ SplatBitSize, HasAnyUndefs))
+ return SDValue();
+
+ unsigned ResSize = N1.getValueType().getScalarSizeInBits();
+ // Make sure the splat matches the mask we expect
+ if (SplatBitSize > ResSize ||
+ (SplatValue + 1).exactLogBase2() != (int)SrcSize)
+ return SDValue();
+
+  // Make sure the input and output sizes make sense
+ if (SrcSize >= ResSize || ResSize % SrcSize)
+ return SDValue();
+
+ // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
+ // The number of u's between each two values depends on the ratio between
+ // the source and dest type.
+ unsigned ZextRatio = ResSize / SrcSize;
+ bool IsZext = true;
+ for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
+ if (i % ZextRatio) {
+ if (Shuffle->getMaskElt(i) > 0) {
+ // Expected undef
+ IsZext = false;
+ break;
+ }
+ } else {
+ if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
+ // Expected element number
+ IsZext = false;
+ break;
+ }
+ }
+ }
+
+ if (!IsZext)
+ return SDValue();
+
+ // Ok, perform the transformation - replace the shuffle with
+ // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
+ // (instead of undef) where the k elements come from the zero vector.
+ SmallVector<int, 8> Mask;
+ unsigned NumElems = SrcType.getVectorNumElements();
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (i % ZextRatio)
+ Mask.push_back(NumElems);
+ else
+ Mask.push_back(i / ZextRatio);
+
+ SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
+ Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
+ return DAG.getBitcast(N0.getValueType(), NewShuffle);
+}
+
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
- SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
- if (R.getNode())
+ if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget))
+ return Zext;
+
+ if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
return R;
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
// Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
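+  // Illustrative: (and (srl %x, 4), 0xFFF) matches with Shift = 4 and
+  // MaskSize = 12, producing (X86ISD::BEXTR %x, 4 | (12 << 8)); the control's
+  // low byte holds the start bit and the next byte holds the length.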
if (VT == MVT::i32 || VT == MVT::i64) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
// Check for BEXTR.
if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
(N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
@@ -24420,10 +23024,11 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
uint64_t Mask = MaskNode->getZExtValue();
uint64_t Shift = ShiftNode->getZExtValue();
if (isMask_64(Mask)) {
- uint64_t MaskSize = CountPopulation_64(Mask);
+ uint64_t MaskSize = countPopulation(Mask);
if (Shift + MaskSize <= VT.getSizeInBits())
return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
- DAG.getConstant(Shift | (MaskSize << 8), VT));
+ DAG.getConstant(Shift | (MaskSize << 8), DL,
+ VT));
}
}
} // BEXTR
@@ -24438,10 +23043,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (VT != MVT::v2i64 && VT != MVT::v4i64)
return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
//ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
@@ -24532,7 +23133,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Unsupported VT for PSIGN");
Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
- return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+ return DAG.getBitcast(VT, Mask);
}
// PBLENDVB only available on SSE 4.1
if (!Subtarget->hasSSE41())
@@ -24540,11 +23141,11 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
- X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
- Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
- Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
+ X = DAG.getBitcast(BlendVT, X);
+ Y = DAG.getBitcast(BlendVT, Y);
+ Mask = DAG.getBitcast(BlendVT, Mask);
Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
- return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+ return DAG.getBitcast(VT, Mask);
}
}
@@ -24553,8 +23154,8 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ bool OptForSize =
+ MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
@@ -24642,10 +23243,10 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
// Generate SUB & CMOV.
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
- DAG.getConstant(0, VT), N0.getOperand(0));
+ DAG.getConstant(0, DL, VT), N0.getOperand(0));
SDValue Ops[] = { N0.getOperand(0), Neg,
- DAG.getConstant(X86::COND_GE, MVT::i8),
+ DAG.getConstant(X86::COND_GE, DL, MVT::i8),
SDValue(Neg.getNode(), 1) };
return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
}
@@ -24690,7 +23291,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
SDValue Ptr = Ld->getBasePtr();
- SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+ SDValue Increment = DAG.getConstant(16, dl, TLI.getPointerTy());
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems/2);
@@ -24725,7 +23326,6 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = Mld->getValueType(0);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
EVT LdVT = Mld->getMemoryVT();
SDLoc dl(Mld);
@@ -24746,14 +23346,15 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
// Convert Src0 value
- SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+ SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
- assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
}
@@ -24762,14 +23363,14 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
SDValue Mask = Mld->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type
- NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ NewMask = DAG.getBitcast(WideVecVT, Mask);
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
- DAG.getConstant(0, WideVecVT),
+ DAG.getConstant(0, dl, WideVecVT),
&ShuffleVec[0]);
}
else {
@@ -24781,14 +23382,14 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
- SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
-
+
SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
Mld->getBasePtr(), NewMask, WideSrc0,
Mld->getMemoryVT(), Mld->getMemOperand(),
@@ -24805,7 +23406,6 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = Mst->getValue().getValueType();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
EVT StVT = Mst->getMemoryVT();
SDLoc dl(Mst);
@@ -24819,7 +23419,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
"Unexpected size for truncating masked store");
// We are going to use the original vector elt for storing.
// Accumulated smaller vector elements must be a multiple of the store size.
- assert (((NumElems * FromSz) % ToSz) == 0 &&
+ assert (((NumElems * FromSz) % ToSz) == 0 &&
"Unexpected ratio for truncating masked store");
unsigned SizeRatio = FromSz / ToSz;
@@ -24831,13 +23431,14 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
- SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+ SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
- assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
@@ -24847,13 +23448,13 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Mask = Mst->getMask();
if (Mask.getValueType() == VT) {
// Mask and original value have the same type
- NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ NewMask = DAG.getBitcast(WideVecVT, Mask);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
ShuffleVec[i] = NumElems*SizeRatio;
NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
- DAG.getConstant(0, WideVecVT),
+ DAG.getConstant(0, dl, WideVecVT),
&ShuffleVec[0]);
}
else {
@@ -24865,7 +23466,7 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
unsigned NumConcat = WidenNumElts / MaskNumElts;
SmallVector<SDValue, 16> Ops(NumConcat);
- SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
Ops[0] = Mask;
for (unsigned i = 1; i != NumConcat; ++i)
Ops[i] = ZeroVal;
@@ -24899,7 +23500,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
- SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+ SDValue Stride = DAG.getConstant(16, dl, TLI.getPointerTy());
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
@@ -24939,7 +23540,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
- SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+ SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
@@ -24970,9 +23571,9 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
- SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+ SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
- SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, dl,
TLI.getPointerTy());
SDValue Ptr = St->getBasePtr();
@@ -24980,7 +23581,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
StoreType, ShuffWide,
- DAG.getIntPtrConstant(i));
+ DAG.getIntPtrConstant(i, dl));
SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
St->getPointerInfo(), St->isVolatile(),
St->isNonTemporal(), St->getAlignment());
@@ -25001,10 +23602,9 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
- bool NoImplicitFloatOps = F->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
- bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
- && Subtarget->hasSSE2();
+ bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
+ bool F64IsLegal =
+ !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2();
if ((VT.isVector() ||
(VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
@@ -25065,7 +23665,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
// Otherwise, lower to two pairs of 32-bit loads / stores.
SDValue LoAddr = Ld->getBasePtr();
SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
- DAG.getConstant(4, MVT::i32));
+ DAG.getConstant(4, LdDL, MVT::i32));
SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
Ld->getPointerInfo(),
@@ -25086,7 +23686,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
LoAddr = St->getBasePtr();
HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
- DAG.getConstant(4, MVT::i32));
+ DAG.getConstant(4, StDL, MVT::i32));
SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
St->getPointerInfo(),
@@ -25099,6 +23699,27 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
MinAlign(St->getAlignment(), 4));
return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
}
+
+ // This is similar to the above case, but here we handle a scalar 64-bit
+ // integer store that is extracted from a vector on a 32-bit target.
+ // If we have SSE2, then we can treat it like a floating-point double
+ // to get past legalization. The execution dependencies fixup pass will
+ // choose the optimal machine instruction for the store if this really is
+ // an integer or v2f32 rather than an f64.
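+  // Illustrative: on a 32-bit target with SSE2,
+  //   (store (i64 extract_vector_elt v2i64:%v, %i))
+  // is rewritten below as
+  //   (store (f64 extract_vector_elt (v2f64 bitcast %v), %i))
+  // avoiding a split into two 32-bit stores.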
+ if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() &&
+ St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue OldExtract = St->getOperand(1);
+ SDValue ExtOp0 = OldExtract.getOperand(0);
+ unsigned VecSize = ExtOp0.getValueSizeInBits();
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
+ SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ BitCast, OldExtract.getOperand(1));
+ return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ }
+
return SDValue();
}
@@ -25197,7 +23818,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// If A and B occur in reverse order in RHS, then "swap" them (which means
// rewriting the mask).
if (A != C)
- CommuteVectorShuffleMask(RMask, NumElts);
+ ShuffleVectorSDNode::commuteMask(RMask);
// At this point LHS and RHS are equivalent to
// LHS = VECTOR_SHUFFLE A, B, LMask
@@ -25261,11 +23882,13 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
// F[X]OR(0.0, x) -> x
- // F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
+ // F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
@@ -25296,26 +23919,30 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
- // FAND(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
+
+ // FAND(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
return SDValue();
}
/// Do target-specific dag combines on X86ISD::FANDN nodes
static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
- // FANDN(x, 0.0) -> 0.0
// FANDN(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
+ // FANDN(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
return SDValue();
}
@@ -25391,23 +24018,76 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT InVT = N0->getValueType(0);
+ EVT InSVT = InVT.getScalarType();
+ SDLoc DL(N);
// (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
// (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
// This exposes the sext to the sdivrem lowering, so that it directly extends
// from AH (which we otherwise need to do contortions to access).
if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
- N0.getValueType() == MVT::i8 && VT == MVT::i32) {
- SDLoc dl(N);
+ InVT == MVT::i8 && VT == MVT::i32) {
SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
- SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
+ SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys,
N0.getOperand(0), N0.getOperand(1));
DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
return R.getValue(1);
}
- if (!DCI.isBeforeLegalizeOps())
+ if (!DCI.isBeforeLegalizeOps()) {
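+    // Note: a sign-extended i1 is equivalent to a select between all-ones and
+    // zero, e.g. (i32 sext i1:%b) == (select %b, -1, 0); that is what the i1
+    // case below emits.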
+ if (N0.getValueType() == MVT::i1) {
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue AllOnes =
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+ return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
+ }
return SDValue();
+ }
+
+ if (VT.isVector()) {
+ auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) {
+ EVT InVT = N->getValueType(0);
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
+ 128 / InVT.getScalarSizeInBits());
+ SmallVector<SDValue, 8> Opnds(128 / InVT.getSizeInBits(),
+ DAG.getUNDEF(InVT));
+ Opnds[0] = N;
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
+ };
+
+    // If the target size is 128 bits, convert to ISD::SIGN_EXTEND_VECTOR_INREG,
+ // which ensures lowering to X86ISD::VSEXT (pmovsx*).
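+    // Illustrative: (v4i32 sext (v4i16 %x)) becomes
+    // (v4i32 sign_extend_vector_inreg (v8i16 concat %x, undef)), which can
+    // then select to pmovsxwd.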
+ if (VT.getSizeInBits() == 128 &&
+ (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
+ (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
+ SDValue ExOp = ExtendToVec128(DL, N0);
+ return DAG.getSignExtendVectorInReg(ExOp, DL, VT);
+ }
+
+ // On pre-AVX2 targets, split into 128-bit nodes of
+ // ISD::SIGN_EXTEND_VECTOR_INREG.
+ if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) &&
+ (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) &&
+ (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) {
+ unsigned NumVecs = VT.getSizeInBits() / 128;
+ unsigned NumSubElts = 128 / SVT.getSizeInBits();
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
+ EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
+
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, Offset = 0; i != NumVecs;
+ ++i, Offset += NumSubElts) {
+ SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
+ DAG.getIntPtrConstant(Offset, DL));
+ SrcVec = ExtendToVec128(DL, SrcVec);
+ SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT);
+ Opnds.push_back(SrcVec);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
+ }
+ }
if (!Subtarget->hasFp256())
return SDValue();
@@ -25483,7 +24163,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, VT));
+ DAG.getConstant(1, dl, VT));
}
}
@@ -25495,7 +24175,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::AND, dl, VT,
DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, VT));
+ DAG.getConstant(1, dl, VT));
}
}
if (VT.is256BitVector()) {
@@ -25534,18 +24214,18 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS,
+ SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
LHS.getOperand(1));
- return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
- DAG.getConstant(0, addV.getValueType()), CC);
+ return DAG.getSetCC(DL, N->getValueType(0), addV,
+ DAG.getConstant(0, DL, addV.getValueType()), CC);
}
if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS,
+ SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
RHS.getOperand(1));
- return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
- DAG.getConstant(0, addV.getValueType()), CC);
+ return DAG.getSetCC(DL, N->getValueType(0), addV,
+ DAG.getConstant(0, DL, addV.getValueType()), CC);
}
if (VT.getScalarType() == MVT::i1 &&
@@ -25569,12 +24249,12 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
assert(VT == LHS.getOperand(0).getValueType() &&
"Uexpected operand type");
if (CC == ISD::SETGT)
- return DAG.getConstant(0, VT);
+ return DAG.getConstant(0, DL, VT);
if (CC == ISD::SETLE)
- return DAG.getConstant(1, VT);
+ return DAG.getConstant(1, DL, VT);
if (CC == ISD::SETEQ || CC == ISD::SETGE)
return DAG.getNOT(DL, LHS.getOperand(0), VT);
-
+
assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
"Unexpected condition code!");
return LHS.getOperand(0);
@@ -25584,6 +24264,24 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
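+
+/// Narrow a full vector load down to a scalar load of element \p Index by
+/// offsetting the base address by Index times the element's store size
+/// (helper for the insertps combine below).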
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+ SelectionDAG &DAG) {
+ SDLoc dl(Load);
+ MVT VT = Load->getSimpleValueType(0);
+ MVT EVT = VT.getVectorElementType();
+ SDValue Addr = Load->getOperand(1);
+ SDValue NewAddr = DAG.getNode(
+ ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+ DAG.getConstant(Index * EVT.getStoreSize(), dl,
+ Addr.getSimpleValueType()));
+
+ SDValue NewLoad =
+ DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Load->getMemOperand(), 0, EVT.getStoreSize()));
+ return NewLoad;
+}
+
static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDLoc dl(N);
@@ -25595,20 +24293,47 @@ static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
if (MayFoldLoad(Ld)) {
// Extract the countS bits from the immediate so we can get the proper
// address when narrowing the vector load to a specific element.
- // When the second source op is a memory address, interps doesn't use
+ // When the second source op is a memory address, insertps doesn't use
// countS and just gets an f32 from that address.
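+    // (For reference: in the insertps immediate, bits [7:6] are countS, the
+    // source element index; bits [5:4] are countD, the destination slot; and
+    // bits [3:0] are the zero mask.)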
unsigned DestIndex =
cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+
Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
- } else
- return SDValue();
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
- // countS bits are ignored when loading from memory on insertps, which
- // means we don't need to explicitly set them to 0.
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
- LoadScalarToVector, N->getOperand(2));
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+ // countS bits are ignored when loading from memory on insertps, which
+ // means we don't need to explicitly set them to 0.
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+ LoadScalarToVector, N->getOperand(2));
+ }
+ return SDValue();
+}
+
+static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = N->getOperand(1);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+ // operands and changing the mask to 1. This saves us a bunch of
+ // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+ // x86InstrInfo knows how to commute this back after instruction selection
+ // if it would help register allocation.
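+  // Illustrative: (v2f64 X86ISD::BLENDI %a, %b, 2) becomes
+  // (v2f64 X86ISD::BLENDI %b, %a, 1); both select {%a[0], %b[1]}.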
+
+ // TODO: If optimizing for size or a processor that doesn't suffer from
+ // partial register update stalls, this should be transformed into a MOVSD
+ // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+ if (VT == MVT::v2f64)
+ if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+ SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+ }
+
+ return SDValue();
}
// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
@@ -25619,12 +24344,14 @@ static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
if (VT == MVT::i8)
return DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
- DAG.getConstant(1, VT));
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ EFLAGS),
+ DAG.getConstant(1, DL, VT));
assert (VT == MVT::i1 && "Unexpected type for SETCC node");
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ EFLAGS));
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
@@ -25663,7 +24390,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
if (Flags.getNode()) {
- SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
@@ -25685,7 +24412,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
if (Flags.getNode()) {
- SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
Flags);
}
@@ -25729,10 +24456,10 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
// DAG.
SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
- SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+ SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
N->getOperand(0)->getOperand(0), MaskConst);
- SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+ SDValue Res = DAG.getBitcast(VT, NewAnd);
return Res;
}
@@ -25740,7 +24467,7 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
}
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86TargetLowering *XTLI) {
+ const X86Subtarget *Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
@@ -25764,12 +24491,16 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
if (Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
EVT VT = Ld->getValueType(0);
+
+ // This transformation is not supported if the result type is f16
+ if (N->getValueType(0) == MVT::f16)
+ return SDValue();
+
if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !XTLI->getSubtarget()->is64Bit() &&
- VT == MVT::i64) {
- SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
- Ld->getChain(), Op0, DAG);
+ !Subtarget->is64Bit() && VT == MVT::i64) {
+ SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+ SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
}
@@ -25790,12 +24521,13 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
SDValue(N, 1).use_empty()) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
- SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
+ SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B,MVT::i8),
+ DAG.getConstant(X86::COND_B, DL,
+ MVT::i8),
N->getOperand(2)),
- DAG.getConstant(1, VT));
+ DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
}
@@ -25830,16 +24562,17 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
SDValue CmpOp0 = Cmp.getOperand(0);
SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
- DAG.getConstant(1, CmpOp0.getValueType()));
+ DAG.getConstant(1, DL, CmpOp0.getValueType()));
SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
if (CC == X86::COND_NE)
return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
+ DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
+ NewCmp);
return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
+ DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
}
/// PerformADDCombine - Do target-specific dag combines on integer adds.
@@ -25875,9 +24608,9 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
Op1.getOperand(0),
- DAG.getConstant(~XorC, VT));
+ DAG.getConstant(~XorC, SDLoc(Op1), VT));
return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
- DAG.getConstant(C->getAPIntValue()+1, VT));
+ DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
}
}
@@ -25926,8 +24659,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
// In this case, the inner vzext is completely dead because we're going to
// only look at bits inside of the low element. Just do the outer vzext on
// a bitcast of the input to the inner.
- return DAG.getNode(X86ISD::VZEXT, DL, VT,
- DAG.getNode(ISD::BITCAST, DL, OpVT, V));
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
}
// Check if we can bypass extracting and re-inserting an element of an input
@@ -25947,9 +24679,9 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
OrigVT.getVectorNumElements() / Ratio);
OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0, DL));
}
- Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
+ Op = DAG.getBitcast(OpVT, OrigV);
return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
}
}
@@ -25968,6 +24700,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT:
case X86ISD::SHRUNKBLEND:
return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return PerformBITCASTCombine(N, DAG);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
@@ -25983,7 +24716,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
- case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -25999,7 +24732,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG:
return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
- case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG, Subtarget);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
@@ -26022,9 +24754,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
- case X86ISD::INSERTPS:
- return PerformINSERTPSCombine(N, DAG, Subtarget);
- case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
+ case X86ISD::INSERTPS: {
+ if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
+ return PerformINSERTPSCombine(N, DAG, Subtarget);
+ break;
+ }
+ case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
}
return SDValue();
@@ -26131,27 +24866,23 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
-namespace {
- // Helper to match a string separated by whitespace.
- bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
- s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
+// Helper to match a string separated by whitespace.
+static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
+ S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
- for (unsigned i = 0, e = args.size(); i != e; ++i) {
- StringRef piece(*args[i]);
- if (!s.startswith(piece)) // Check if the piece matches.
- return false;
-
- s = s.substr(piece.size());
- StringRef::size_type pos = s.find_first_not_of(" \t");
- if (pos == 0) // We matched a prefix.
- return false;
+ for (StringRef Piece : Pieces) {
+ if (!S.startswith(Piece)) // Check if the piece matches.
+ return false;
- s = s.substr(pos);
- }
+ S = S.substr(Piece.size());
+ StringRef::size_type Pos = S.find_first_not_of(" \t");
+ if (Pos == 0) // We matched a prefix.
+ return false;
- return s.empty();
+ S = S.substr(Pos);
}
- const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
+
+ return S.empty();
}
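
A standalone sketch of the new matchAsm helper, using std::string in place of StringRef so it compiles and runs without LLVM headers (behavioural sketch only, not part of the patch):

#include <cassert>
#include <string>
#include <vector>

// Each piece must appear in order, separated from the next token by at least
// one space or tab, and the whole string must be consumed.
static bool matchAsmSketch(std::string S, const std::vector<std::string> &Pieces) {
  auto SkipWhitespace = [](const std::string &Str) {
    std::string::size_type Pos = Str.find_first_not_of(" \t");
    return Pos == std::string::npos ? std::string() : Str.substr(Pos);
  };
  S = SkipWhitespace(S); // Skip leading whitespace.
  for (const std::string &Piece : Pieces) {
    if (S.compare(0, Piece.size(), Piece) != 0) // The piece must match here.
      return false;
    S = S.substr(Piece.size());
    if (!S.empty() && S[0] != ' ' && S[0] != '\t') // We only matched a prefix.
      return false;
    S = SkipWhitespace(S);
  }
  return S.empty();
}

int main() {
  assert(matchAsmSketch("  bswap $0", {"bswap", "$0"}));  // extra whitespace is fine
  assert(!matchAsmSketch("bswapl $0", {"bswap", "$0"}));  // "bswap" only matches a prefix
  return 0;
}
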
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
@@ -26191,12 +24922,12 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
// ops instead of emitting the bswap asm. For now, we don't support 486 or
// lower so don't worry about this.
// bswap $0
- if (matchAsm(AsmPieces[0], "bswap", "$0") ||
- matchAsm(AsmPieces[0], "bswapl", "$0") ||
- matchAsm(AsmPieces[0], "bswapq", "$0") ||
- matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
- matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
- matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
+ if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
+ matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
+ matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -26205,8 +24936,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
// rorw $$8, ${0:w} --> llvm.bswap.i16
if (CI->getType()->isIntegerTy(16) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
- (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
- matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
+ (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
+ matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
@@ -26218,9 +24949,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
case 3:
if (CI->getType()->isIntegerTy(32) &&
IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
- matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
- matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
- matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
+ matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
+ matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
+ matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
@@ -26235,9 +24966,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
- if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
- matchAsm(AsmPieces[1], "bswap", "%edx") &&
- matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
+ if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
+ matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
+ matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
return IntrinsicLowering::LowerToByteSwap(CI);
}
}
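
For reference, the C-level inline asm idioms these patterns correspond to. Illustration only, not part of the patch; the assumption is that the frontend renders "%0" as "$0" and "%w0" as "${0:w}" in the IR-level asm string, which is what matchAsm compares against:

// With a plain "=r,0" constraint pair (plus the usual flag clobbers),
// these idioms can be replaced by the llvm.bswap intrinsic instead of
// being emitted verbatim.
static inline unsigned SwapBytes32(unsigned X) {
  __asm__("bswap %0" : "=r"(X) : "0"(X));
  return X;
}

static inline unsigned short SwapBytes16(unsigned short X) {
  __asm__("rorw $8, %w0" : "=r"(X) : "0"(X));
  return X;
}
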
@@ -26373,7 +25104,7 @@ TargetLowering::ConstraintWeight
break;
case 'G':
case 'C':
- if (dyn_cast<ConstantFP>(CallOperandVal)) {
+ if (isa<ConstantFP>(CallOperandVal)) {
weight = CW_Constant;
}
break;
@@ -26428,7 +25159,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'I':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
- Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -26436,7 +25168,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'J':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 63) {
- Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -26444,7 +25177,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (isInt<8>(C->getSExtValue())) {
- Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -26453,7 +25187,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
(Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
- Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -26461,7 +25196,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'M':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 3) {
- Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -26469,7 +25205,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
- Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -26477,7 +25214,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
case 'O':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 127) {
- Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -26488,7 +25226,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getSExtValue())) {
// Widen to 64 bits here to get it sign extended.
- Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
+ Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
// FIXME gcc accepts some relocatable values here too, but only in certain
@@ -26501,7 +25239,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
C->getZExtValue())) {
- Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
+ Op.getValueType());
break;
}
}
@@ -26513,7 +25252,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// Literal immediates are always ok.
if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
// Widen to 64 bits here to get it sign extended.
- Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
+ Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
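
For reference, the constraint letters handled above are the x86 immediate-range constraints as used in GNU-style inline asm; for example "I" accepts 0..31 (shift counts) and "N" accepts 0..255 (I/O port numbers). Illustration only, not part of the patch:

static inline unsigned ShiftLeftByThree(unsigned X) {
  // "I" constrains the immediate to 0..31, suitable for a shift count.
  __asm__("shll %1, %0" : "+r"(X) : "I"(3));
  return X;
}

static inline void OutByte(unsigned char V) {
  // "N" constrains the immediate to 0..255, suitable for an I/O port.
  __asm__ volatile("outb %0, %1" : : "a"(V), "N"(0x80));
}
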
@@ -26571,8 +25310,9 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-std::pair<unsigned, const TargetRegisterClass*>
-X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ const std::string &Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
@@ -26678,7 +25418,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
- Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+ Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
@@ -26777,6 +25517,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Res.first = DestReg;
Res.second = &X86::GR64RegClass;
}
+ } else if (VT != MVT::Other) {
+        // Type mismatch and not a clobber: return an error.
+ Res.first = 0;
+ Res.second = nullptr;
}
} else if (Res.second == &X86::FR32RegClass ||
Res.second == &X86::FR64RegClass ||
@@ -26802,13 +25546,23 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Res.second = &X86::VR256RegClass;
else if (X86::VR512RegClass.hasType(VT))
Res.second = &X86::VR512RegClass;
+ else if (VT != MVT::Other) {
+      // Type mismatch and not a clobber: return an error.
+ Res.first = 0;
+ Res.second = nullptr;
+ }
+ } else if (VT != MVT::Other) {
+    // Type mismatch and not a clobber: return an error.
+ Res.first = 0;
+ Res.second = nullptr;
}
return Res;
}
int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
- Type *Ty) const {
+ Type *Ty,
+ unsigned AS) const {
// Scaling factors are not free at all.
// An indexed folded instruction, i.e., inst (reg1, reg2, scale),
// will take 2 allocations in the out of order engine instead of 1
@@ -26827,7 +25581,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
// E.g., on Haswell:
// vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
// vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
- if (isLegalAddressingMode(AM, Ty))
+ if (isLegalAddressingMode(AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1
// as soon as we use a second register.
return AM.Scale != 0;
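
Illustration only (not part of the patch): an access like the one below typically lowers to a scaled two-register addressing mode, which is the case the cost model above charges for because it ties up a second register (AM.Scale != 0):

// Typically lowers to something like: movl (%rdi,%rsi,4), %eax
// i.e. base + index*scale, occupying both a base and an index register.
int LoadElement(const int *Base, long Index) {
  return Base[Index];
}
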