Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.cpp | 3094
1 file changed, 1897 insertions(+), 1197 deletions(-)
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index 04299f3..4af12e4 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49,6 +49,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <bitset>
+#include <cctype>
using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
@@ -62,41 +63,33 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
/// simple subregister reference. Idx is an index in the 128 bits we
want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
- DebugLoc dl) {
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+ SelectionDAG &DAG, DebugLoc dl) {
EVT VT = Vec.getValueType();
- assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
+ assert(VT.is256BitVector() && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
- int Factor = VT.getSizeInBits()/128;
+ unsigned Factor = VT.getSizeInBits()/128;
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
// Extract from UNDEF is UNDEF.
if (Vec.getOpcode() == ISD::UNDEF)
- return DAG.getNode(ISD::UNDEF, dl, ResultVT);
-
- if (isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return DAG.getUNDEF(ResultVT);
- // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
- // we can match to VEXTRACTF128.
- unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+ // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR
+ // we can match to VEXTRACTF128.
+ unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
- // This is the index of the first element of the 128-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
- * ElemsPerChunk);
+ // This is the index of the first element of the 128-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ * ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
- SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
- VecIdx);
-
- return Result;
- }
+ SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
+ VecIdx);
- return SDValue();
+ return Result;
}
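
As a quick cross-check of the index arithmetic above, here is a standalone sketch (plain C++, not LLVM API; the helper name is invented) of the NormalizedIdxVal computation, which rounds an element index down to the first element of its 128-bit chunk:

#include <cassert>

// Round an element index down to the first element of its 128-bit chunk,
// mirroring the NormalizedIdxVal computation in Extract128BitVector.
unsigned normalizeTo128BitChunk(unsigned IdxVal, unsigned ElemBits) {
  unsigned ElemsPerChunk = 128 / ElemBits;
  return ((IdxVal * ElemBits) / 128) * ElemsPerChunk;
}

int main() {
  assert(normalizeTo128BitChunk(5, 32) == 4); // v8i32: element 5 -> upper chunk starts at 4
  assert(normalizeTo128BitChunk(1, 64) == 0); // v4i64: element 1 -> lower chunk starts at 0
  return 0;
}

The EXTRACT_SUBVECTOR emitted at that normalized index is what the AVX patterns then match to VEXTRACTF128.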
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
@@ -104,34 +97,41 @@ static SDValue Extract128BitVector(SDValue Vec,
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result,
- SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+ unsigned IdxVal, SelectionDAG &DAG,
DebugLoc dl) {
- if (isa<ConstantSDNode>(Idx)) {
- EVT VT = Vec.getValueType();
- assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
+ // Inserting UNDEF leaves Result unchanged
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
- EVT ElVT = VT.getVectorElementType();
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- EVT ResultVT = Result.getValueType();
+ EVT VT = Vec.getValueType();
+ assert(VT.is128BitVector() && "Unexpected vector size!");
- // Insert the relevant 128 bits.
- unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+ EVT ElVT = VT.getVectorElementType();
+ EVT ResultVT = Result.getValueType();
- // This is the index of the first element of the 128-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
- * ElemsPerChunk);
+ // Insert the relevant 128 bits.
+ unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
- Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
- VecIdx);
- return Result;
- }
+ // This is the index of the first element of the 128-bit chunk
+ // we want.
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+ * ElemsPerChunk);
- return SDValue();
+ SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
+ VecIdx);
+}
+
+/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+ unsigned NumElems, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+ return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
}
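
In DAG terms the helper builds INSERT_SUBVECTOR(INSERT_SUBVECTOR(undef, V1, 0), V2, NumElems/2), i.e. two VINSERTF128s. A minimal scalar model of the resulting element layout (illustrative C++ only, fixed at 4-element halves):

#include <array>
#include <cassert>

// V1 fills the low half (insert at 0), V2 the high half (insert at NumElems/2).
std::array<int, 8> concatHalves(const std::array<int, 4> &V1,
                                const std::array<int, 4> &V2) {
  std::array<int, 8> R{}; // stands in for the UNDEF destination
  for (int i = 0; i != 4; ++i) R[i] = V1[i];
  for (int i = 0; i != 4; ++i) R[4 + i] = V2[i];
  return R;
}

int main() {
  std::array<int, 8> R = concatHalves({0, 1, 2, 3}, {4, 5, 6, 7});
  assert(R[0] == 0 && R[4] == 4 && R[7] == 7);
  return 0;
}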
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
@@ -140,10 +140,12 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
if (Subtarget->isTargetEnvMacho()) {
if (is64Bit)
- return new X8664_MachoTargetObjectFile();
+ return new X86_64MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
}
+ if (Subtarget->isTargetLinux())
+ return new X86LinuxTargetObjectFile();
if (Subtarget->isTargetELF())
return new TargetLoweringObjectFileELF();
if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
@@ -162,7 +164,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
TD = getTargetData();
// Set up the TargetLowering object.
- static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
+ static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
// X86 is weird, it always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
@@ -171,11 +173,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// For 64-bit since we have so many registers use the ILP scheduler, for
// 32-bit code use the register pressure specific scheduling.
- // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling.
- if (Subtarget->is64Bit())
+ // For Atom, always use ILP scheduling.
+ if (Subtarget->isAtom())
+ setSchedulingPreference(Sched::ILP);
+ else if (Subtarget->is64Bit())
setSchedulingPreference(Sched::ILP);
- else if (Subtarget->isAtom())
- setSchedulingPreference(Sched::Hybrid);
else
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -215,11 +217,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Set up the register classes.
- addRegisterClass(MVT::i8, X86::GR8RegisterClass);
- addRegisterClass(MVT::i16, X86::GR16RegisterClass);
- addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+ addRegisterClass(MVT::i8, &X86::GR8RegClass);
+ addRegisterClass(MVT::i16, &X86::GR16RegClass);
+ addRegisterClass(MVT::i32, &X86::GR32RegClass);
if (Subtarget->is64Bit())
- addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+ addRegisterClass(MVT::i64, &X86::GR64RegClass);
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -345,7 +347,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
- for (unsigned i = 0, e = 4; i != e; ++i) {
+ for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
@@ -492,7 +494,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setShouldFoldAtomicFences(true);
// Expand certain atomics
- for (unsigned i = 0, e = 4; i != e; ++i) {
+ for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
MVT VT = IntVTs[i];
setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
@@ -567,8 +569,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, X86::FR32RegisterClass);
- addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::FR64RegClass);
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS , MVT::f64, Custom);
@@ -599,8 +601,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
} else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, X86::FR32RegisterClass);
- addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
setOperationAction(ISD::FABS , MVT::f32, Custom);
@@ -632,8 +634,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
} else if (!TM.Options.UseSoftFloat) {
// f32 and f64 in x87.
// Set up the FP register classes.
- addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
- addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+ addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+ addRegisterClass(MVT::f32, &X86::RFP32RegClass);
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
setOperationAction(ISD::UNDEF, MVT::f32, Expand);
@@ -641,7 +643,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f32 , Expand);
setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f32 , Expand);
setOperationAction(ISD::FCOS , MVT::f64 , Expand);
}
addLegalFPImmediate(APFloat(+0.0)); // FLD0
@@ -660,7 +664,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Long double always uses X87.
if (!TM.Options.UseSoftFloat) {
- addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
+ addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
@@ -705,8 +709,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+ for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) {
setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
@@ -729,6 +733,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FMA, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
@@ -764,8 +769,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand);
- for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction((MVT::SimpleValueType)VT,
(MVT::SimpleValueType)InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
@@ -776,7 +781,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// with -msoft-float, disable use of MMX as well.
if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
- addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
+ addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
// No operations on x86mmx supported, everything uses intrinsics.
}
@@ -813,7 +818,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
- addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
setOperationAction(ISD::FADD, MVT::v4f32, Legal);
setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
@@ -826,18 +831,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
- addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
// FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
// registers cannot be used even for integer operations.
- addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
- addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
- addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
- addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
+ addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
+ addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
+ addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
setOperationAction(ISD::ADD, MVT::v16i8, Legal);
setOperationAction(ISD::ADD, MVT::v8i16, Legal);
@@ -867,27 +871,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
-
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
- for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
- EVT VT = (MVT::SimpleValueType)i;
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
// Do not attempt to custom lower non-power-of-2 vectors
if (!isPowerOf2_32(VT.getVectorNumElements()))
continue;
// Do not attempt to custom lower non-128-bit vectors
if (!VT.is128BitVector())
continue;
- setOperationAction(ISD::BUILD_VECTOR,
- VT.getSimpleVT().SimpleTy, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE,
- VT.getSimpleVT().SimpleTy, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT,
- VT.getSimpleVT().SimpleTy, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
@@ -903,24 +898,23 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
- for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
- MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
- EVT VT = SVT;
+ for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
// Do not attempt to promote non-128-bit vectors
if (!VT.is128BitVector())
continue;
- setOperationAction(ISD::AND, SVT, Promote);
- AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
- setOperationAction(ISD::OR, SVT, Promote);
- AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
- setOperationAction(ISD::XOR, SVT, Promote);
- AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
- setOperationAction(ISD::LOAD, SVT, Promote);
- AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
- setOperationAction(ISD::SELECT, SVT, Promote);
- AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v2i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v2i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
}
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -1007,16 +1001,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
}
- if (Subtarget->hasSSE42())
- setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
-
if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
- addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
- addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
+ addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8i32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, &X86::VR256RegClass);
setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
@@ -1040,13 +1031,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
-
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
@@ -1070,6 +1054,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ if (Subtarget->hasFMA()) {
+ setOperationAction(ISD::FMA, MVT::v8f32, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f64, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f32, Custom);
+ setOperationAction(ISD::FMA, MVT::v2f64, Custom);
+ setOperationAction(ISD::FMA, MVT::f32, Custom);
+ setOperationAction(ISD::FMA, MVT::f64, Custom);
+ }
+
if (Subtarget->hasAVX2()) {
setOperationAction(ISD::ADD, MVT::v4i64, Legal);
setOperationAction(ISD::ADD, MVT::v8i32, Legal);
@@ -1121,60 +1114,60 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
// Custom lower several nodes for 256-bit types.
- for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
- EVT VT = SVT;
+ for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
if (VT.is128BitVector())
- setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Do not attempt to custom lower other non-256-bit vectors
if (!VT.is256BitVector())
continue;
- setOperationAction(ISD::BUILD_VECTOR, SVT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
- for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
- MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
- EVT VT = SVT;
+ for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
+ MVT VT = (MVT::SimpleValueType)i;
// Do not attempt to promote non-256-bit vectors
if (!VT.is256BitVector())
continue;
- setOperationAction(ISD::AND, SVT, Promote);
- AddPromotedToType (ISD::AND, SVT, MVT::v4i64);
- setOperationAction(ISD::OR, SVT, Promote);
- AddPromotedToType (ISD::OR, SVT, MVT::v4i64);
- setOperationAction(ISD::XOR, SVT, Promote);
- AddPromotedToType (ISD::XOR, SVT, MVT::v4i64);
- setOperationAction(ISD::LOAD, SVT, Promote);
- AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64);
- setOperationAction(ISD::SELECT, SVT, Promote);
- AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v4i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v4i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
}
}
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
// of this type with custom code.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
+ for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+ VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
Custom);
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
@@ -1218,17 +1211,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::TRUNCATE);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
- if (Subtarget->hasBMI())
- setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::XOR);
computeRegisterProperties();
@@ -1243,6 +1240,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setPrefLoopAlignment(4); // 2^4 bytes.
benefitFromCodePlacementOpt = true;
+ // Predictable cmovs don't hurt on Atom because it's in-order.
+ predictableSelectIsExpensive = !Subtarget->isAtom();
+
setPrefFunctionAlignment(4); // 2^4 bytes.
}
@@ -1276,7 +1276,6 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
break;
}
}
- return;
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -1411,18 +1410,19 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{
default:
return TargetLowering::findRepresentativeClass(VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
- RRC = (Subtarget->is64Bit()
- ? X86::GR64RegisterClass : X86::GR32RegisterClass);
+ RRC = Subtarget->is64Bit() ?
+ (const TargetRegisterClass*)&X86::GR64RegClass :
+ (const TargetRegisterClass*)&X86::GR32RegClass;
break;
case MVT::x86mmx:
- RRC = X86::VR64RegisterClass;
+ RRC = &X86::VR64RegClass;
break;
case MVT::f32: case MVT::f64:
case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
case MVT::v4f32: case MVT::v2f64:
case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
case MVT::v4f64:
- RRC = X86::VR128RegisterClass;
+ RRC = &X86::VR128RegClass;
break;
}
return std::make_pair(RRC, Cost);
@@ -1457,7 +1457,7 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
+ MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
@@ -1501,6 +1501,16 @@ X86TargetLowering::LowerReturn(SDValue Chain,
SDValue ValToCopy = OutVals[i];
EVT ValVT = ValToCopy.getValueType();
+ // Promote values to the appropriate types
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::AExt)
+ ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+
// If this is x86-64, and we disabled SSE, we can't return FP values,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
@@ -1638,7 +1648,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget->is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
@@ -1655,7 +1665,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SDValue Val;
// If this is a call to a function that returns an fp value on the floating
- // point stack, we must guarantee the the value is popped from the stack, so
+ // point stack, we must guarantee the value is popped from the stack, so
// a CopyFromReg is not good enough - the copy instruction may be eliminated
// if the return value is not used. We use the FpPOP_RETVAL instruction
// instead.
@@ -1699,21 +1709,37 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
-static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+enum StructReturnType {
+ NotStructReturn,
+ RegStructReturn,
+ StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
if (Outs.empty())
- return false;
+ return NotStructReturn;
- return Outs[0].Flags.isSRet();
+ const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
}
/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
-static bool
-ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
if (Ins.empty())
- return false;
+ return NotStructReturn;
- return Ins[0].Flags.isSRet();
+ const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
}
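
To make the three-way split concrete, here is a standalone model (hypothetical Flags struct; the real helpers read ISD::ArgFlagsTy off the first out/in argument):

#include <cassert>

enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn };

struct Flags { bool SRet; bool InReg; };

// Classify like callIsStructReturn/argsAreStructReturn: no args or no sret
// means no struct return; sret+inreg is returned via register, otherwise
// via the hidden stack pointer.
StructReturnType classify(const Flags *FirstArg) {
  if (!FirstArg || !FirstArg->SRet) return NotStructReturn;
  if (FirstArg->InReg)              return RegStructReturn;
  return StackStructReturn;
}

int main() {
  Flags Stack{true, false}, Reg{true, true};
  assert(classify(&Stack) == StackStructReturn);
  assert(classify(&Reg)   == RegStructReturn);
  assert(classify(nullptr) == NotStructReturn);
  return 0;
}

Only StackStructReturn affects the callee-pops bookkeeping later in this patch (the hidden pointer must be popped); RegStructReturn does not.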
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
@@ -1850,19 +1876,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
EVT RegVT = VA.getLocVT();
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
- RC = X86::GR32RegisterClass;
+ RC = &X86::GR32RegClass;
else if (Is64Bit && RegVT == MVT::i64)
- RC = X86::GR64RegisterClass;
+ RC = &X86::GR64RegClass;
else if (RegVT == MVT::f32)
- RC = X86::FR32RegisterClass;
+ RC = &X86::FR32RegClass;
else if (RegVT == MVT::f64)
- RC = X86::FR64RegisterClass;
- else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
- RC = X86::VR256RegisterClass;
- else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
- RC = X86::VR128RegisterClass;
+ RC = &X86::FR64RegClass;
+ else if (RegVT.is256BitVector())
+ RC = &X86::VR256RegClass;
+ else if (RegVT.is128BitVector())
+ RC = &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
- RC = X86::VR64RegisterClass;
+ RC = &X86::VR64RegClass;
else
llvm_unreachable("Unknown argument type!");
@@ -2004,7 +2030,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
DAG.getIntPtrConstant(Offset));
unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -2020,7 +2046,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
SmallVector<SDValue, 11> SaveXMMOps;
SaveXMMOps.push_back(Chain);
- unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
+ unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
SaveXMMOps.push_back(ALVal);
@@ -2031,7 +2057,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
- X86::VR128RegisterClass);
+ &X86::VR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
SaveXMMOps.push_back(Val);
}
@@ -2054,7 +2080,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
- ArgsAreStructReturn(Ins))
+ argsAreStructReturn(Ins) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
@@ -2127,19 +2153,24 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
}
SDValue
-X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool doesNotRet, bool &isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ DebugLoc &dl = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool &isTailCall = CLI.IsTailCall;
+ bool isVarArg = CLI.IsVarArg;
+
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isTargetWin64();
bool IsWindows = Subtarget->isTargetWindows();
- bool IsStructRet = CallIsStructReturn(Outs);
+ StructReturnType SR = callIsStructReturn(Outs);
bool IsSibcall = false;
if (MF.getTarget().Options.DisableTailCalls)
@@ -2148,8 +2179,9 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
- Outs, OutVals, Ins, DAG);
+ isVarArg, SR != NotStructReturn,
+ MF.getFunction()->hasStructRetAttr(),
+ Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
@@ -2231,7 +2263,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
break;
case CCValAssign::AExt:
- if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
+ if (RegVT.is128BitVector()) {
// Special case: passing MMX values in XMM registers.
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
@@ -2282,27 +2314,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Build a sequence of copy-to-reg nodes chained together with token chain
- // and flag operands which copy the outgoing args into registers.
- SDValue InFlag;
- // Tail call byval lowering might overwrite argument registers so in case of
- // tail call optimization the copies to registers are lowered later.
- if (!isTailCall)
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
-
if (Subtarget->isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
- Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
- InFlag);
- InFlag = Chain.getValue(1);
+ RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
@@ -2340,12 +2357,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
- Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
- DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
- InFlag = Chain.getValue(1);
+ RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ DAG.getConstant(NumXMMRegs, MVT::i8)));
}
-
// For tail calls lower the arguments to the 'real' stack slot.
if (isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
@@ -2359,8 +2374,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- // Do not flag preceding copytoreg stuff together with the following stuff.
- InFlag = SDValue();
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -2400,19 +2413,20 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains2[0], MemOpChains2.size());
- // Copy arguments to their registers.
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
- InFlag =SDValue();
-
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
FPDiff, dl);
}
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
@@ -2514,14 +2528,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
- // Add an implicit use GOT pointer in EBX.
- if (!isTailCall && Subtarget->isPICStyleGOT())
- Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
-
- // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
- if (Is64Bit && isVarArg && !IsWin64)
- Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
-
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
@@ -2551,7 +2557,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
getTargetMachine().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
- IsStructRet)
+ SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
@@ -2743,7 +2749,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2764,7 +2770,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (Unused) {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ getTargetMachine(), RVLocs, *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
@@ -2778,12 +2784,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs1, *DAG.getContext());
+ getTargetMachine(), RVLocs1, *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
SmallVector<CCValAssign, 16> RVLocs2;
CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs2, *DAG.getContext());
+ getTargetMachine(), RVLocs2, *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
if (RVLocs1.size() != RVLocs2.size())
@@ -2810,7 +2816,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ getTargetMachine(), ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64
if (Subtarget->isTargetWin64()) {
@@ -2872,8 +2878,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
}
FastISel *
-X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
- return X86::createFastISel(funcInfo);
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const {
+ return X86::createFastISel(funcInfo, libInfo);
}
@@ -2911,6 +2918,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::UNPCKH:
case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:
+ case X86ISD::VPERMI:
return true;
}
}
@@ -3051,10 +3059,12 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, RHS.getValueType());
return X86::COND_NS;
- } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
- } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, RHS.getValueType());
return X86::COND_LE;
@@ -3170,12 +3180,12 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
return false;
}
-/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
+/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
- int Pos, int Size, int Low) {
- for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ unsigned Pos, unsigned Size, int Low) {
+ for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
@@ -3194,8 +3204,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
- if (VT != MVT::v8i16)
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+ if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
return false;
// Lower quadword copied in order or undef.
@@ -3204,16 +3214,27 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
// Upper quadword shuffled.
for (unsigned i = 4; i != 8; ++i)
- if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
+ if (!isUndefOrInRange(Mask[i], 4, 8))
return false;
+ if (VT == MVT::v16i16) {
+ // Lower quadword copied in order or undef.
+ if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
+ return false;
+
+ // Upper quadword shuffled.
+ for (unsigned i = 12; i != 16; ++i)
+ if (!isUndefOrInRange(Mask[i], 12, 16))
+ return false;
+ }
+
return true;
}
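
Restating the v8i16 half of that predicate as a standalone check (illustrative C++, not the LLVM helper): the low quadword must be copied in order or undef, and the high quadword may only select among elements 4..7.

#include <cassert>

bool isPshufhwMaskV8(const int M[8]) {
  for (int i = 0; i != 4; ++i)
    if (M[i] >= 0 && M[i] != i) return false;             // low quad in order
  for (int i = 4; i != 8; ++i)
    if (M[i] >= 0 && (M[i] < 4 || M[i] > 7)) return false; // high quad local
  return true;
}

int main() {
  const int Good[8] = {0, 1, 2, 3, 7, 6, 5, 4};
  const int Bad[8]  = {0, 1, 2, 3, 0, 6, 5, 4}; // high lane reads the low quad
  assert(isPshufhwMaskV8(Good) && !isPshufhwMaskV8(Bad));
  return 0;
}

With AVX2 and v16i16 the same rule is applied per 128-bit lane, as the extra block added above shows.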
/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
- if (VT != MVT::v8i16)
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+ if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
return false;
// Upper quadword copied in order.
@@ -3222,9 +3243,20 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
// Lower quadword shuffled.
for (unsigned i = 0; i != 4; ++i)
- if (Mask[i] >= 4)
+ if (!isUndefOrInRange(Mask[i], 0, 4))
+ return false;
+
+ if (VT == MVT::v16i16) {
+ // Upper quadword copied in order.
+ if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
return false;
+ // Lower quadword shuffled.
+ for (unsigned i = 8; i != 12; ++i)
+ if (!isUndefOrInRange(Mask[i], 8, 12))
+ return false;
+ }
+
return true;
}
@@ -3374,11 +3406,11 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
- unsigned NumElems = VT.getVectorNumElements();
-
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
+ unsigned NumElems = VT.getVectorNumElements();
+
if (NumElems != 4)
return false;
@@ -3393,11 +3425,11 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
- unsigned NumElems = VT.getVectorNumElements();
-
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
+ unsigned NumElems = VT.getVectorNumElements();
+
if (NumElems != 4)
return false;
@@ -3410,7 +3442,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
unsigned NumElems = VT.getVectorNumElements();
@@ -3418,11 +3450,11 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
if (NumElems != 2 && NumElems != 4)
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i + NumElems))
return false;
- for (unsigned i = NumElems/2; i != NumElems; ++i)
+ for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
@@ -3432,23 +3464,71 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+ if (!VT.is128BitVector())
+ return false;
+
unsigned NumElems = VT.getVectorNumElements();
- if ((NumElems != 2 && NumElems != 4)
- || VT.getSizeInBits() > 128)
+ if (NumElems != 2 && NumElems != 4)
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
- for (unsigned i = 0; i != NumElems/2; ++i)
- if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems))
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Mask[i + e], i + NumElems))
return false;
return true;
}
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ EVT VT = SVOp->getValueType(0);
+ DebugLoc dl = SVOp->getDebugLoc();
+
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return SDValue();
+
+ ArrayRef<int> Mask = SVOp->getMask();
+
+ // These are the special masks that may be optimized.
+ static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ bool MatchEvenMask = true;
+ bool MatchOddMask = true;
+ for (int i=0; i<8; ++i) {
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+ MatchEvenMask = false;
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+ MatchOddMask = false;
+ }
+ static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
+ static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
+
+ const int *CompactionMask;
+ if (MatchEvenMask)
+ CompactionMask = CompactionMaskEven;
+ else if (MatchOddMask)
+ CompactionMask = CompactionMaskOdd;
+ else
+ return SDValue();
+
+ SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+ SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
+ UndefNode, CompactionMask);
+ SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
+ UndefNode, CompactionMask);
+ static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
+ return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+}
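+
The equivalence this rewrite relies on can be checked with a scalar model (illustration only; mask entries 0..7 read the first operand, 8..15 the second, -1 is undef):

#include <array>
#include <cassert>

using V8 = std::array<int, 8>;

static V8 shuf(const V8 &A, const V8 &B, const std::array<int, 8> &M) {
  V8 R{};
  for (int i = 0; i != 8; ++i) {
    int Idx = M[i] < 0 ? 0 : M[i]; // undef lanes may hold anything
    R[i] = Idx < 8 ? A[Idx] : B[Idx - 8];
  }
  return R;
}

int main() {
  V8 A{0, 1, 2, 3, 4, 5, 6, 7}, B{10, 11, 12, 13, 14, 15, 16, 17};
  // Direct "even" shuffle.
  V8 Direct = shuf(A, B, {0, 8, 2, 10, 4, 12, 6, 14});
  // Rewritten form: compact each operand, then unpack the two results.
  V8 Op0 = shuf(A, A, {0, 2, -1, -1, 4, 6, -1, -1});
  V8 Op1 = shuf(B, B, {0, 2, -1, -1, 4, 6, -1, -1});
  V8 Rebuilt = shuf(Op0, Op1, {0, 8, 1, 9, 4, 12, 5, 13});
  assert(Rebuilt == Direct); // the unpack never reads the undef lanes
  return 0;
}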
+
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
@@ -3606,7 +3686,7 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
if (VT.getVectorElementType().getSizeInBits() < 32)
return false;
- if (VT.getSizeInBits() == 256)
+ if (!VT.is128BitVector())
return false;
unsigned NumElts = VT.getVectorNumElements();
@@ -3628,7 +3708,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
/// The first half comes from the second half of V1 and the second half from the
/// second half of V2.
static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
- if (!HasAVX || VT.getSizeInBits() != 256)
+ if (!HasAVX || !VT.is256BitVector())
return false;
// The shuffle result is divided into half A and half B. In total the two
@@ -3720,9 +3800,10 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
/// element of vector 2 and the other elements to come from vector 1 in order.
static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
bool V2IsSplat = false, bool V2IsUndef = false) {
- unsigned NumOps = VT.getVectorNumElements();
- if (VT.getSizeInBits() == 256)
+ if (!VT.is128BitVector())
return false;
+
+ unsigned NumOps = VT.getVectorNumElements();
if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
return false;
@@ -3788,9 +3869,11 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
/// specifies a shuffle of elements that is suitable for input to 256-bit
/// version of MOVDDUP.
static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (!HasAVX || !VT.is256BitVector())
+ return false;
- if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts != 4)
return false;
for (unsigned i = 0; i != NumElts/2; ++i)
@@ -3806,7 +3889,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
/// specifies a shuffle of elements that is suitable for input to 128-bit
/// version of MOVDDUP.
static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
unsigned e = VT.getVectorNumElements() / 2;
@@ -3880,9 +3963,8 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
for (unsigned i = 0; i != NumElts; ++i) {
int Elt = N->getMaskElt(i);
if (Elt < 0) continue;
- Elt %= NumLaneElts;
- unsigned ShAmt = i << Shift;
- if (ShAmt >= 8) ShAmt -= 8;
+ Elt &= NumLaneElts - 1;
+ unsigned ShAmt = (i << Shift) % 8;
Mask |= Elt << ShAmt;
}
@@ -3892,30 +3974,48 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+ "Unsupported vector type for PSHUFHW");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
unsigned Mask = 0;
- // 8 nodes, but we only care about the last 4.
- for (unsigned i = 7; i >= 4; --i) {
- int Val = N->getMaskElt(i);
- if (Val >= 0)
- Mask |= (Val - 4);
- if (i != 4)
- Mask <<= 2;
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ // 8 nodes per lane, but we only care about the last 4.
+ for (unsigned i = 0; i < 4; ++i) {
+ int Elt = N->getMaskElt(l+i+4);
+ if (Elt < 0) continue;
+ Elt &= 0x3; // only 2-bits.
+ Mask |= Elt << (i * 2);
+ }
}
+
return Mask;
}
/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+ "Unsupported vector type for PSHUFHW");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
unsigned Mask = 0;
- // 8 nodes, but we only care about the first 4.
- for (int i = 3; i >= 0; --i) {
- int Val = N->getMaskElt(i);
- if (Val >= 0)
- Mask |= Val;
- if (i != 0)
- Mask <<= 2;
+ for (unsigned l = 0; l != NumElts; l += 8) {
+ // 8 nodes per lane, but we only care about the first 4.
+ for (unsigned i = 0; i < 4; ++i) {
+ int Elt = N->getMaskElt(l+i);
+ if (Elt < 0) continue;
+ Elt &= 0x3; // only 2-bits
+ Mask |= Elt << (i * 2);
+ }
}
+
return Mask;
}
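
Both immediates pack four 2-bit lane selectors, lowest element in the lowest bit pair. A standalone sketch of the encoding (function name invented for illustration):

#include <cassert>

unsigned pshufImmediate(const int Mask[4]) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Mask[i] < 0) continue;        // undef lanes contribute nothing
    Imm |= (Mask[i] & 0x3) << (i * 2);
  }
  return Imm;
}

int main() {
  const int Mask[4] = {2, 1, 3, 0};   // e.g. a shuffled lower quadword
  assert(pshufImmediate(Mask) == 0x36);
  return 0;
}

For PSHUFLW the selectors come from mask elements 0..3 of each lane; for PSHUFHW from elements 4..7, masked down to 2 bits, as the loops above do.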
@@ -4016,13 +4116,14 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i != NumElems; ++i) {
- int idx = SVOp->getMaskElt(i);
- if (idx < 0)
- MaskVec.push_back(idx);
- else if (idx < (int)NumElems)
- MaskVec.push_back(idx + NumElems);
- else
- MaskVec.push_back(idx - NumElems);
+ int Idx = SVOp->getMaskElt(i);
+ if (Idx >= 0) {
+ if (Idx < (int)NumElems)
+ Idx += NumElems;
+ else
+ Idx -= NumElems;
+ }
+ MaskVec.push_back(Idx);
}
return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
SVOp->getOperand(0), &MaskVec[0]);
@@ -4033,7 +4134,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
if (VT.getVectorNumElements() != 4)
return false;
@@ -4090,7 +4191,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) {
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
ArrayRef<int> Mask, EVT VT) {
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return false;
if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
@@ -4107,7 +4208,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
for (unsigned i = 0, e = NumElems/2; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i))
return false;
- for (unsigned i = NumElems/2; i != NumElems; ++i)
+ for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
if (!isUndefOrEqual(Mask[i], i+NumElems))
return false;
return true;
@@ -4159,11 +4260,12 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
+ unsigned Size = VT.getSizeInBits();
// Always build SSE zero vectors as <4 x i32> bitcasted
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
- if (VT.getSizeInBits() == 128) { // SSE
+ if (Size == 128) { // SSE
if (Subtarget->hasSSE2()) { // SSE2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
@@ -4171,7 +4273,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
- } else if (VT.getSizeInBits() == 256) { // AVX
+ } else if (Size == 256) { // AVX
if (Subtarget->hasAVX2()) { // AVX2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
@@ -4183,7 +4285,9 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
}
- }
+ } else
+ llvm_unreachable("Unexpected vector type");
+
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
@@ -4194,25 +4298,22 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- assert((VT.is128BitVector() || VT.is256BitVector())
- && "Expected a 128-bit or 256-bit vector type");
+ unsigned Size = VT.getSizeInBits();
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
SDValue Vec;
- if (VT.getSizeInBits() == 256) {
+ if (Size == 256) {
if (HasAVX2) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
} else { // AVX
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
- Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
- Vec = Insert128BitVector(InsV, Vec,
- DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
}
- } else {
+ } else if (Size == 128) {
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
- }
+ } else
+ llvm_unreachable("Unexpected vector type");
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
@@ -4255,9 +4356,8 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
- unsigned Half = NumElems/2;
SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != Half; ++i) {
+ for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
Mask.push_back(i + Half);
Mask.push_back(i + NumElems + Half);
}
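
For reference, the loop above builds the interleaved unpack-high mask; a quick standalone computation (plain C++, mirroring the rewritten loop):

#include <cstdio>
#include <vector>

// Result lane pairs alternate between the high half of V1 (i + Half) and the
// high half of V2 (i + NumElems + Half) in the combined mask numbering.
static std::vector<int> unpackHighMask(unsigned NumElems) {
  std::vector<int> Mask;
  for (unsigned i = 0, Half = NumElems / 2; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return Mask;
}

int main() {
  for (int M : unpackHighMask(4)) // prints 2 6 3 7 for a 4-element type
    std::printf("%d ", M);
  std::printf("\n");
}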
@@ -4289,15 +4389,14 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
EVT VT = V.getValueType();
DebugLoc dl = V.getDebugLoc();
- assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
- && "Vector size not supported");
+ unsigned Size = VT.getSizeInBits();
- if (VT.getSizeInBits() == 128) {
+ if (Size == 128) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
&SplatMask[0]);
- } else {
+ } else if (Size == 256) {
    // To use VPERMILPS to splat scalars, the second half of indices must
// refer to the higher part, which is a duplication of the lower one,
// because VPERMILPS can only handle in-lane permutations.
@@ -4307,7 +4406,8 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
&SplatMask[0]);
- }
+ } else
+ llvm_unreachable("Vector size not supported");
return DAG.getNode(ISD::BITCAST, dl, VT, V);
}
@@ -4328,9 +4428,8 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// Extract the 128-bit part containing the splat element and update
// the splat element index when it refers to the higher register.
if (Size == 256) {
- unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
- V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
- if (Idx > 0)
+ V1 = Extract128BitVector(V1, EltNo, DAG, dl);
+ if (EltNo >= NumElems/2)
EltNo -= NumElems/2;
}
@@ -4346,10 +4445,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// into the low and high part. This is necessary because we want
// to use VPERM* to shuffle the vectors
if (Size == 256) {
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- V1 = Insert128BitVector(InsV, V1,
- DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
}
return getLegalSplat(DAG, V1, EltNo);
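
The index rebasing above is easy to get wrong; here is a minimal model of the 256-bit splat normalization (normalizeSplat is a hypothetical helper, plain C++):

#include <cstdio>

// Pick the 128-bit half that holds the splat element (0 = low, 1 = high) and
// rebase the element index into that half, as PromoteSplat now does.
static void normalizeSplat(unsigned NumElems, unsigned EltNo,
                           unsigned &Half, unsigned &LocalIdx) {
  Half = (EltNo >= NumElems / 2) ? 1 : 0;
  LocalIdx = Half ? EltNo - NumElems / 2 : EltNo;
}

int main() {
  unsigned Half, Idx;
  normalizeSplat(8, 6, Half, Idx);            // 8-lane vector, splat lane 6
  std::printf("half=%u idx=%u\n", Half, Idx); // half=1 idx=2
}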
@@ -4377,7 +4473,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
/// target specific opcode. Returns true if the Mask could be calculated.
/// Sets IsUnary to true if only uses one source.
-static bool getTargetShuffleMask(SDNode *N, EVT VT,
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
SmallVectorImpl<int> &Mask, bool &IsUnary) {
unsigned NumElems = VT.getVectorNumElements();
SDValue ImmN;
@@ -4408,12 +4504,17 @@ static bool getTargetShuffleMask(SDNode *N, EVT VT,
break;
case X86ISD::PSHUFHW:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = true;
+ break;
+ case X86ISD::VPERMI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
@@ -4473,20 +4574,21 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- unsigned NumElems = VT.getVectorNumElements();
+ MVT ShufVT = V.getValueType().getSimpleVT();
+ unsigned NumElems = ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SDValue ImmN;
bool IsUnary;
- if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary))
+ if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt < 0)
- return DAG.getUNDEF(VT.getVectorElementType());
+ return DAG.getUNDEF(ShufVT.getVectorElementType());
SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
- : N->getOperand(1);
+ : N->getOperand(1);
return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
Depth+1);
}
@@ -4631,7 +4733,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
// Although the logic below support any bitwidth size, there are no
// shift instructions which handle more than 128-bit vectors.
- if (SVOp->getValueType(0).getSizeInBits() > 128)
+ if (!SVOp->getValueType(0).is128BitVector())
return false;
if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
@@ -4726,7 +4828,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, DebugLoc dl) {
- assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
+ assert(VT.is128BitVector() && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
@@ -4794,7 +4896,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
@@ -4802,7 +4904,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
false, false, false, 0);
SmallVector<int, 8> Mask;
- for (int i = 0; i < NumElems; ++i)
+ for (unsigned i = 0; i != NumElems; ++i)
Mask.push_back(EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
@@ -4866,8 +4968,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
LDBase->getPointerInfo(),
LDBase->isVolatile(), LDBase->isNonTemporal(),
LDBase->isInvariant(), LDBase->getAlignment());
- } else if (NumElems == 4 && LastLoadedElt == 1 &&
- DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
+ }
+ if (NumElems == 4 && LastLoadedElt == 1 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode =
@@ -4896,6 +4999,9 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for broadcast.");
+
SDValue Ld;
bool ConstSplatVal;
@@ -4930,8 +5036,17 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
return SDValue();
SDValue Sc = Op.getOperand(0);
- if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
- return SDValue();
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+ Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+ if (!Subtarget->hasAVX2())
+ return SDValue();
+
+ // Use the register form of the broadcast instruction available on AVX2.
+ if (VT.is256BitVector())
+ Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+ }
Ld = Sc.getOperand(0);
ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
@@ -4946,8 +5061,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
}
}
- bool Is256 = VT.getSizeInBits() == 256;
- bool Is128 = VT.getSizeInBits() == 128;
+ bool Is256 = VT.is256BitVector();
// Handle the broadcasting a single constant scalar from the constant pool
// into a vector. On Sandybridge it is still better to load a constant vector
@@ -4957,9 +5071,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
assert(!CVT.isVector() && "Must not broadcast a vector type");
unsigned ScalarSize = CVT.getSizeInBits();
- if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) ||
- (Is128 && (ScalarSize == 32))) {
-
+ if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
const Constant *C = 0;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
@@ -4971,40 +5083,32 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
SDValue CP = DAG.getConstantPool(C, getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(),
- false, false, false, Alignment);
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
}
- // The scalar source must be a normal load.
- if (!ISD::isNormalLoad(Ld.getNode()))
- return SDValue();
-
- // Reject loads that have uses of the chain result
- if (Ld->hasAnyUseOfValue(1))
- return SDValue();
-
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
unsigned ScalarSize = Ld.getValueType().getSizeInBits();
- // VBroadcast to YMM
- if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
+ // Handle AVX2 in-register broadcasts.
+ if (!IsLoad && Subtarget->hasAVX2() &&
+ (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
- // VBroadcast to XMM
- if (Is128 && (ScalarSize == 32))
+ // The scalar source must be a normal load.
+ if (!IsLoad)
+ return SDValue();
+
+ if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
- // double since there is vbroadcastsd xmm
+ // double since there is no vbroadcastsd xmm
if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
- // VBroadcast to YMM
- if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
-
- // VBroadcast to XMM
- if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
+ if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
}
@@ -5012,6 +5116,82 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
return SDValue();
}
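
Summing up the broadcast legality logic above as a decision table (a sketch assuming an AVX-capable subtarget; it ignores the constant-pool and in-register special cases handled earlier in the function):

#include <cstdio>

// Can a scalar load of ScalarSize bits become VBROADCAST? 32-bit scalars work
// for both widths, 64-bit only to YMM (there is no vbroadcastsd xmm), and the
// 8/16/64-bit integer forms need the AVX2 vpbroadcastb/w/q instructions.
static bool canBroadcastLoad(unsigned ScalarSize, bool Is256, bool HasAVX2,
                             bool IsInteger) {
  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
    return true;
  if (HasAVX2 && IsInteger &&
      (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
    return true;
  return false;
}

int main() {
  std::printf("%d\n", canBroadcastLoad(64, false, false, false)); // 0
  std::printf("%d\n", canBroadcastLoad(64, true, false, false));  // 1
}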
+// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64
+// and convert it into X86ISD::VFPEXT, since the current ISD::FP_EXTEND has
+// the constraint of matching input/output vector element counts.
+SDValue
+X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ SDNode *N = Op.getNode();
+ EVT VT = Op.getValueType();
+ unsigned NumElts = Op.getNumOperands();
+
+ // Check supported types and sub-targets.
+ //
+ // Only v2f32 -> v2f64 needs special handling.
+ if (VT != MVT::v2f64 || !Subtarget->hasSSE2())
+ return SDValue();
+
+ SDValue VecIn;
+ EVT VecInVT;
+ SmallVector<int, 8> Mask;
+ EVT SrcVT = MVT::Other;
+
+  // Check whether the pattern can be translated into X86vfpext.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue In = N->getOperand(i);
+ unsigned Opcode = In.getOpcode();
+
+ // Skip if the element is undefined.
+ if (Opcode == ISD::UNDEF) {
+ Mask.push_back(-1);
+ continue;
+ }
+
+ // Quit if one of the elements is not defined from 'fpext'.
+ if (Opcode != ISD::FP_EXTEND)
+ return SDValue();
+
+ // Check how the source of 'fpext' is defined.
+ SDValue L2In = In.getOperand(0);
+ EVT L2InVT = L2In.getValueType();
+
+ // Check the original type
+ if (SrcVT == MVT::Other)
+ SrcVT = L2InVT;
+    else if (SrcVT != L2InVT) // Quit if the types are not homogeneous.
+ return SDValue();
+
+ // Check whether the value being 'fpext'ed is extracted from the same
+ // source.
+ Opcode = L2In.getOpcode();
+
+ // Quit if it's not extracted with a constant index.
+ if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(L2In.getOperand(1)))
+ return SDValue();
+
+ SDValue ExtractedFromVec = L2In.getOperand(0);
+
+ if (VecIn.getNode() == 0) {
+ VecIn = ExtractedFromVec;
+ VecInVT = ExtractedFromVec.getValueType();
+ } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec.
+ return SDValue();
+
+ Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue());
+ }
+
+ // Fill the remaining mask as undef.
+ for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i)
+ Mask.push_back(-1);
+
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+ DAG.getVectorShuffle(VecInVT, DL,
+ VecIn, DAG.getUNDEF(VecInVT),
+ &Mask[0]));
+}
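
A standalone model of the shuffle mask this routine collects (plain C++; vfpextMask is a made-up name): each BUILD_VECTOR operand is either undef, recorded as -1, or fpext(extractelt(V, K)), and the collected Ks become a shuffle mask over V padded with undef to V's width.

#include <cstdio>
#include <vector>

static std::vector<int> vfpextMask(const std::vector<int> &ExtractIdx,
                                   unsigned SrcNumElts) {
  std::vector<int> Mask(ExtractIdx);
  Mask.resize(SrcNumElts, -1); // fill the remaining lanes as undef
  return Mask;
}

int main() {
  // A v2f64 built from lanes 0 and 1 of a v4f32 source -> mask <0,1,-1,-1>,
  // which feeds the X86ISD::VFPEXT node built above.
  for (int M : vfpextMask({0, 1}, 4))
    std::printf("%d ", M);
  std::printf("\n");
}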
+
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
@@ -5044,6 +5224,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (Broadcast.getNode())
return Broadcast;
+ SDValue FpExt = LowerVectorFpExtend(Op, DAG);
+ if (FpExt.getNode())
+ return FpExt;
+
unsigned EVTBits = ExtVT.getSizeInBits();
unsigned NumZero = 0;
@@ -5102,8 +5286,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Mask.push_back(Idx);
for (unsigned i = 1; i != VecElts; ++i)
Mask.push_back(i);
- Item = DAG.getVectorShuffle(VecVT, dl, Item,
- DAG.getUNDEF(Item.getValueType()),
+ Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
&Mask[0]);
}
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
@@ -5120,12 +5303,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
Item, DAG.getIntPtrConstant(0));
}
- assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+ assert(VT.is128BitVector() && "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
@@ -5134,12 +5317,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
- Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
- DAG, dl);
+ Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
} else {
- assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+ assert(VT.is128BitVector() && "Expected an SSE value type!");
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
@@ -5171,7 +5353,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i < NumElems; i++)
+ for (unsigned i = 0; i != NumElems; ++i)
MaskVec.push_back(i == Idx ? 0 : 1);
return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
}
@@ -5199,7 +5381,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
SmallVector<SDValue, 32> V;
for (unsigned i = 0; i != NumElems; ++i)
V.push_back(Op.getOperand(i));
@@ -5212,10 +5394,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
NumElems/2);
// Recreate the wider vector with the lower and upper part.
- SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
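
Per the removed lines, Concat128BitVectors stands in for the old pair of Insert128BitVector calls: low half at element 0 of an undef wide vector, high half at NumElems/2. A standalone model over plain arrays (a sketch, not the helper's actual body):

#include <cstdio>

static void concatHalves(const int *Lo, const int *Hi, int *Out,
                         unsigned NumElems) {
  for (unsigned i = 0; i != NumElems / 2; ++i) {
    Out[i] = Lo[i];                // the old Insert128BitVector(..., 0, ...)
    Out[NumElems / 2 + i] = Hi[i]; // ... and (..., NumElems/2, ...)
  }
}

int main() {
  int Lo[4] = {0, 1, 2, 3}, Hi[4] = {4, 5, 6, 7}, Out[8];
  concatHalves(Lo, Hi, Out, 8);
  for (int V : Out)
    std::printf("%d ", V); // 0 1 2 3 4 5 6 7
  std::printf("\n");
}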
@@ -5283,7 +5462,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
}
- if (Values.size() > 1 && VT.getSizeInBits() == 128) {
+ if (Values.size() > 1 && VT.is128BitVector()) {
// Check for a build vector of consecutive loads.
for (unsigned i = 0; i < NumElems; ++i)
V[i] = Op.getOperand(i);
@@ -5344,62 +5523,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place
-// them in a MMX register. This is better than doing a stack convert.
-static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
- DebugLoc dl = Op.getDebugLoc();
- EVT ResVT = Op.getValueType();
-
- assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
- ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
- int Mask[2];
- SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
- SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
- InVec = Op.getOperand(1);
- if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
- unsigned NumElts = ResVT.getVectorNumElements();
- VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
- VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
- InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
- } else {
- InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
- SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
- Mask[0] = 0; Mask[1] = 2;
- VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
- }
- return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
-}
-
// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
EVT ResVT = Op.getValueType();
- assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
+ assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
unsigned NumElems = ResVT.getVectorNumElements();
- SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
SDValue
X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
- EVT ResVT = Op.getValueType();
-
assert(Op.getNumOperands() == 2);
- assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
- "Unsupported CONCAT_VECTORS for value type");
-
- // We support concatenate two MMX registers and place them in a MMX register.
- // This is better than doing a stack convert.
- if (ResVT.is128BitVector())
- return LowerMMXCONCAT_VECTORS(Op, DAG);
// 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
// from two other 128-bit ones.
@@ -5407,75 +5548,64 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
}
// Try to lower a shuffle node into a simple blend instruction.
-static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op,
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
- EVT VT = Op.getValueType();
- EVT InVT = V1.getValueType();
- int MaskSize = VT.getVectorNumElements();
- int InSize = InVT.getVectorNumElements();
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ unsigned NumElems = VT.getVectorNumElements();
if (!Subtarget->hasSSE41())
return SDValue();
- if (MaskSize != InSize)
- return SDValue();
-
- int ISDNo = 0;
+ unsigned ISDNo = 0;
MVT OpTy;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default: return SDValue();
case MVT::v8i16:
- ISDNo = X86ISD::BLENDPW;
- OpTy = MVT::v8i16;
- break;
+ ISDNo = X86ISD::BLENDPW;
+ OpTy = MVT::v8i16;
+ break;
case MVT::v4i32:
case MVT::v4f32:
- ISDNo = X86ISD::BLENDPS;
- OpTy = MVT::v4f32;
- break;
+ ISDNo = X86ISD::BLENDPS;
+ OpTy = MVT::v4f32;
+ break;
case MVT::v2i64:
case MVT::v2f64:
- ISDNo = X86ISD::BLENDPD;
- OpTy = MVT::v2f64;
- break;
+ ISDNo = X86ISD::BLENDPD;
+ OpTy = MVT::v2f64;
+ break;
case MVT::v8i32:
case MVT::v8f32:
- if (!Subtarget->hasAVX())
- return SDValue();
- ISDNo = X86ISD::BLENDPS;
- OpTy = MVT::v8f32;
- break;
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ ISDNo = X86ISD::BLENDPS;
+ OpTy = MVT::v8f32;
+ break;
case MVT::v4i64:
case MVT::v4f64:
- if (!Subtarget->hasAVX())
- return SDValue();
- ISDNo = X86ISD::BLENDPD;
- OpTy = MVT::v4f64;
- break;
- case MVT::v16i16:
- if (!Subtarget->hasAVX2())
- return SDValue();
- ISDNo = X86ISD::BLENDPW;
- OpTy = MVT::v16i16;
- break;
+ if (!Subtarget->hasAVX())
+ return SDValue();
+ ISDNo = X86ISD::BLENDPD;
+ OpTy = MVT::v4f64;
+ break;
}
assert(ISDNo && "Invalid Op Number");
unsigned MaskVals = 0;
- for (int i = 0; i < MaskSize; ++i) {
+ for (unsigned i = 0; i != NumElems; ++i) {
int EltIdx = SVOp->getMaskElt(i);
- if (EltIdx == i || EltIdx == -1)
+ if (EltIdx == (int)i || EltIdx < 0)
MaskVals |= (1<<i);
- else if (EltIdx == (i + MaskSize))
+ else if (EltIdx == (int)(i + NumElems))
continue; // Bit is set to zero;
- else return SDValue();
+ else
+ return SDValue();
}
V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
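
The immediate computed by that loop, modeled standalone (plain C++, mirroring the code above: a set bit means the lane is taken from V1 or is undef, a clear bit means it comes from the matching lane of V2):

#include <cstdio>
#include <vector>

static int blendImmediate(const std::vector<int> &Mask) {
  unsigned NumElems = (unsigned)Mask.size();
  unsigned Imm = 0;
  for (unsigned i = 0; i != NumElems; ++i) {
    int EltIdx = Mask[i];
    if (EltIdx == (int)i || EltIdx < 0)
      Imm |= 1u << i;
    else if (EltIdx != (int)(i + NumElems))
      return -1; // any other source lane: not expressible as a blend
  }
  return (int)Imm;
}

int main() {
  std::printf("%d\n", blendImmediate({0, 5, 2, 7})); // 5 (binary 0101)
  std::printf("%d\n", blendImmediate({1, 5, 2, 7})); // -1: lane 0 is permuted
}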
@@ -5629,13 +5759,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
bool TwoInputs = V1Used && V2Used;
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
- if (TwoInputs && (EltIdx >= 16)) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+ int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
+ int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
}
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
@@ -5649,13 +5776,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
pshufbMask.clear();
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
- if (EltIdx < 16) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
+ int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+ int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
}
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
@@ -5731,10 +5855,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
- SDValue ExtOp = (EltIdx < 8)
- ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
- DAG.getIntPtrConstant(EltIdx))
- : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+ SDValue ExtOp = (EltIdx < 8) ?
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+ DAG.getIntPtrConstant(EltIdx)) :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
DAG.getIntPtrConstant(EltIdx - 8));
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
DAG.getIntPtrConstant(i));
@@ -5755,21 +5879,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
DebugLoc dl = SVOp->getDebugLoc();
ArrayRef<int> MaskVals = SVOp->getMask();
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
- // FIXME: kill V2Only once shuffles are canonizalized by getNode.
- bool V1Only = true;
- bool V2Only = true;
- for (unsigned i = 0; i < 16; ++i) {
- int EltIdx = MaskVals[i];
- if (EltIdx < 0)
- continue;
- if (EltIdx < 16)
- V2Only = false;
- else
- V1Only = false;
- }
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
if (TLI.getSubtarget()->hasSSSE3()) {
@@ -5781,23 +5895,16 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
// Otherwise, we have elements from both input vectors, and must zero out
// elements that come from V2 in the first mask, and V1 in the second mask
// so that we can OR them together.
- bool TwoInputs = !(V1Only || V2Only);
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
- if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
+ if (EltIdx < 0 || EltIdx >= 16)
+ EltIdx = 0x80;
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
- // If all the elements are from V2, assign it to V1 and return after
- // building the first pshufb.
- if (V2Only)
- V1 = V2;
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
- if (!TwoInputs)
+ if (V2IsUndef)
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
@@ -5805,11 +5912,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
pshufbMask.clear();
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
- if (EltIdx < 16) {
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- continue;
- }
- pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
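
The zeroing trick used in both pshufb masks above, modeled standalone (plain C++): byte index 0x80 makes pshufb write a zero, so lanes sourced from the other input are zeroed in each partial result and the two results can simply be OR'ed.

#include <cstdio>
#include <vector>

static void splitPshufbMasks(const std::vector<int> &Bytes,
                             std::vector<int> &M1, std::vector<int> &M2) {
  for (int B : Bytes) {
    M1.push_back((B < 0 || B >= 16) ? 0x80 : B); // V1's mask
    M2.push_back((B < 16) ? 0x80 : B - 16);      // V2's mask, rebased
  }
}

int main() {
  std::vector<int> M1, M2;
  splitPshufbMasks({0, 17, 2, 19}, M1, M2);
  for (size_t i = 0; i != M1.size(); ++i)
    std::printf("%02x/%02x ", (unsigned)M1[i], (unsigned)M2[i]);
  std::printf("\n"); // 00/80 80/01 02/80 80/03
}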
@@ -5822,7 +5926,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
// the 16 different words that comprise the two doublequadword input vectors.
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
- SDValue NewV = V2Only ? V2 : V1;
+ SDValue NewV = V1;
for (int i = 0; i != 8; ++i) {
int Elt0 = MaskVals[i*2];
int Elt1 = MaskVals[i*2+1];
@@ -5832,9 +5936,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
continue;
// This word of the result is already in the correct place, skip it.
- if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
- continue;
- if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+ if ((Elt0 == i*2) && (Elt1 == i*2+1))
continue;
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
@@ -5896,41 +5998,37 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG, DebugLoc dl) {
- EVT VT = SVOp->getValueType(0);
- SDValue V1 = SVOp->getOperand(0);
- SDValue V2 = SVOp->getOperand(1);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
unsigned NumElems = VT.getVectorNumElements();
- unsigned NewWidth = (NumElems == 4) ? 2 : 4;
- EVT NewVT;
- switch (VT.getSimpleVT().SimpleTy) {
+ MVT NewVT;
+ unsigned Scale;
+ switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected!");
- case MVT::v4f32: NewVT = MVT::v2f64; break;
- case MVT::v4i32: NewVT = MVT::v2i64; break;
- case MVT::v8i16: NewVT = MVT::v4i32; break;
- case MVT::v16i8: NewVT = MVT::v4i32; break;
+ case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break;
+ case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break;
+ case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break;
+ case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break;
+ case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+ case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break;
}
- int Scale = NumElems / NewWidth;
SmallVector<int, 8> MaskVec;
- for (unsigned i = 0; i < NumElems; i += Scale) {
+ for (unsigned i = 0; i != NumElems; i += Scale) {
int StartIdx = -1;
- for (int j = 0; j < Scale; ++j) {
+ for (unsigned j = 0; j != Scale; ++j) {
int EltIdx = SVOp->getMaskElt(i+j);
if (EltIdx < 0)
continue;
- if (StartIdx == -1)
- StartIdx = EltIdx - (EltIdx % Scale);
- if (EltIdx != StartIdx + j)
+ if (StartIdx < 0)
+ StartIdx = (EltIdx / Scale);
+ if (EltIdx != (int)(StartIdx*Scale + j))
return SDValue();
}
- if (StartIdx == -1)
- MaskVec.push_back(-1);
- else
- MaskVec.push_back(StartIdx / Scale);
+ MaskVec.push_back(StartIdx);
}
- V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+ SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+ SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}
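
The narrowing test above, extracted into a standalone helper (plain C++; returns an empty vector when the shuffle cannot be expressed at the wider element size):

#include <cstdio>
#include <vector>

// Collapse a mask over NumElems lanes by Scale: each group of Scale lanes
// must read Scale consecutive source lanes starting at a multiple of Scale.
static std::vector<int> narrowMask(const std::vector<int> &Mask,
                                   unsigned Scale) {
  std::vector<int> Narrow;
  for (unsigned i = 0, e = (unsigned)Mask.size(); i != e; i += Scale) {
    int StartIdx = -1;
    for (unsigned j = 0; j != Scale; ++j) {
      int EltIdx = Mask[i + j];
      if (EltIdx < 0)
        continue;
      if (StartIdx < 0)
        StartIdx = EltIdx / Scale;
      if (EltIdx != (int)(StartIdx * Scale + j))
        return std::vector<int>(); // mixes lanes: give up
    }
    Narrow.push_back(StartIdx);
  }
  return Narrow;
}

int main() {
  // A v8i16 mask viewed as v4i32: <4,5, 0,1, u,u, 2,3> -> <2, 0, -1, 1>.
  for (int M : narrowMask({4, 5, 0, 1, -1, -1, 2, 3}, 2))
    std::printf("%d ", M);
  std::printf("\n");
}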
@@ -5973,6 +6071,11 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
/// which could not be matched by any known target specific shuffle
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+ SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
EVT VT = SVOp->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
@@ -5981,14 +6084,15 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
DebugLoc dl = SVOp->getDebugLoc();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
- SDValue Shufs[2];
+ SDValue Output[2];
SmallVector<int, 16> Mask;
for (unsigned l = 0; l < 2; ++l) {
// Build a shuffle mask for the output, discovering on the fly which
// input vectors to use as shuffle operands (recorded in InputUsed).
// If building a suitable shuffle vector proves too hard, then bail
- // out with useBuildVector set.
+ // out with UseBuildVector set.
+ bool UseBuildVector = false;
int InputUsed[2] = { -1, -1 }; // Not yet discovered.
unsigned LaneStart = l * NumLaneElems;
for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -6020,38 +6124,61 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
if (OpNo >= array_lengthof(InputUsed)) {
- // More than two input vectors used! Give up.
- return SDValue();
+ // More than two input vectors used! Give up on trying to create a
+ // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
+ UseBuildVector = true;
+ break;
}
// Add the mask index for the new shuffle vector.
Mask.push_back(Idx + OpNo * NumLaneElems);
}
- if (InputUsed[0] < 0) {
+ if (UseBuildVector) {
+ SmallVector<SDValue, 16> SVOps;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ // The mask element. This indexes into the input.
+ int Idx = SVOp->getMaskElt(i+LaneStart);
+ if (Idx < 0) {
+ SVOps.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+
+ // The input vector this mask element indexes into.
+ int Input = Idx / NumElems;
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NumElems;
+
+ // Extract the vector element by hand.
+ SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ SVOp->getOperand(Input),
+ DAG.getIntPtrConstant(Idx)));
+ }
+
+ // Construct the output using a BUILD_VECTOR.
+ Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
+ SVOps.size());
+ } else if (InputUsed[0] < 0) {
// No input vectors were used! The result is undefined.
- Shufs[l] = DAG.getUNDEF(NVT);
+ Output[l] = DAG.getUNDEF(NVT);
} else {
SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
- DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32),
- DAG, dl);
+ (InputUsed[0] % 2) * NumLaneElems,
+ DAG, dl);
// If only one input was used, use an undefined vector for the other.
SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
- DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32),
- DAG, dl);
+ (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
// At least one input vector was used. Create a new shuffle vector.
- Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
+ Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
}
Mask.clear();
}
// Concatenate the result back
- SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0],
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32),
- DAG, dl);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}
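
A rough standalone model of the per-lane analysis driving this function (plain C++; buildLaneMask is a made-up name, and the real code additionally maps each discovered half back to an operand/offset pair for Extract128BitVector):

#include <cstdio>
#include <vector>

// For one 128-bit output lane, record which of the four 128-bit input halves
// are referenced and rebase each mask element onto the chosen half; a third
// half forces the UseBuildVector fallback above.
static bool buildLaneMask(const std::vector<int> &Mask, unsigned NumElems,
                          unsigned Lane, int InputUsed[2],
                          std::vector<int> &LaneMask) {
  unsigned NumLaneElems = NumElems / 2;
  InputUsed[0] = InputUsed[1] = -1;
  for (unsigned i = 0; i != NumLaneElems; ++i) {
    int Idx = Mask[Lane * NumLaneElems + i];
    if (Idx < 0) {
      LaneMask.push_back(-1);
      continue;
    }
    int Input = Idx / (int)NumLaneElems; // which 128-bit half (0..3)
    Idx -= Input * (int)NumLaneElems;    // offset within that half
    unsigned OpNo;
    for (OpNo = 0; OpNo != 2; ++OpNo) {
      if (InputUsed[OpNo] == Input)
        break;                           // half already in use
      if (InputUsed[OpNo] < 0) {
        InputUsed[OpNo] = Input;         // claim a free slot
        break;
      }
    }
    if (OpNo == 2)
      return false;                      // three halves: UseBuildVector
    LaneMask.push_back(Idx + (int)(OpNo * NumLaneElems));
  }
  return true;
}

int main() {
  int Used[2];
  std::vector<int> LM;
  // 8-lane shuffle whose low output lane reads lanes 4..7 (V1's high half).
  bool OK = buildLaneMask({4, 5, 6, 7, 0, 1, 2, 3}, 8, 0, Used, LM);
  std::printf("%d used=%d\n", OK, Used[0]); // 1 used=1
}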
/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
@@ -6063,7 +6190,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
DebugLoc dl = SVOp->getDebugLoc();
EVT VT = SVOp->getValueType(0);
- assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+ assert(VT.is128BitVector() && "Unsupported vector size");
std::pair<int, int> Locs[4];
int Mask1[] = { -1, -1, -1, -1 };
@@ -6107,7 +6234,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
- } else if (NumLo == 3 || NumHi == 3) {
+ }
+
+ if (NumLo == 3 || NumHi == 3) {
// Otherwise, we must have three elements from one vector, call it X, and
// one element from the other, call it Y. First, use a shufps to build an
// intermediate vector with the one element from Y and the element from X
@@ -6143,17 +6272,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
Mask1[2] = HiIndex & 1 ? 6 : 4;
Mask1[3] = HiIndex & 1 ? 4 : 6;
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
- } else {
- Mask1[0] = HiIndex & 1 ? 2 : 0;
- Mask1[1] = HiIndex & 1 ? 0 : 2;
- Mask1[2] = PermMask[2];
- Mask1[3] = PermMask[3];
- if (Mask1[2] >= 0)
- Mask1[2] += 4;
- if (Mask1[3] >= 0)
- Mask1[3] += 4;
- return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
}
+
+ Mask1[0] = HiIndex & 1 ? 2 : 0;
+ Mask1[1] = HiIndex & 1 ? 0 : 2;
+ Mask1[2] = PermMask[2];
+ Mask1[3] = PermMask[3];
+ if (Mask1[2] >= 0)
+ Mask1[2] += 4;
+ if (Mask1[3] >= 0)
+ Mask1[3] += 4;
+ return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
}
// Break it into (shuffle shuffle_hi, shuffle_lo).
@@ -6302,7 +6431,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
- // If we don't care about the second element, procede to use movss.
+ // If we don't care about the second element, proceed to use movss.
if (SVOp->getMaskElt(1) != -1)
return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
}
@@ -6360,7 +6489,8 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
// If the shuffle can be profitably rewritten as a narrower shuffle, then
// do it!
- if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
+ VT == MVT::v16i16 || VT == MVT::v32i8) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
@@ -6564,11 +6694,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  // new vector_shuffle with the corrected mask.
SmallVector<int, 8> NewMask(M.begin(), M.end());
NormalizeMask(NewMask, NumElems);
- if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
+ if (isUNPCKLMask(NewMask, VT, HasAVX2, true))
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
- } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
+ if (isUNPCKHMask(NewMask, VT, HasAVX2, true))
return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
- }
}
if (Commuted) {
@@ -6605,12 +6734,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
}
- if (isPSHUFHWMask(M, VT))
+ if (isPSHUFHWMask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
getShufflePSHUFHWImmediate(SVOp),
DAG);
- if (isPSHUFLWMask(M, VT))
+ if (isPSHUFLWMask(M, VT, HasAVX2))
return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
getShufflePSHUFLWImmediate(SVOp),
DAG);
@@ -6647,7 +6776,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
- SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG);
+ SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
if (BlendOp.getNode())
return BlendOp;
@@ -6689,7 +6818,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// Handle all 128-bit wide vectors with 4 elements, and match them with
// several different shuffle types.
- if (NumElems == 4 && VT.getSizeInBits() == 128)
+ if (NumElems == 4 && VT.is128BitVector())
return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
// Handle general 256-bit shuffles
@@ -6705,7 +6834,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
- if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
+ if (!Op.getOperand(0).getValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
@@ -6714,7 +6843,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT.getSizeInBits() == 16) {
+ }
+
+ if (VT.getSizeInBits() == 16) {
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
// If Idx is 0, it's cheaper to do a move instead of a pextrw.
if (Idx == 0)
@@ -6729,7 +6860,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT == MVT::f32) {
+ }
+
+ if (VT == MVT::f32) {
// EXTRACTPS outputs to a GPR32 register which will require a movd to copy
// the result back to FR32 register. It's only worth matching if the
// result has a single use which is a store or a bitcast to i32. And in
@@ -6749,7 +6882,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
Op.getOperand(0)),
Op.getOperand(1));
return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
- } else if (VT == MVT::i32 || VT == MVT::i64) {
+ }
+
+ if (VT == MVT::i32 || VT == MVT::i64) {
// ExtractPS/pextrq works with constant index.
if (isa<ConstantSDNode>(Op.getOperand(1)))
return Op;
@@ -6769,22 +6904,22 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
- if (VecVT.getSizeInBits() == 256) {
+ if (VecVT.is256BitVector()) {
DebugLoc dl = Op.getNode()->getDebugLoc();
unsigned NumElems = VecVT.getVectorNumElements();
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
- bool Upper = IdxVal >= NumElems/2;
- Vec = Extract128BitVector(Vec,
- DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
+ Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+ if (IdxVal >= NumElems/2)
+ IdxVal -= NumElems/2;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
+ DAG.getConstant(IdxVal, MVT::i32));
}
- assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
+ assert(VecVT.is128BitVector() && "Unexpected vector length");
if (Subtarget->hasSSE41()) {
SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
@@ -6811,7 +6946,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
- } else if (VT.getSizeInBits() == 32) {
+ }
+
+ if (VT.getSizeInBits() == 32) {
unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
if (Idx == 0)
return Op;
@@ -6823,7 +6960,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getIntPtrConstant(0));
- } else if (VT.getSizeInBits() == 64) {
+ }
+
+ if (VT.getSizeInBits() == 64) {
// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
// to match extract_elt for f64.
@@ -6856,7 +6995,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- if (VT.getSizeInBits() == 256)
+ if (!VT.is128BitVector())
return SDValue();
if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
@@ -6876,7 +7015,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
if (N2.getValueType() != MVT::i32)
N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
return DAG.getNode(Opc, dl, VT, N0, N1, N2);
- } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+ }
+
+ if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into these
// bits. For example (insert (extract, 3), 2) could be matched by putting
@@ -6889,8 +7030,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
    // Create this as a scalar to vector.
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
- } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) &&
- isa<ConstantSDNode>(N2)) {
+ }
+
+ if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
// PINSR* works with constant index.
return Op;
}
@@ -6909,23 +7051,22 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
// If this is a 256-bit vector result, first extract the 128-bit vector,
// insert the element into the extracted half and then place it back.
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
if (!isa<ConstantSDNode>(N2))
return SDValue();
// Get the desired 128-bit vector half.
unsigned NumElems = VT.getVectorNumElements();
unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
- bool Upper = IdxVal >= NumElems/2;
- SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
- SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
+ SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
// Insert the element into the desired half.
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
- N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
+ bool Upper = IdxVal >= NumElems/2;
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+ DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
// Insert the changed part back to the 256-bit vector
- return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
+ return Insert128BitVector(N0, V, IdxVal, DAG, dl);
}
if (Subtarget->hasSSE41())
@@ -6954,7 +7095,7 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
- if (OpVT.getSizeInBits() > 128) {
+ if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
EVT VT128 = EVT::getVectorVT(*Context,
OpVT.getVectorElementType(),
@@ -6963,19 +7104,16 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
// Insert the 128-bit vector.
- return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
- DAG.getConstant(0, MVT::i32),
- DAG, dl);
+ return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
- if (Op.getValueType() == MVT::v1i64 &&
+ if (OpVT == MVT::v1i64 &&
Op.getOperand(0).getValueType() == MVT::i64)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
- "Expected an SSE type!");
- return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+ return DAG.getNode(ISD::BITCAST, dl, OpVT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
}
@@ -6989,9 +7127,11 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue Vec = Op.getNode()->getOperand(0);
SDValue Idx = Op.getNode()->getOperand(1);
- if (Op.getNode()->getValueType(0).getSizeInBits() == 128
- && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
- return Extract128BitVector(Vec, Idx, DAG, dl);
+ if (Op.getNode()->getValueType(0).is128BitVector() &&
+ Vec.getNode()->getValueType(0).is256BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Extract128BitVector(Vec, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -7008,9 +7148,11 @@ X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue SubVec = Op.getNode()->getOperand(1);
SDValue Idx = Op.getNode()->getOperand(2);
- if (Op.getNode()->getValueType(0).getSizeInBits() == 256
- && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
- return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
+ if (Op.getNode()->getValueType(0).is256BitVector() &&
+ SubVec.getNode()->getValueType(0).is128BitVector() &&
+ isa<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
}
}
return SDValue();
@@ -7219,7 +7361,7 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
- unsigned char OperandFlags) {
+ unsigned char OperandFlags, bool LocalDynamic = false) {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
DebugLoc dl = GA->getDebugLoc();
@@ -7227,12 +7369,16 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
GA->getValueType(0),
GA->getOffset(),
OperandFlags);
+
+ X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+ : X86ISD::TLSADDR;
+
if (InFlag) {
SDValue Ops[] = { Chain, TGA, *InFlag };
- Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
} else {
SDValue Ops[] = { Chain, TGA };
- Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
+ Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -7264,11 +7410,49 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
X86::RAX, X86II::MO_TLSGD);
}
-// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
-// "local exec" model.
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ const EVT PtrVT,
+ bool is64Bit) {
+ DebugLoc dl = GA->getDebugLoc();
+
+ // Get the start address of the TLS block for this module.
+ X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+ .getInfo<X86MachineFunctionInfo>();
+ MFI->incNumLocalDynamicTLSAccesses();
+
+ SDValue Base;
+ if (is64Bit) {
+ Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+ X86II::MO_TLSLD, /*LocalDynamic=*/true);
+ } else {
+ SDValue InFlag;
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+ Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+ X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+ }
+
+ // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+ // of Base.
+
+ // Build x@dtpoff.
+ unsigned char OperandFlags = X86II::MO_DTPOFF;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+ GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ // Add x@dtpoff with the base.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
const EVT PtrVT, TLSModel::Model model,
- bool is64Bit) {
+ bool is64Bit, bool isPIC) {
DebugLoc dl = GA->getDebugLoc();
// Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
@@ -7286,25 +7470,36 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
unsigned WrapperKind = X86ISD::Wrapper;
if (model == TLSModel::LocalExec) {
OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
- } else if (is64Bit) {
- assert(model == TLSModel::InitialExec);
- OperandFlags = X86II::MO_GOTTPOFF;
- WrapperKind = X86ISD::WrapperRIP;
+ } else if (model == TLSModel::InitialExec) {
+ if (is64Bit) {
+ OperandFlags = X86II::MO_GOTTPOFF;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else {
+ OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+ }
} else {
- assert(model == TLSModel::InitialExec);
- OperandFlags = X86II::MO_INDNTPOFF;
+ llvm_unreachable("Unexpected model");
}
- // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
- // exec)
+ // emit "addl x@ntpoff,%eax" (local exec)
+ // or "addl x@indntpoff,%eax" (initial exec)
+  // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
GA->getValueType(0),
GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
- if (model == TLSModel::InitialExec)
+ if (model == TLSModel::InitialExec) {
+ if (isPIC && !is64Bit) {
+ Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+ Offset);
+ }
+
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ MachinePointerInfo::getGOT(), false, false, false,
+ 0);
+ }
// The address of the thread local variable is the add of the thread
// pointer with the offset of the variable.
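
The operand-flag selection above, condensed into a table (a sketch; the strings are the X86II spellings used in this diff):

#include <cstdio>

static const char *tlsExecReloc(bool LocalExec, bool Is64Bit, bool IsPIC) {
  if (LocalExec)
    return Is64Bit ? "MO_TPOFF" : "MO_NTPOFF";
  if (Is64Bit)
    return "MO_GOTTPOFF";          // RIP-relative GOT entry
  return IsPIC ? "MO_GOTNTPOFF"    // addl x@gotntpoff(%ebx),%eax
               : "MO_INDNTPOFF";   // addl x@indntpoff,%eax
}

int main() {
  std::printf("%s\n", tlsExecReloc(false, false, true)); // MO_GOTNTPOFF
}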
@@ -7318,29 +7513,26 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
const GlobalValue *GV = GA->getGlobal();
if (Subtarget->isTargetELF()) {
- // TODO: implement the "local dynamic" model
- // TODO: implement the "initial exec"model for pic executables
-
- // If GV is an alias then use the aliasee for determining
- // thread-localness.
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->resolveAliasedGlobal(false);
-
TLSModel::Model model = getTargetMachine().getTLSModel(GV);
switch (model) {
case TLSModel::GeneralDynamic:
- case TLSModel::LocalDynamic: // not implemented
if (Subtarget->is64Bit())
return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
-
+ case TLSModel::LocalDynamic:
+ return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+ Subtarget->is64Bit());
case TLSModel::InitialExec:
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
- Subtarget->is64Bit());
+ Subtarget->is64Bit(),
+ getTargetMachine().getRelocationModel() == Reloc::PIC_);
}
- } else if (Subtarget->isTargetDarwin()) {
+ llvm_unreachable("Unknown TLS model.");
+ }
+
+ if (Subtarget->isTargetDarwin()) {
// Darwin only has one model of TLS. Lower to that.
unsigned char OpFlag = 0;
unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
@@ -7383,7 +7575,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
Chain.getValue(1));
- } else if (Subtarget->isTargetWindows()) {
+ }
+
+ if (Subtarget->isTargetWindows()) {
// Just use the implicit TLS architecture
    // Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -7429,7 +7623,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
false, false, false, 0);
SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
- getPointerTy());
+ getPointerTy());
IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
@@ -7600,9 +7794,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
#ifdef __SSE3__
- haddpd %xmm0, %xmm0
+ haddpd %xmm0, %xmm0
#else
- pshufd $0x4e, %xmm0, %xmm1
+ pshufd $0x4e, %xmm0, %xmm1
addpd %xmm1, %xmm0
#endif
*/
@@ -7693,12 +7887,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
// Handle final rounding.
EVT DestVT = Op.getValueType();
- if (DestVT.bitsLT(MVT::f64)) {
+ if (DestVT.bitsLT(MVT::f64))
return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
DAG.getIntPtrConstant(0));
- } else if (DestVT.bitsGT(MVT::f64)) {
+ if (DestVT.bitsGT(MVT::f64))
return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
- }
// Handle final rounding.
return Sub;
@@ -7719,10 +7912,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
EVT DstVT = Op.getValueType();
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG);
- else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG);
- else if (Subtarget->is64Bit() &&
- SrcVT == MVT::i64 && DstVT == MVT::f32)
+ if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
@@ -7899,9 +8091,9 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
- else
- // The node is the result.
- return FIST;
+
+ // The node is the result.
+ return FIST;
}
SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
@@ -7916,9 +8108,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
FIST, StackSlot, MachinePointerInfo(),
false, false, false, 0);
- else
- // The node is the result.
- return FIST;
+
+ // The node is the result.
+ return FIST;
}
SDValue X86TargetLowering::LowerFABS(SDValue Op,
@@ -7931,7 +8123,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op,
EltVT = VT.getVectorElementType();
Constant *C;
if (EltVT == MVT::f64) {
- C = ConstantVector::getSplat(2,
+ C = ConstantVector::getSplat(2,
ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
} else {
C = ConstantVector::getSplat(4,
@@ -7965,15 +8157,15 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo::getConstantPool(),
false, false, false, 16);
if (VT.isVector()) {
- MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
+ MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::XOR, dl, XORVT,
- DAG.getNode(ISD::BITCAST, dl, XORVT,
- Op.getOperand(0)),
- DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
- } else {
- return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+ DAG.getNode(ISD::BITCAST, dl, XORVT,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
}
+
+ return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
}
SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
@@ -8172,7 +8364,9 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
// Otherwise use a regular EFLAGS-setting instruction.
switch (Op.getNode()->getOpcode()) {
default: llvm_unreachable("unexpected operator!");
- case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::SUB:
+ Opcode = X86ISD::SUB;
+ break;
case ISD::OR: Opcode = X86ISD::OR; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
@@ -8198,6 +8392,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, Op.getValueType()));
+ if (Opcode == X86ISD::CMP) {
+ SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0),
+ Op.getOperand(1));
+ // We can't replace usage of SUB with CMP.
+ // The SUB node will be removed later because there is no use of it.
+ return SDValue(New.getNode(), 0);
+ }
+
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0; i != NumOperands; ++i)
@@ -8217,9 +8419,41 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
return EmitTest(Op0, X86CC, DAG);
DebugLoc dl = Op0.getDebugLoc();
+ if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+ Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+ // Use SUB instead of CMP to enable CSE between SUB and CMP.
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
+ Op0, Op1);
+ return SDValue(Sub.getNode(), 1);
+ }
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
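Returning the SUB's flags result lets later CSE fold a comparison into a real subtraction of the same operands. A minimal sketch of the kind of source pattern this targets (hypothetical example, not taken from the patch):

    // 'a < b' and 'a - b' both lower to SUB a, b; the branch/cmov reads
    // EFLAGS from the same node whose integer result is returned.
    int sub_if_less(int a, int b) {
      return (a < b) ? a - b : 0;
    }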
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+ SelectionDAG &DAG) const {
+ // If the subtarget does not support the FUCOMI instruction, floating-point
+ // comparisons have to be converted.
+ if (Subtarget->hasCMov() ||
+ Cmp.getOpcode() != X86ISD::CMP ||
+ !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+ !Cmp.getOperand(1).getValueType().isFloatingPoint())
+ return Cmp;
+
+ // The instruction selector will select an FUCOM instruction instead of
+ // FUCOMI; FUCOM writes the comparison result to FPSW rather than EFLAGS.
+ // Hence build an SDNode sequence that transfers the result from FPSW into
+ // EFLAGS:
+ // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+ DebugLoc dl = Cmp.getDebugLoc();
+ SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+ SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+ DAG.getConstant(8, MVT::i8));
+ SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+ return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
+
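For readers less familiar with x87: the node sequence above corresponds roughly to the classic pre-CMOV compare idiom (illustrative sketch; actual instruction selection may differ):

    //   fucom              ; condition codes go to FPSW, not EFLAGS
    //   fnstsw %ax         ; FNSTSW16r: store FPSW into AX
    //   sahf               ; X86ISD::SAHF: AH -> EFLAGS (C0->CF, C2->PF, C3->ZF)
    // The SRL-by-8 plus TRUNCATE above models moving the FPSW condition
    // bits into AH before SAHF consumes them.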
/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
@@ -8341,6 +8575,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}
@@ -8350,24 +8585,22 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
- assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
+ assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
DebugLoc dl = Op.getDebugLoc();
SDValue CC = Op.getOperand(2);
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
MVT EltVT = VT.getVectorElementType().getSimpleVT();
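Without AVX2 there is no 256-bit integer compare, so the compare is performed per 128-bit half and reassembled. Schematically, for a v8i32 SETCC (sketch):

    // %res:v8i32 = setcc %lhs, %rhs, cc                  (no AVX2)
    //   =>
    // %lo = setcc extract(%lhs, 0), extract(%rhs, 0), cc ; v4i32
    // %hi = setcc extract(%lhs, 4), extract(%rhs, 4), cc ; v4i32
    // %res = concat_vectors %lo, %hi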
@@ -8389,10 +8622,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
if (isFP) {
- unsigned SSECC = 8;
+#ifndef NDEBUG
EVT EltVT = Op0.getValueType().getVectorElementType();
- assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+ unsigned SSECC;
bool Swap = false;
// SSE Condition code mapping:
@@ -8405,7 +8640,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
// 6 - NLE
// 7 - ORD
switch (SetCCOpcode) {
- default: break;
+ default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETOEQ:
case ISD::SETEQ: SSECC = 0; break;
case ISD::SETOGT:
@@ -8419,33 +8654,33 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETUO: SSECC = 3; break;
case ISD::SETUNE:
case ISD::SETNE: SSECC = 4; break;
- case ISD::SETULE: Swap = true;
+ case ISD::SETULE: Swap = true; // Fallthrough
case ISD::SETUGE: SSECC = 5; break;
- case ISD::SETULT: Swap = true;
+ case ISD::SETULT: Swap = true; // Fallthrough
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
+ case ISD::SETUEQ:
+ case ISD::SETONE: SSECC = 8; break;
}
if (Swap)
std::swap(Op0, Op1);
// In the two special cases we can't handle, emit two comparisons.
if (SSECC == 8) {
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
if (SetCCOpcode == ISD::SETUEQ) {
- SDValue UNORD, EQ;
- UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
- DAG.getConstant(3, MVT::i8));
- EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
- DAG.getConstant(0, MVT::i8));
- return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
- } else if (SetCCOpcode == ISD::SETONE) {
- SDValue ORD, NEQ;
- ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
- DAG.getConstant(7, MVT::i8));
- NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
- DAG.getConstant(4, MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
+ CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+ } else {
+ assert(SetCCOpcode == ISD::SETONE);
+ CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
}
- llvm_unreachable("Illegal FP comparison");
+
+ SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(CC0, MVT::i8));
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(CC1, MVT::i8));
+ return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
}
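SSE has no single predicate for these two cases, hence the pair of CMPP nodes; the immediates come from the condition-code table above:

    // SETUEQ: x ueq y  ==  unord(x,y) OR  (x == y)  -> cmpp $3 | cmpp $0
    // SETONE: x one y  ==  ord(x,y)   AND (x != y)  -> cmpp $7 & cmpp $4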
// Handle all other FP comparisons here.
return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
@@ -8453,17 +8688,17 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
}
// Break 256-bit integer vector compare into smaller ones.
- if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+ if (VT.is256BitVector() && !Subtarget->hasAVX2())
return Lower256IntVSETCC(Op, DAG);
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
- unsigned Opc = 0;
+ unsigned Opc;
bool Swap = false, Invert = false, FlipSigns = false;
switch (SetCCOpcode) {
- default: break;
+ default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: Invert = true;
case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
case ISD::SETLT: Swap = true;
@@ -8480,10 +8715,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
- if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
- return SDValue();
- if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
- return SDValue();
+ if (VT == MVT::v2i64) {
+ if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
+ return SDValue();
+ if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
+ return SDValue();
+ }
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
@@ -8510,7 +8747,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getNode()->getOpcode();
- if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+ Opc == X86ISD::SAHF)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD ||
@@ -8542,6 +8780,16 @@ static bool isAllOnes(SDValue V) {
return C && C->isAllOnesValue();
}
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+ if (V.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue VOp0 = V.getOperand(0);
+ unsigned InBits = VOp0.getValueSizeInBits();
+ unsigned Bits = V.getValueSizeInBits();
+ return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
+
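A hypothetical pattern the helper accepts, to make the MaskedValueIsZero query concrete:

    // The AND leaves bits [31:1] provably zero, so the truncation cannot
    // change the tested value and later code may read the wider operand.
    //   %m = and i32 %x, 1
    //   %t = trunc i32 %m to i8   ; high bits of %m known zero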
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool addTest = true;
SDValue Cond = Op.getOperand(0);
@@ -8572,8 +8820,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
SDValue CmpOp0 = Cmp.getOperand(0);
+ // Apply further optimizations for special cases:
+ // (select (x != 0), -1, 0) -> neg & sbb
+ // (select (x == 0), 0, -1) -> neg & sbb
+ if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
+ if (YC->isNullValue() &&
+ (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+ DAG.getConstant(0, CmpOp0.getValueType()),
+ CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
+
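The neg & sbb trick works because NEG sets CF exactly when its operand is non-zero, and SETCC_CARRY (sbb reg,reg) turns CF into 0 or all-ones without a branch. A hypothetical source-level pattern that now lowers this way:

    #include <cstdint>

    // (select (x != 0), -1, 0): expected lowering is 'neg; sbb reg,reg'
    // rather than a compare, setcc and sign-extend.
    uint32_t nonzero_mask(uint32_t x) {
      return (x != 0) ? UINT32_MAX : 0u;
    }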
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
SDValue Res = // Res = 0 or -1.
DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
@@ -8654,9 +8919,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (addTest) {
- // Look pass the truncate.
- if (Cond.getOpcode() == ISD::TRUNCATE)
- Cond = Cond.getOperand(0);
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -8679,7 +8944,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a < b ? 0 : -1 -> RES = setcc_carry
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
- if (Cond.getOpcode() == X86ISD::CMP) {
+ if (Cond.getOpcode() == X86ISD::SUB) {
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
@@ -8918,6 +9184,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
@@ -8947,6 +9214,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
+ Cmp = ConvertCmpIfNecessary(Cmp, DAG);
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
@@ -8960,9 +9228,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
}
if (addTest) {
- // Look pass the truncate.
- if (Cond.getOpcode() == ISD::TRUNCATE)
- Cond = Cond.getOperand(0);
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
// We know the result of AND is compared against zero. Try to match
// it to BT.
@@ -8980,6 +9248,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
CC = DAG.getConstant(X86::COND_NE, MVT::i8);
Cond = EmitTest(Cond, X86::COND_NE, DAG);
}
+ Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cond);
}
@@ -9018,7 +9287,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
const Function *F = MF.getFunction();
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; I++)
+ I != E; ++I)
if (I->hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
@@ -9201,12 +9470,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
if (isa<ConstantSDNode>(ShAmt)) {
+ // Constant may be a TargetConstant. Use a regular constant.
+ uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
switch (Opc) {
default: llvm_unreachable("Unknown target vector shift node");
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
- return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp,
+ DAG.getConstant(ShiftAmt, MVT::i32));
}
}
@@ -9223,10 +9495,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
SDValue ShOps[4];
ShOps[0] = ShAmt;
ShOps[1] = DAG.getConstant(0, MVT::i32);
- ShOps[2] = DAG.getUNDEF(MVT::i32);
- ShOps[3] = DAG.getUNDEF(MVT::i32);
+ ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
- ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
+
+ // The return type has to be a 128-bit type with the same element
+ // type as the input type.
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+ ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
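Worked instances of the ShVT computation, assuming illustrative value types:

    //   VT = v8i16  (128-bit) -> EltVT = i16 -> ShVT = v8i16
    //   VT = v16i16 (256-bit) -> EltVT = i16 -> ShVT = v8i16
    //   VT = v8i32  (256-bit) -> EltVT = i32 -> ShVT = v4i32
    // The old code bitcast to VT itself, which would not be a
    // size-preserving bitcast once VT is wider than the 128-bit
    // BUILD_VECTOR holding the shift amount.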
@@ -9261,8 +9538,8 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_sse2_ucomigt_sd:
case Intrinsic::x86_sse2_ucomige_sd:
case Intrinsic::x86_sse2_ucomineq_sd: {
- unsigned Opc = 0;
- ISD::CondCode CC = ISD::SETCC_INVALID;
+ unsigned Opc;
+ ISD::CondCode CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse_comieq_ss:
@@ -9336,245 +9613,102 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- // XOP comparison intrinsics
- case Intrinsic::x86_xop_vpcomltb:
- case Intrinsic::x86_xop_vpcomltw:
- case Intrinsic::x86_xop_vpcomltd:
- case Intrinsic::x86_xop_vpcomltq:
- case Intrinsic::x86_xop_vpcomltub:
- case Intrinsic::x86_xop_vpcomltuw:
- case Intrinsic::x86_xop_vpcomltud:
- case Intrinsic::x86_xop_vpcomltuq:
- case Intrinsic::x86_xop_vpcomleb:
- case Intrinsic::x86_xop_vpcomlew:
- case Intrinsic::x86_xop_vpcomled:
- case Intrinsic::x86_xop_vpcomleq:
- case Intrinsic::x86_xop_vpcomleub:
- case Intrinsic::x86_xop_vpcomleuw:
- case Intrinsic::x86_xop_vpcomleud:
- case Intrinsic::x86_xop_vpcomleuq:
- case Intrinsic::x86_xop_vpcomgtb:
- case Intrinsic::x86_xop_vpcomgtw:
- case Intrinsic::x86_xop_vpcomgtd:
- case Intrinsic::x86_xop_vpcomgtq:
- case Intrinsic::x86_xop_vpcomgtub:
- case Intrinsic::x86_xop_vpcomgtuw:
- case Intrinsic::x86_xop_vpcomgtud:
- case Intrinsic::x86_xop_vpcomgtuq:
- case Intrinsic::x86_xop_vpcomgeb:
- case Intrinsic::x86_xop_vpcomgew:
- case Intrinsic::x86_xop_vpcomged:
- case Intrinsic::x86_xop_vpcomgeq:
- case Intrinsic::x86_xop_vpcomgeub:
- case Intrinsic::x86_xop_vpcomgeuw:
- case Intrinsic::x86_xop_vpcomgeud:
- case Intrinsic::x86_xop_vpcomgeuq:
- case Intrinsic::x86_xop_vpcomeqb:
- case Intrinsic::x86_xop_vpcomeqw:
- case Intrinsic::x86_xop_vpcomeqd:
- case Intrinsic::x86_xop_vpcomeqq:
- case Intrinsic::x86_xop_vpcomequb:
- case Intrinsic::x86_xop_vpcomequw:
- case Intrinsic::x86_xop_vpcomequd:
- case Intrinsic::x86_xop_vpcomequq:
- case Intrinsic::x86_xop_vpcomneb:
- case Intrinsic::x86_xop_vpcomnew:
- case Intrinsic::x86_xop_vpcomned:
- case Intrinsic::x86_xop_vpcomneq:
- case Intrinsic::x86_xop_vpcomneub:
- case Intrinsic::x86_xop_vpcomneuw:
- case Intrinsic::x86_xop_vpcomneud:
- case Intrinsic::x86_xop_vpcomneuq:
- case Intrinsic::x86_xop_vpcomfalseb:
- case Intrinsic::x86_xop_vpcomfalsew:
- case Intrinsic::x86_xop_vpcomfalsed:
- case Intrinsic::x86_xop_vpcomfalseq:
- case Intrinsic::x86_xop_vpcomfalseub:
- case Intrinsic::x86_xop_vpcomfalseuw:
- case Intrinsic::x86_xop_vpcomfalseud:
- case Intrinsic::x86_xop_vpcomfalseuq:
- case Intrinsic::x86_xop_vpcomtrueb:
- case Intrinsic::x86_xop_vpcomtruew:
- case Intrinsic::x86_xop_vpcomtrued:
- case Intrinsic::x86_xop_vpcomtrueq:
- case Intrinsic::x86_xop_vpcomtrueub:
- case Intrinsic::x86_xop_vpcomtrueuw:
- case Intrinsic::x86_xop_vpcomtrueud:
- case Intrinsic::x86_xop_vpcomtrueuq: {
- unsigned CC = 0;
- unsigned Opc = 0;
-
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_xop_vpcomltb:
- case Intrinsic::x86_xop_vpcomltw:
- case Intrinsic::x86_xop_vpcomltd:
- case Intrinsic::x86_xop_vpcomltq:
- CC = 0;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomltub:
- case Intrinsic::x86_xop_vpcomltuw:
- case Intrinsic::x86_xop_vpcomltud:
- case Intrinsic::x86_xop_vpcomltuq:
- CC = 0;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomleb:
- case Intrinsic::x86_xop_vpcomlew:
- case Intrinsic::x86_xop_vpcomled:
- case Intrinsic::x86_xop_vpcomleq:
- CC = 1;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomleub:
- case Intrinsic::x86_xop_vpcomleuw:
- case Intrinsic::x86_xop_vpcomleud:
- case Intrinsic::x86_xop_vpcomleuq:
- CC = 1;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomgtb:
- case Intrinsic::x86_xop_vpcomgtw:
- case Intrinsic::x86_xop_vpcomgtd:
- case Intrinsic::x86_xop_vpcomgtq:
- CC = 2;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomgtub:
- case Intrinsic::x86_xop_vpcomgtuw:
- case Intrinsic::x86_xop_vpcomgtud:
- case Intrinsic::x86_xop_vpcomgtuq:
- CC = 2;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomgeb:
- case Intrinsic::x86_xop_vpcomgew:
- case Intrinsic::x86_xop_vpcomged:
- case Intrinsic::x86_xop_vpcomgeq:
- CC = 3;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomgeub:
- case Intrinsic::x86_xop_vpcomgeuw:
- case Intrinsic::x86_xop_vpcomgeud:
- case Intrinsic::x86_xop_vpcomgeuq:
- CC = 3;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomeqb:
- case Intrinsic::x86_xop_vpcomeqw:
- case Intrinsic::x86_xop_vpcomeqd:
- case Intrinsic::x86_xop_vpcomeqq:
- CC = 4;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomequb:
- case Intrinsic::x86_xop_vpcomequw:
- case Intrinsic::x86_xop_vpcomequd:
- case Intrinsic::x86_xop_vpcomequq:
- CC = 4;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomneb:
- case Intrinsic::x86_xop_vpcomnew:
- case Intrinsic::x86_xop_vpcomned:
- case Intrinsic::x86_xop_vpcomneq:
- CC = 5;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomneub:
- case Intrinsic::x86_xop_vpcomneuw:
- case Intrinsic::x86_xop_vpcomneud:
- case Intrinsic::x86_xop_vpcomneuq:
- CC = 5;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomfalseb:
- case Intrinsic::x86_xop_vpcomfalsew:
- case Intrinsic::x86_xop_vpcomfalsed:
- case Intrinsic::x86_xop_vpcomfalseq:
- CC = 6;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomfalseub:
- case Intrinsic::x86_xop_vpcomfalseuw:
- case Intrinsic::x86_xop_vpcomfalseud:
- case Intrinsic::x86_xop_vpcomfalseuq:
- CC = 6;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomtrueb:
- case Intrinsic::x86_xop_vpcomtruew:
- case Intrinsic::x86_xop_vpcomtrued:
- case Intrinsic::x86_xop_vpcomtrueq:
- CC = 7;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomtrueub:
- case Intrinsic::x86_xop_vpcomtrueuw:
- case Intrinsic::x86_xop_vpcomtrueud:
- case Intrinsic::x86_xop_vpcomtrueuq:
- CC = 7;
- Opc = X86ISD::VPCOMU;
- break;
- }
-
- SDValue LHS = Op.getOperand(1);
- SDValue RHS = Op.getOperand(2);
- return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
- DAG.getConstant(CC, MVT::i8));
- }
// Arithmetic intrinsics.
case Intrinsic::x86_sse2_pmulu_dq:
case Intrinsic::x86_avx2_pmulu_dq:
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+
+ // SSE3/AVX horizontal add/sub intrinsics
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
case Intrinsic::x86_avx_hadd_ps_256:
case Intrinsic::x86_avx_hadd_pd_256:
- return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse3_hsub_ps:
case Intrinsic::x86_sse3_hsub_pd:
case Intrinsic::x86_avx_hsub_ps_256:
case Intrinsic::x86_avx_hsub_pd_256:
- return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_ssse3_phadd_w_128:
case Intrinsic::x86_ssse3_phadd_d_128:
case Intrinsic::x86_avx2_phadd_w:
case Intrinsic::x86_avx2_phadd_d:
- return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_ssse3_phsub_w_128:
case Intrinsic::x86_ssse3_phsub_d_128:
case Intrinsic::x86_avx2_phsub_w:
- case Intrinsic::x86_avx2_phsub_d:
- return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(),
+ case Intrinsic::x86_avx2_phsub_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse3_hadd_ps:
+ case Intrinsic::x86_sse3_hadd_pd:
+ case Intrinsic::x86_avx_hadd_ps_256:
+ case Intrinsic::x86_avx_hadd_pd_256:
+ Opcode = X86ISD::FHADD;
+ break;
+ case Intrinsic::x86_sse3_hsub_ps:
+ case Intrinsic::x86_sse3_hsub_pd:
+ case Intrinsic::x86_avx_hsub_ps_256:
+ case Intrinsic::x86_avx_hsub_pd_256:
+ Opcode = X86ISD::FHSUB;
+ break;
+ case Intrinsic::x86_ssse3_phadd_w_128:
+ case Intrinsic::x86_ssse3_phadd_d_128:
+ case Intrinsic::x86_avx2_phadd_w:
+ case Intrinsic::x86_avx2_phadd_d:
+ Opcode = X86ISD::HADD;
+ break;
+ case Intrinsic::x86_ssse3_phsub_w_128:
+ case Intrinsic::x86_ssse3_phsub_d_128:
+ case Intrinsic::x86_avx2_phsub_w:
+ case Intrinsic::x86_avx2_phsub_d:
+ Opcode = X86ISD::HSUB;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ }
+
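The sixteen horizontal-math intrinsics now share one construction path and differ only in opcode. For reference, the pairwise semantics being preserved (standard SSE3 intrinsic, hypothetical wrapper):

    #include <pmmintrin.h>  // SSE3

    // _mm_hadd_ps({a0,a1,a2,a3}, {b0,b1,b2,b3})
    //   == {a0+a1, a2+a3, b0+b1, b2+b3}   (lowers to X86ISD::FHADD)
    __m128 pairwise_sums(__m128 a, __m128 b) {
      return _mm_hadd_ps(a, b);
    }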
+ // AVX2 variable shift intrinsics
case Intrinsic::x86_avx2_psllv_d:
case Intrinsic::x86_avx2_psllv_q:
case Intrinsic::x86_avx2_psllv_d_256:
case Intrinsic::x86_avx2_psllv_q_256:
- return DAG.getNode(ISD::SHL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_avx2_psrlv_d:
case Intrinsic::x86_avx2_psrlv_q:
case Intrinsic::x86_avx2_psrlv_d_256:
case Intrinsic::x86_avx2_psrlv_q_256:
- return DAG.getNode(ISD::SRL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_avx2_psrav_d_256: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ Opcode = ISD::SHL;
+ break;
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ Opcode = ISD::SRL;
+ break;
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ Opcode = ISD::SRA;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
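These map directly onto the generic shift nodes because AVX2 shifts each lane by the matching lane of the amount vector. A hypothetical source-level equivalent:

    #include <immintrin.h>

    // _mm256_sllv_epi32 reaches x86_avx2_psllv_d_256 and becomes ISD::SHL;
    // lane i of the result is a[i] << counts[i] (0 when counts[i] >= 32).
    __m256i shift_each(__m256i a, __m256i counts) {
      return _mm256_sllv_epi32(a, counts);
    }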
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::x86_ssse3_psign_b_128:
case Intrinsic::x86_ssse3_psign_w_128:
case Intrinsic::x86_ssse3_psign_d_128:
@@ -9583,15 +9717,18 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_avx2_psign_d:
return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+
case Intrinsic::x86_sse41_insertps:
return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
case Intrinsic::x86_avx_vperm2f128_ps_256:
case Intrinsic::x86_avx_vperm2f128_pd_256:
case Intrinsic::x86_avx_vperm2f128_si_256:
case Intrinsic::x86_avx2_vperm2i128:
return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
// Operands intentionally swapped. Mask is last operand to intrinsic,
@@ -9621,7 +9758,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_avx_vtestc_pd_256:
case Intrinsic::x86_avx_vtestnzc_pd_256: {
bool IsTestPacked = false;
- unsigned X86CC = 0;
+ unsigned X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx_vtestz_ps:
@@ -9672,44 +9809,93 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_avx2_psll_w:
case Intrinsic::x86_avx2_psll_d:
case Intrinsic::x86_avx2_psll_q:
- return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_sse2_psrl_d:
case Intrinsic::x86_sse2_psrl_q:
case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx2_psrl_d:
case Intrinsic::x86_avx2_psrl_q:
- return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_sse2_psra_d:
case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx2_psra_d:
- return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(),
+ case Intrinsic::x86_avx2_psra_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ Opcode = X86ISD::VSHL;
+ break;
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ Opcode = X86ISD::VSRL;
+ break;
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ Opcode = X86ISD::VSRA;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ }
+
+ // SSE/AVX immediate shift intrinsics
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
- return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), DAG);
case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
- return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), DAG);
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(),
+ case Intrinsic::x86_avx2_psrai_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ Opcode = X86ISD::VSHLI;
+ break;
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ Opcode = X86ISD::VSRLI;
+ break;
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ Opcode = X86ISD::VSRAI;
+ break;
+ }
+ return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
+ }
+
// Fix vector shift instructions where the last operand is a non-immediate
// i32 value.
case Intrinsic::x86_mmx_pslli_w:
@@ -9724,8 +9910,9 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
if (isa<ConstantSDNode>(ShAmt))
return SDValue();
- unsigned NewIntNo = 0;
+ unsigned NewIntNo;
switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_mmx_pslli_w:
NewIntNo = Intrinsic::x86_mmx_psll_w;
break;
@@ -9750,7 +9937,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::x86_mmx_psrai_d:
NewIntNo = Intrinsic::x86_mmx_psra_d;
break;
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
}
// The vector shift intrinsics with scalars uses 32b shift amounts but
@@ -9766,6 +9952,116 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(NewIntNo, MVT::i32),
Op.getOperand(1), ShAmt);
}
+ case Intrinsic::x86_sse42_pcmpistria128:
+ case Intrinsic::x86_sse42_pcmpestria128:
+ case Intrinsic::x86_sse42_pcmpistric128:
+ case Intrinsic::x86_sse42_pcmpestric128:
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ case Intrinsic::x86_sse42_pcmpistris128:
+ case Intrinsic::x86_sse42_pcmpestris128:
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ case Intrinsic::x86_sse42_pcmpestriz128: {
+ unsigned Opcode;
+ unsigned X86CC;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse42_pcmpistria128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpestria128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_A;
+ break;
+ case Intrinsic::x86_sse42_pcmpistric128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpestric128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse42_pcmpistrio128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpestrio128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_O;
+ break;
+ case Intrinsic::x86_sse42_pcmpistris128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpestris128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_S;
+ break;
+ case Intrinsic::x86_sse42_pcmpistriz128:
+ Opcode = X86ISD::PCMPISTRI;
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_sse42_pcmpestriz128:
+ Opcode = X86ISD::PCMPESTRI;
+ X86CC = X86::COND_E;
+ break;
+ }
+ SmallVector<SDValue, 5> NewOps;
+ NewOps.append(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8),
+ SDValue(PCMP.getNode(), 1));
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
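Each of the ten flag-reading string-compare intrinsics becomes one PCMPISTRI/PCMPESTRI node plus a SETCC on the named condition. At the source level these are the SSE4.2 builtins; a minimal hypothetical use of one:

    #include <nmmintrin.h>  // SSE4.2

    // _mm_cmpistrz corresponds to x86_sse42_pcmpistriz128 above:
    // PCMPISTRI, then SETCC on COND_E (ZF = 'b contains a null byte').
    int has_terminator(__m128i a, __m128i b) {
      return _mm_cmpistrz(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
    }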
+ case Intrinsic::x86_sse42_pcmpistri128:
+ case Intrinsic::x86_sse42_pcmpestri128: {
+ unsigned Opcode;
+ if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+ Opcode = X86ISD::PCMPISTRI;
+ else
+ Opcode = X86ISD::PCMPESTRI;
+
+ SmallVector<SDValue, 5> NewOps;
+ NewOps.append(Op->op_begin()+1, Op->op_end());
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+ }
+ }
+}
+
+SDValue
+X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+
+ // RDRAND intrinsics.
+ case Intrinsic::x86_rdrand_16:
+ case Intrinsic::x86_rdrand_32:
+ case Intrinsic::x86_rdrand_64: {
+ // Emit the node with the right value type.
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+ SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
+
+ // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
+ // return the value from RDRAND, which is always 0 in that case, cast to i32.
+ SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, Op->getValueType(1)),
+ DAG.getConstant(X86::COND_B, MVT::i32),
+ SDValue(Result.getNode(), 1) };
+ SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+ DAG.getVTList(Op->getValueType(1), MVT::Glue),
+ Ops, 4);
+
+ // Return { result, isValid, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+ SDValue(Result.getNode(), 2));
+ }
}
}
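The { result, isValid, chain } triple mirrors the C-level RDRAND step intrinsics, which report success via the return value. A hypothetical caller:

    #include <immintrin.h>  // requires -mrdrnd

    // _rdrand32_step returns 1 when the DRNG produced a valid value
    // (CF=1) and 0 otherwise -- the isValid CMOV built above.
    unsigned next_random(void) {
      unsigned value;
      while (!_rdrand32_step(&value))
        ;  // retry until entropy is available
      return value;
    }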
@@ -9816,7 +10112,6 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
@@ -9833,7 +10128,6 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
- MF.getRegInfo().addLiveOut(StoreAddrReg);
return DAG.getNode(X86ISD::EH_RETURN, dl,
MVT::Other,
@@ -10149,23 +10443,21 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
- assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+ assert(VT.is256BitVector() && VT.isInteger() &&
"Unsupported value type for operation");
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
DebugLoc dl = Op.getDebugLoc();
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
// Extract the RHS vectors
SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
- SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
@@ -10176,14 +10468,14 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
}
SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getValueType().getSizeInBits() == 256 &&
+ assert(Op.getValueType().is256BitVector() &&
Op.getValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
- assert(Op.getValueType().getSizeInBits() == 256 &&
+ assert(Op.getValueType().is256BitVector() &&
Op.getValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
@@ -10193,7 +10485,7 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+ if (VT.is256BitVector() && !Subtarget->hasAVX2())
return Lower256IntArith(Op, DAG);
assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
@@ -10310,6 +10602,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
+ llvm_unreachable("Unknown shift opcode.");
}
if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
@@ -10353,6 +10646,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
+ llvm_unreachable("Unknown shift opcode.");
}
}
}
@@ -10421,15 +10715,14 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
}
// Decompose 256-bit shifts into smaller 128-bit shifts.
- if (VT.getSizeInBits() == 256) {
+ if (VT.is256BitVector()) {
unsigned NumElems = VT.getVectorNumElements();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Extract the two vectors
- SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
- SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
+ SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
// Recreate the shift amount vectors
SDValue Amt1, Amt2;
@@ -10448,9 +10741,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
&Amt2Csts[0], NumElems/2);
} else {
// Variable shift amount
- Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
- Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
+ Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
}
// Issue new vector shifts for the smaller types
@@ -10560,20 +10852,18 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return SDValue();
if (!Subtarget->hasAVX2()) {
// needs to be split
- int NumElems = VT.getVectorNumElements();
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+ unsigned NumElems = VT.getVectorNumElements();
// Extract the LHS vectors
SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
EVT ExtraEltVT = ExtraVT.getVectorElementType();
- int ExtraNumElems = ExtraVT.getVectorNumElements();
+ unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
ExtraNumElems/2);
SDValue Extra = DAG.getValueType(ExtraVT);
@@ -10859,6 +11149,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
@@ -10913,9 +11204,9 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
Results.push_back(Swap.getValue(1));
}
-void X86TargetLowering::
+static void
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
- SelectionDAG &DAG, unsigned NewOp) const {
+ SelectionDAG &DAG, unsigned NewOp) {
DebugLoc dl = Node->getDebugLoc();
assert (Node->getValueType(0) == MVT::i64 &&
"Only know how to expand i64 atomics");
@@ -11013,7 +11304,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Regs64bit ? X86::RBX : X86::EBX,
swapInL, cpInH.getValue(1));
swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
- Regs64bit ? X86::RCX : X86::ECX,
+ Regs64bit ? X86::RCX : X86::ECX,
swapInH, swapInL.getValue(1));
SDValue Ops[] = { swapInH.getValue(0),
N->getOperand(1),
@@ -11036,26 +11327,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ATOMIC_LOAD_ADD:
- ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
- return;
case ISD::ATOMIC_LOAD_AND:
- ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
- return;
case ISD::ATOMIC_LOAD_NAND:
- ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
- return;
case ISD::ATOMIC_LOAD_OR:
- ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
- return;
case ISD::ATOMIC_LOAD_SUB:
- ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
- return;
case ISD::ATOMIC_LOAD_XOR:
- ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
- return;
- case ISD::ATOMIC_SWAP:
- ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
+ case ISD::ATOMIC_SWAP: {
+ unsigned Opc;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::ATOMIC_LOAD_ADD:
+ Opc = X86ISD::ATOMADD64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_AND:
+ Opc = X86ISD::ATOMAND64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_NAND:
+ Opc = X86ISD::ATOMNAND64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_OR:
+ Opc = X86ISD::ATOMOR64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_SUB:
+ Opc = X86ISD::ATOMSUB64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_XOR:
+ Opc = X86ISD::ATOMXOR64_DAG;
+ break;
+ case ISD::ATOMIC_SWAP:
+ Opc = X86ISD::ATOMSWAP64_DAG;
+ break;
+ }
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
return;
+ }
case ISD::ATOMIC_LOAD:
ReplaceATOMIC_LOAD(N, Results, DAG);
}
@@ -11118,10 +11423,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
@@ -11131,7 +11438,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
+ case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
@@ -11190,6 +11499,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
+ case X86ISD::SAHF: return "X86ISD::SAHF";
+ case X86ISD::RDRAND: return "X86ISD::RDRAND";
+ case X86ISD::FMADD: return "X86ISD::FMADD";
+ case X86ISD::FMSUB: return "X86ISD::FMSUB";
+ case X86ISD::FNMADD: return "X86ISD::FNMADD";
+ case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
+ case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
+ case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
}
}
@@ -11258,6 +11575,15 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
return true;
}
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return Imm == (int32_t)Imm;
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ // Can also use sub to handle negated immediates.
+ return Imm == (int32_t)Imm;
+}
+
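Both hooks accept exactly the immediates that fit a sign-extended 32-bit field, matching what x86-64 CMP/ADD ri32 encodings can hold. Worked values for the Imm == (int32_t)Imm test:

    //   Imm =  0x7FFFFFFFLL -> (int32_t)Imm == Imm              -> legal
    //   Imm = -0x80000000LL -> fits in int32                    -> legal
    //   Imm =  0x80000000LL -> sign-extends to a negative value -> not legal
    //   Imm = (1LL << 40)   -> low 32 bits are zero             -> not legal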
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
@@ -11300,8 +11626,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
isMOVLMask(M, VT) ||
isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
isPSHUFDMask(M, VT) ||
- isPSHUFHWMask(M, VT) ||
- isPSHUFLWMask(M, VT) ||
+ isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) ||
+ isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) ||
isPALIGNRMask(M, VT, Subtarget) ||
isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
@@ -11316,7 +11642,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
// FIXME: This collection of masks seems suspect.
if (NumElts == 2)
return true;
- if (NumElts == 4 && VT.getSizeInBits() == 128) {
+ if (NumElts == 4 && VT.is128BitVector()) {
return (isMOVLMask(Mask, VT) ||
isCommutedMOVLMask(Mask, VT, true) ||
isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
@@ -11460,7 +11786,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
// result in out1, out2
// fallthrough -->nextMBB
- const TargetRegisterClass *RC = X86::GR32RegisterClass;
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
const unsigned LoadOpc = X86::MOV32rm;
const unsigned NotOpc = X86::NOT32r;
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -11662,7 +11988,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
int valArgIndx = lastAddrIndx + 1;
- unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
for (int i=0; i <= lastAddrIndx; ++i)
(*MIB).addOperand(*argOpers[i]);
@@ -11672,7 +11998,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
argOpers[valArgIndx]->isImm()) &&
"invalid operand");
- unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
if (argOpers[valArgIndx]->isReg())
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
else
@@ -11687,7 +12013,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
MIB.addReg(t2);
// Generate movc
- unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
MIB.addReg(t2);
MIB.addReg(t1);
@@ -11742,8 +12068,7 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
MIB.addOperand(Op);
}
BuildMI(*BB, MI, dl,
- TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
- MI->getOperand(0).getReg())
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
.addReg(X86::XMM0);
MI->eraseFromParent();
@@ -11776,24 +12101,6 @@ X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
}
MachineBasicBlock *
-X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
- DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- // First arg in ECX, the second in EAX.
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
- .addReg(MI->getOperand(0).getReg());
- BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
- .addReg(MI->getOperand(1).getReg());
-
- // The instruction doesn't actually take any operands though.
- BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));
-
- MI->eraseFromParent(); // The pseudo is gone now.
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(
MachineInstr *MI,
MachineBasicBlock *MBB) const {
@@ -12306,8 +12613,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
+ .addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
@@ -12517,7 +12825,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Load the old value of the high byte of the control word...
unsigned OldCW =
- F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
+ F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
CWFrameIdx);
@@ -12582,22 +12890,35 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// String/text processing lowering.
case X86::PCMPISTRM128REG:
case X86::VPCMPISTRM128REG:
- return EmitPCMP(MI, BB, 3, false /* in-mem */);
case X86::PCMPISTRM128MEM:
case X86::VPCMPISTRM128MEM:
- return EmitPCMP(MI, BB, 3, true /* in-mem */);
case X86::PCMPESTRM128REG:
case X86::VPCMPESTRM128REG:
- return EmitPCMP(MI, BB, 5, false /* in mem */);
case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM:
- return EmitPCMP(MI, BB, 5, true /* in mem */);
+ case X86::VPCMPESTRM128MEM: {
+ unsigned NumArgs;
+ bool MemArg;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRM128REG:
+ case X86::VPCMPISTRM128REG:
+ NumArgs = 3; MemArg = false; break;
+ case X86::PCMPISTRM128MEM:
+ case X86::VPCMPISTRM128MEM:
+ NumArgs = 3; MemArg = true; break;
+ case X86::PCMPESTRM128REG:
+ case X86::VPCMPESTRM128REG:
+ NumArgs = 5; MemArg = false; break;
+ case X86::PCMPESTRM128MEM:
+ case X86::VPCMPESTRM128MEM:
+ NumArgs = 5; MemArg = true; break;
+ }
+ return EmitPCMP(MI, BB, NumArgs, MemArg);
+ }
// Thread synchronization.
case X86::MONITOR:
return EmitMonitor(MI, BB);
- case X86::MWAIT:
- return EmitMwait(MI, BB);
// Atomic Lowering.
case X86::ATOMAND32:
@@ -12605,25 +12926,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMOR32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
X86::OR32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMXOR32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
X86::XOR32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass);
+ &X86::GR32RegClass);
case X86::ATOMNAND32:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
X86::AND32ri, X86::MOV32rm,
X86::LCMPXCHG32,
X86::NOT32r, X86::EAX,
- X86::GR32RegisterClass, true);
+ &X86::GR32RegClass, true);
case X86::ATOMMIN32:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
case X86::ATOMMAX32:
@@ -12638,25 +12959,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMOR16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
X86::OR16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMXOR16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
X86::XOR16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass);
+ &X86::GR16RegClass);
case X86::ATOMNAND16:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
X86::AND16ri, X86::MOV16rm,
X86::LCMPXCHG16,
X86::NOT16r, X86::AX,
- X86::GR16RegisterClass, true);
+ &X86::GR16RegClass, true);
case X86::ATOMMIN16:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
case X86::ATOMMAX16:
@@ -12671,25 +12992,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMOR8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
X86::OR8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMXOR8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
X86::XOR8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass);
+ &X86::GR8RegClass);
case X86::ATOMNAND8:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
X86::AND8ri, X86::MOV8rm,
X86::LCMPXCHG8,
X86::NOT8r, X86::AL,
- X86::GR8RegisterClass, true);
+ &X86::GR8RegClass, true);
// FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
// This group is for 64-bit host.
case X86::ATOMAND64:
@@ -12697,25 +13018,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
X86::AND64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMOR64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
X86::OR64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMXOR64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
X86::XOR64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass);
+ &X86::GR64RegClass);
case X86::ATOMNAND64:
return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
X86::AND64ri32, X86::MOV64rm,
X86::LCMPXCHG64,
X86::NOT64r, X86::RAX,
- X86::GR64RegisterClass, true);
+ &X86::GR64RegClass, true);
case X86::ATOMMIN64:
return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
case X86::ATOMMAX64:
@@ -12870,10 +13191,10 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
/// inserting the result into the low part of a new 256-bit vector
static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
// vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+ for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
SVOp->getMaskElt(j) >= 0)
return false;
@@ -12886,10 +13207,10 @@ static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
/// inserting the result into the high part of a new 256-bit vector
static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
// vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+ for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
SVOp->getMaskElt(j) >= 0)
return false;
@@ -12906,7 +13227,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
V2.getOpcode() == ISD::CONCAT_VECTORS) {
@@ -12931,30 +13252,31 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// To match the shuffle mask, the first half of the mask should
// be exactly the first vector, and all the rest a splat with the
// first element of the second one.
- for (int i = 0; i < NumElems/2; ++i)
+ for (unsigned i = 0; i != NumElems/2; ++i)
if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
!isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
return SDValue();
// If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
- SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
- SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
- Ld->getMemoryVT(),
- Ld->getPointerInfo(),
- Ld->getAlignment(),
- false/*isVolatile*/, true/*ReadMem*/,
- false/*WriteMem*/);
- return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
- }
+ if (Ld->hasNUsesOfValue(1, 0)) {
+ SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+ SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+ Ld->getMemoryVT(),
+ Ld->getPointerInfo(),
+ Ld->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
+ return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+ }
+ }
// Emit a zeroed vector and insert the desired subvector on its
// first half.
SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
- DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}
@@ -12964,18 +13286,15 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
// vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
if (isShuffleHigh128VectorInsertLow(SVOp)) {
- SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
- V, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
return DCI.CombineTo(N, InsV);
}
// vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (isShuffleLow128VectorInsertHigh(SVOp)) {
- SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
- V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
return DCI.CombineTo(N, InsV);
}
@@ -12995,12 +13314,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
- if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+ if (Subtarget->hasAVX() && VT.is256BitVector() &&
N->getOpcode() == ISD::VECTOR_SHUFFLE)
return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
  // Only handle 128-bit vectors from here on.
- if (VT.getSizeInBits() != 128)
+ if (!VT.is128BitVector())
return SDValue();
// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
@@ -13014,16 +13333,17 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
}
-/// PerformTruncateCombine - Converts truncate operation to
+/// PerformTruncateCombine - Converts a truncate operation to
/// a sequence of vector shuffle operations.
/// It is possible when we truncate a 256-bit vector to a 128-bit vector.
-SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
DAGCombinerInfo &DCI) const {
if (!DCI.isBeforeLegalizeOps())
return SDValue();
- if (!Subtarget->hasAVX()) return SDValue();
+ if (!Subtarget->hasAVX())
+ return SDValue();
EVT VT = N->getValueType(0);
SDValue Op = N->getOperand(0);
@@ -13032,55 +13352,102 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+ if (Subtarget->hasAVX2()) {
+ // AVX2: v4i64 -> v4i32
+
+ // VPERMD
+ static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
+ Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
+ ShufMask);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // AVX: v4i64 -> v4i32
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(2));
+ DAG.getIntPtrConstant(2));
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
// PSHUFD
- int ShufMask1[] = {0, 2, 0, 0};
+ static const int ShufMask1[] = {0, 2, 0, 0};
- OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
- ShufMask1);
- OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
- ShufMask1);
+ SDValue Undef = DAG.getUNDEF(VT);
+ OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
// MOVLHPS
- int ShufMask2[] = {0, 1, 4, 5};
+ static const int ShufMask2[] = {0, 1, 4, 5};
return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
}
+
if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+ if (Subtarget->hasAVX2()) {
+ // AVX2: v8i32 -> v8i16
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
+
+ // PSHUFB
+ SmallVector<SDValue,32> pshufbMask;
+ for (unsigned i = 0; i < 2; ++i) {
+ pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+ for (unsigned j = 0; j < 8; ++j)
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
+ &pshufbMask[0], 32);
+ Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
+
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
+
+ static const int ShufMask[] = {0, 2, -1, -1};
+ Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64),
+ &ShufMask[0]);
+
+ Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+ DAG.getIntPtrConstant(0));
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+ }
+
SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0));
SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
- DAG.getIntPtrConstant(4));
+ DAG.getIntPtrConstant(4));
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
// PSHUFB
- int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
- -1, -1, -1, -1, -1, -1, -1, -1};
+ static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1};
- OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
- DAG.getUNDEF(MVT::v16i8),
- ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
- DAG.getUNDEF(MVT::v16i8),
- ShufMask1);
+ SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);
OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
// MOVLHPS
- int ShufMask2[] = {0, 1, 4, 5};
+ static const int ShufMask2[] = {0, 1, 4, 5};
SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
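
For intuition, a scalar sketch of why the VPERMD mask {0, 2, 4, 6} above implements the v4i64 -> v4i32 truncate (assumes little-endian x86 lane order; illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t v4i64[4] = {0x1111111100000001ULL, 0x2222222200000002ULL,
                       0x3333333300000003ULL, 0x4444444400000004ULL};
  // Truncation keeps the low 32 bits of each 64-bit lane.
  uint32_t expect[4];
  for (int i = 0; i != 4; ++i)
    expect[i] = uint32_t(v4i64[i]);
  // In the little-endian dword view of the same 256 bits, those low
  // halves live in dwords 0, 2, 4, 6 -- exactly the VPERMD mask above.
  // The PSHUFB byte mask {0,1,4,5,...} in the v8i32 -> v8i16 path plays
  // the same role one level down, picking the low byte pairs.
  uint32_t dwords[8];
  std::memcpy(dwords, v4i64, sizeof(dwords));
  for (int i = 0; i != 4; ++i)
    assert(dwords[2 * i] == expect[i]);
}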
@@ -13127,7 +13494,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SmallVector<int, 16> ShuffleMask;
bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle))
+ if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
+ UnaryShuffle))
return SDValue();
  // Select the input vector, guarding against an out-of-range extract.
@@ -13276,8 +13644,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
-
-
DebugLoc DL = N->getDebugLoc();
SDValue Cond = N->getOperand(0);
// Get the LHS/RHS of the select.
@@ -13559,9 +13925,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// to simplify previous instructions.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
- !DCI.isBeforeLegalize() &&
- TLI.isOperationLegal(ISD::VSELECT, VT)) {
+ !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+
+ // Don't optimize vector selects that map to mask-registers.
+ if (BitWidth == 1)
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
@@ -13576,6 +13946,88 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
+// condition code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+  // Quit unless this is a CMP, or a SUB whose value result is unused.
+ if (Cmp.getOpcode() != X86ISD::CMP &&
+ (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+ return SDValue();
+
+ // Quit if not used as a boolean value.
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ // Check CMP operands. One of them should be 0 or 1 and the other should be
+  // a SetCC or a value extended from it.
+ SDValue Op1 = Cmp.getOperand(0);
+ SDValue Op2 = Cmp.getOperand(1);
+
+ SDValue SetCC;
+ const ConstantSDNode* C = 0;
+ bool needOppositeCond = (CC == X86::COND_E);
+
+ if ((C = dyn_cast<ConstantSDNode>(Op1)))
+ SetCC = Op2;
+ else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+ SetCC = Op1;
+  else // Quit if neither operand is a constant.
+ return SDValue();
+
+ if (C->getZExtValue() == 1)
+ needOppositeCond = !needOppositeCond;
+ else if (C->getZExtValue() != 0)
+    // Quit if the constant is neither 0 nor 1.
+ return SDValue();
+
+ // Skip 'zext' node.
+ if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
+ SetCC = SetCC.getOperand(0);
+
+ // Quit if not SETCC.
+ // FIXME: So far we only handle the boolean value generated from SETCC. If
+  // there are other ways to generate boolean values, we need to handle them
+  // here as well.
+ if (SetCC.getOpcode() != X86ISD::SETCC)
+ return SDValue();
+
+ // Set the condition code or opposite one if necessary.
+ CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+
+ return SetCC.getOperand(1);
+}
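
A minimal scalar sketch of the equivalences this combine exploits (flag models the 0/1 value an X86ISD::SETCC produces; illustrative only):

#include <cassert>

int main() {
  for (int flag = 0; flag <= 1; ++flag) {
    // (CMP flag, 1) EQ and (CMP flag, 0) NEQ both reduce to the
    // original condition...
    assert((flag == 1) == (flag != 0));
    // ...while (CMP flag, 0) EQ and (CMP flag, 1) NEQ reduce to its
    // opposite (the GetOppositeBranchCondition case in the code above).
    assert((flag == 0) == (flag != 1));
  }
}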
+
+static bool IsValidFCMOVCondition(X86::CondCode CC) {
+ switch (CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_AE:
+ case X86::COND_A:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
@@ -13589,6 +14041,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);
+
if (CC == X86::COND_E || CC == X86::COND_NE) {
switch (Cond.getOpcode()) {
default: break;
@@ -13600,6 +14053,18 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ SDValue Flags;
+
+ Flags = BoolTestSetCCCombine(Cond, CC);
+ if (Flags.getNode() &&
+ // Extra check as FCMOV only supports a subset of X86 cond.
+ (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) {
+ SDValue Ops[] = { FalseOp, TrueOp,
+ DAG.getConstant(CC, MVT::i8), Flags };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+ Ops, array_lengthof(Ops));
+ }
+
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
@@ -14022,7 +14487,7 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) {
// Sometimes the operand may come from a insert_subvector building a 256-bit
// allones vector
- if (VT.getSizeInBits() == 256 &&
+ if (VT.is256BitVector() &&
N->getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
@@ -14260,6 +14725,41 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Generate NEG and CMOV for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // Since X86 does not have CMOV for 8-bit integer, we don't convert
+ // 8-bit integer abs to NEG and CMOV.
+ if (VT.isInteger() && VT.getSizeInBits() == 8)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+ // and change it to SUB and CMOV.
+ if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+ N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA &&
+ N1.getOperand(0) == N0.getOperand(0))
+ if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+ if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
+ // Generate SUB & CMOV.
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+ DAG.getConstant(0, VT), N0.getOperand(0));
+
+ SDValue Ops[] = { N0.getOperand(0), Neg,
+ DAG.getConstant(X86::COND_GE, MVT::i8),
+ SDValue(Neg.getNode(), 1) };
+ return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
+ Ops, array_lengthof(Ops));
+ }
+ return SDValue();
+}
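
The pattern being matched is the classic branchless abs. A standalone sketch of both forms (assumes arithmetic right shift on signed ints, as on x86; illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

// The XOR(ADD(X, Y), Y) pattern with Y = SRA(X, 31) is the branchless abs
// idiom; the combine rewrites it as NEG (SUB 0, X) plus a CMOV on COND_GE.
static int32_t absViaXor(int32_t x) {
  int32_t y = x >> 31;            // SRA(X, size-1): 0 or -1
  return (x + y) ^ y;             // XOR(ADD(X, Y), Y)
}

static int32_t absViaCmov(int32_t x) {
  int32_t neg = 0 - x;            // NEG sets flags from the subtraction
  return x >= 0 ? x : neg;        // CMOV picks x or -x
}

int main() {
  for (int32_t x : {0, 1, -1, 42, -42, INT32_MIN + 1})
    assert(absViaXor(x) == absViaCmov(x));
}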
+
// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -14267,6 +14767,16 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (Subtarget->hasCMov()) {
+ SDValue RV = performIntegerAbsCombine(N, DAG);
+ if (RV.getNode())
+ return RV;
+ }
+
+ // Try forming BMI if it is available.
+ if (!Subtarget->hasBMI())
+ return SDValue();
+
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
@@ -14292,7 +14802,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
@@ -14314,63 +14825,94 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
unsigned RegSz = RegVT.getSizeInBits();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
- // All sizes must be a power of two
- if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
- // Attempt to load the original value using a single load op.
- // Find a scalar type which is equal to the loaded word size.
+ // All sizes must be a power of two.
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+ return SDValue();
+
+ // Attempt to load the original value using scalar loads.
+ // Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
MVT Tp = (MVT::SimpleValueType)tp;
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+ if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
- break;
}
}
- // Proceed if a load word is found.
- if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+  // On 32-bit systems, we can't load 64-bit integers. Try bitcasting to f64.
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+ (64 <= MemSz))
+ SclrLoadTy = MVT::f64;
+ // Calculate the number of scalar loads that we need to perform
+ // in order to load our vector from memory.
+ unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+ // Represent our vector as a sequence of elements which are the
+ // largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
RegSz/SclrLoadTy.getSizeInBits());
+ // Represent the data using the same element type that is stored in
+  // memory. In practice, we "widen" MemVT.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
RegSz/MemVT.getScalarType().getSizeInBits());
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
- // Perform a single load.
- SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+ "Invalid vector type");
- // Insert the word loaded into a vector.
- SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- LoadUnitVecVT, ScalarLoad);
+ // We can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
+
+ SmallVector<SDValue, 8> Chains;
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ptr, Ld->getPointerInfo(),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment());
+ Chains.push_back(ScalarLoad.getValue(1));
+ // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+ // another round of DAGCombining.
+ if (i == 0)
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+ else
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+ ScalarLoad, DAG.getIntPtrConstant(i));
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
- SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
- ScalarInVector);
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
unsigned SizeRatio = RegSz/MemSz;
// Redistribute the loaded elements into the different locations.
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i*SizeRatio] = i;
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(SlicedVec.getValueType()),
- ShuffleVec.data());
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
// Bitcast to the requested type.
Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
// Replace the original load with the new sequence
// and return the new chain.
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
- return SDValue(ScalarLoad.getNode(), 1);
+ return DCI.CombineTo(N, Shuff, TF, true);
}
return SDValue();
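
A small sketch of the ShuffleVec layout computed above (illustrative only): the scalar loads leave the memory elements packed at the bottom of the wide vector, and the shuffle spreads them out to stride SizeRatio so each lands in the slot it occupies after the extension:

#include <cassert>
#include <vector>

int main() {
  // E.g. a v4i16 extending load into a v4i32 register: SizeRatio = 2.
  unsigned NumElems = 4, SizeRatio = 2;
  std::vector<int> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i * SizeRatio] = i;
  // Loaded elements at even slots, undef (-1) padding in between.
  std::vector<int> Expect = {0, -1, 1, -1, 2, -1, 3, -1};
  assert(ShuffleVec == Expect);
}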
@@ -14387,13 +14929,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we are saving a concatenation of two XMM registers, perform two stores.
- // This is better in Sandy Bridge cause one 256-bit mem op is done via two
- // 128-bit ones. If in the future the cost becomes only one memory access the
- // first version would be better.
- if (VT.getSizeInBits() == 256 &&
- StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
- StoredVal.getNumOperands() == 2) {
-
+ // On Sandy Bridge, 256-bit memory operations are executed by two
+ // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
+ // memory operation.
+ if (VT.is256BitVector() && !Subtarget->hasAVX2() &&
+ StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+ StoredVal.getNumOperands() == 2) {
SDValue Value0 = StoredVal.getOperand(0);
SDValue Value1 = StoredVal.getOperand(1);
@@ -14438,14 +14979,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
- // Can't shuffle using an illegal type
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
- DAG.getUNDEF(WideVec.getValueType()),
- ShuffleVec.data());
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
// At this point all of the data is stored at the bottom of the
// register. We now need to save it to mem.
@@ -14454,13 +14997,18 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
MVT Tp = (MVT::SimpleValueType)tp;
- if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
+  // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
+ if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+ (64 <= NumElems * ToSz))
+ StoreType = MVT::f64;
+
// Bitcast the original vector into a vector of store-size units
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+ StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
@@ -14469,7 +15017,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Ptr = St->getBasePtr();
// Perform one or more big stores into memory.
- for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+ for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
StoreType, ShuffWide,
DAG.getIntPtrConstant(i));
@@ -14818,18 +15366,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps())
return SDValue();
- if (!Subtarget->hasAVX())
+ if (!Subtarget->hasAVX())
return SDValue();
- // Optimize vectors in AVX mode
- // Sign extend v8i16 to v8i32 and
- // v4i32 to v4i64
- //
- // Divide input vector into two parts
- // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
- // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
- // concat the vectors to original VT
-
EVT VT = N->getValueType(0);
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
@@ -14838,23 +15377,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
(VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
+
+ // Optimize vectors in AVX mode
+ // Sign extend v8i16 to v8i32 and
+ // v4i32 to v4i64
+ //
+ // Divide input vector into two parts
+ // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+ // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+ // concat the vectors to original VT
+
unsigned NumElems = OpVT.getVectorNumElements();
+ SDValue Undef = DAG.getUNDEF(OpVT);
+
SmallVector<int,8> ShufMask1(NumElems, -1);
- for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask1[i] = i;
- SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
- ShufMask1.data());
+ SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
SmallVector<int,8> ShufMask2(NumElems, -1);
- for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask2[i] = i + NumElems/2;
- SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
- ShufMask2.data());
+ SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
VT.getVectorNumElements()/2);
- OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+ OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
@@ -14862,7 +15415,42 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+
+ EVT ScalarVT = VT.getScalarType();
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+ return SDValue();
+
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+ SDValue C = N->getOperand(2);
+
+ bool NegA = (A.getOpcode() == ISD::FNEG);
+ bool NegB = (B.getOpcode() == ISD::FNEG);
+ bool NegC = (C.getOpcode() == ISD::FNEG);
+
+ // Negative multiplication when NegA xor NegB
+ bool NegMul = (NegA != NegB);
+ if (NegA)
+ A = A.getOperand(0);
+ if (NegB)
+ B = B.getOperand(0);
+ if (NegC)
+ C = C.getOperand(0);
+
+ unsigned Opcode;
+ if (!NegMul)
+ Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+ else
+ Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+ return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
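
A scalar model of the opcode selection above, using std::fma to stand in for the four X86 FMA flavors (illustrative only; exact for these small integer inputs):

#include <cassert>
#include <cmath>

// Negating exactly one multiplicand negates the product (NegMul = NegA !=
// NegB); NegC picks the ADD/SUB flavor.
static double pick(bool NegMul, bool NegC, double a, double b, double c) {
  if (!NegMul)
    return !NegC ? std::fma(a, b, c)     // FMADD:   a*b + c
                 : std::fma(a, b, -c);   // FMSUB:   a*b - c
  return !NegC ? std::fma(-a, b, c)      // FNMADD: -(a*b) + c
               : std::fma(-a, b, -c);    // FNMSUB: -(a*b) - c
}

int main() {
  double a = 2, b = 3, c = 5;
  assert(pick(false, false, a, b, c) == a * b + c);
  assert(pick(true, true, a, b, c) == -(a * b) - c);
}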
+
static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
// (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
// (and (i32 x86isd::setcc_carry), 1)
@@ -14887,6 +15475,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, VT));
}
+
// Optimize vectors in AVX mode:
//
// v8i16 -> v8i32
@@ -14899,50 +15488,139 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
- if (Subtarget->hasAVX()) {
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (!Subtarget->hasAVX())
+ return SDValue();
- if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
+ if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
- SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
- SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG);
- SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG);
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
- EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- VT.getVectorNumElements()/2);
+ SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
+ SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
+ SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
- OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
- }
+ OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+ return SDValue();
+}
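
For intuition, a scalar sketch of the unpack-with-zero trick used above (assumes little-endian lane order; illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint16_t src[8]  = {1, 2, 3, 4, 5, 6, 7, 8};
  uint16_t zero[8] = {0};
  // vpunpcklwd-style interleave of the low halves: src0, 0, src1, 0, ...
  uint16_t unpckl[8];
  for (int i = 0; i != 4; ++i) {
    unpckl[2 * i]     = src[i];
    unpckl[2 * i + 1] = zero[i];
  }
  // Reinterpreted as 32-bit lanes, this is exactly zext(src[0..3]); the
  // vpunpckhwd half handles src[4..7] the same way.
  uint32_t lanes[4];
  std::memcpy(lanes, unpckl, sizeof(lanes));
  for (int i = 0; i != 4; ++i)
    assert(lanes[i] == uint32_t(src[i]));
}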
+
+// Optimize x == -y --> x+y == 0
+// x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
+ if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ LHS.getValueType(), RHS, LHS.getOperand(1));
+ return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ addV, DAG.getConstant(0, addV.getValueType()), CC);
+ }
+ if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
+ if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
+ SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+ RHS.getValueType(), LHS, RHS.getOperand(1));
+ return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+ addV, DAG.getConstant(0, addV.getValueType()), CC);
+ }
return SDValue();
}
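
A quick scalar check of the identity this combine relies on, using unsigned arithmetic to model the wrapping ADD node (illustrative only):

#include <cassert>
#include <cstdint>

// In two's complement, x == -y exactly when x + y wraps to 0, so the
// negation can be folded into an ADD compared against zero.
int main() {
  int32_t cases[][2] = {{5, -5}, {-7, 7}, {0, 0}, {3, 4}};
  for (auto &p : cases) {
    int32_t x = p[0], y = p[1];
    assert((x == -y) == (uint32_t(x) + uint32_t(y) == 0));
  }
}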
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
- unsigned X86CC = N->getConstantOperandVal(0);
- SDValue EFLAG = N->getOperand(1);
DebugLoc DL = N->getDebugLoc();
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+ SDValue EFLAGS = N->getOperand(1);
// Materialize "setb reg" as "sbb reg,reg", since it can be extended without
// a zext and produces an all-ones bit which is more useful than 0/1 in some
// cases.
- if (X86CC == X86::COND_B)
+ if (CC == X86::COND_B)
return DAG.getNode(ISD::AND, DL, MVT::i8,
DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86CC, MVT::i8), EFLAG),
+ DAG.getConstant(CC, MVT::i8), EFLAGS),
DAG.getConstant(1, MVT::i8));
+ SDValue Flags;
+
+ Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ if (Flags.getNode()) {
+ SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+ }
+
+ return SDValue();
+}
+
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest = N->getOperand(1);
+ SDValue EFLAGS = N->getOperand(3);
+ X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+ SDValue Flags;
+
+ Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ if (Flags.getNode()) {
+ SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+ Flags);
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = N->getOperand(0);
+ EVT InVT = Op0->getValueType(0);
+
+ // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32))
+ if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+ // Notice that we use SINT_TO_FP because we know that the high bits
+ // are zero and SINT_TO_FP is better supported by the hardware.
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+ }
+
return SDValue();
}
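
A standalone sketch of why the signed conversion is safe here (illustrative only): once the 8-bit value is zero-extended, the sign bit of the widened value is always clear, so signed and unsigned conversion agree:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned v = 0; v != 256; ++v) {
    uint8_t u = uint8_t(v);
    int32_t widened = int32_t(uint32_t(u));  // ZERO_EXTEND to i32
    assert(float(widened) == float(u));      // SINT_TO_FP == UINT_TO_FP
  }
}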
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
const X86TargetLowering *XTLI) {
SDValue Op0 = N->getOperand(0);
+ EVT InVT = Op0->getValueType(0);
+
+ // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
+ if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+ }
+
// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
// a 32-bit target where SSE doesn't support i64->FP operations.
if (Op0.getOpcode() == ISD::LOAD) {
@@ -14961,6 +15639,20 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+  // v4i8 = FP_TO_SINT(x) -> v4i8 = TRUNCATE(v4i32 = FP_TO_SINT(x))
+ if (VT == MVT::v8i8 || VT == MVT::v4i8) {
+ DebugLoc dl = N->getDebugLoc();
+ MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+ SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
+ }
+
+ return SDValue();
+}
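
A scalar model of the widen-then-truncate sequence (illustrative only; exact for values in i8 range): there is no direct f32 -> i8 vector conversion, so the value is converted to i32 lanes first and the result truncated.

#include <cassert>
#include <cstdint>

int main() {
  float in[4] = {1.5f, -2.5f, 100.0f, -100.0f};
  for (float f : in) {
    int32_t wide = int32_t(f);     // FP_TO_SINT into the i32 lane
    int8_t narrow = int8_t(wide);  // TRUNCATE down to the i8 lane
    assert(narrow == int8_t(f));   // matches the direct conversion
  }
}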
+
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -15095,9 +15787,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
- case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -15105,10 +15799,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
- case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
+ case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
+ case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGN:
case X86ISD::UNPCKH:
@@ -15123,6 +15820,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+ case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
}
return SDValue();
@@ -15652,55 +16350,55 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// in the normal allocation?
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget->is64Bit()) {
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, X86::GR32RegisterClass);
- else if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16RegisterClass);
- else if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8RegisterClass);
- else if (VT == MVT::i64 || VT == MVT::f64)
- return std::make_pair(0U, X86::GR64RegisterClass);
- break;
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8RegClass);
+ if (VT == MVT::i64 || VT == MVT::f64)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
}
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
- else if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
- else if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
- else if (VT == MVT::i64)
- return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i8 || VT == MVT::i1)
+ return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+ if (VT == MVT::i64)
+ return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8RegisterClass);
+ return std::make_pair(0U, &X86::GR8RegClass);
if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16RegisterClass);
+ return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
- return std::make_pair(0U, X86::GR32RegisterClass);
- return std::make_pair(0U, X86::GR64RegisterClass);
+ return std::make_pair(0U, &X86::GR32RegClass);
+ return std::make_pair(0U, &X86::GR64RegClass);
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
- return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
- return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR16_NOREXRegClass);
if (VT == MVT::i32 || !Subtarget->is64Bit())
- return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
- return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
+ return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
- return std::make_pair(0U, X86::RFP32RegisterClass);
+ return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
- return std::make_pair(0U, X86::RFP64RegisterClass);
- return std::make_pair(0U, X86::RFP80RegisterClass);
+ return std::make_pair(0U, &X86::RFP64RegClass);
+ return std::make_pair(0U, &X86::RFP80RegClass);
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget->hasMMX()) break;
- return std::make_pair(0U, X86::VR64RegisterClass);
+ return std::make_pair(0U, &X86::VR64RegClass);
case 'Y': // SSE_REGS if SSE2 allowed
if (!Subtarget->hasSSE2()) break;
// FALL THROUGH.
@@ -15712,10 +16410,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// Scalar SSE types.
case MVT::f32:
case MVT::i32:
- return std::make_pair(0U, X86::FR32RegisterClass);
+ return std::make_pair(0U, &X86::FR32RegClass);
case MVT::f64:
case MVT::i64:
- return std::make_pair(0U, X86::FR64RegisterClass);
+ return std::make_pair(0U, &X86::FR64RegClass);
// Vector types.
case MVT::v16i8:
case MVT::v8i16:
@@ -15723,7 +16421,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v2i64:
case MVT::v4f32:
case MVT::v2f64:
- return std::make_pair(0U, X86::VR128RegisterClass);
+ return std::make_pair(0U, &X86::VR128RegClass);
// AVX types.
case MVT::v32i8:
case MVT::v16i16:
@@ -15731,8 +16429,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case MVT::v4i64:
case MVT::v8f32:
case MVT::v4f64:
- return std::make_pair(0U, X86::VR256RegisterClass);
-
+ return std::make_pair(0U, &X86::VR256RegClass);
}
break;
}
@@ -15755,28 +16452,28 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Constraint[6] == '}') {
Res.first = X86::ST0+Constraint[4]-'0';
- Res.second = X86::RFP80RegisterClass;
+ Res.second = &X86::RFP80RegClass;
return Res;
}
// GCC allows "st(0)" to be called just plain "st".
if (StringRef("{st}").equals_lower(Constraint)) {
Res.first = X86::ST0;
- Res.second = X86::RFP80RegisterClass;
+ Res.second = &X86::RFP80RegClass;
return Res;
}
// flags -> EFLAGS
if (StringRef("{flags}").equals_lower(Constraint)) {
Res.first = X86::EFLAGS;
- Res.second = X86::CCRRegisterClass;
+ Res.second = &X86::CCRRegClass;
return Res;
}
// 'A' means EAX + EDX.
if (Constraint == "A") {
Res.first = X86::EAX;
- Res.second = X86::GR32_ADRegisterClass;
+ Res.second = &X86::GR32_ADRegClass;
return Res;
}
return Res;
@@ -15792,7 +16489,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
// really want an 8-bit or 32-bit register, map to the appropriate register
// class and return the appropriate register.
- if (Res.second == X86::GR16RegisterClass) {
+ if (Res.second == &X86::GR16RegClass) {
if (VT == MVT::i8) {
unsigned DestReg = 0;
switch (Res.first) {
@@ -15804,7 +16501,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR8RegisterClass;
+ Res.second = &X86::GR8RegClass;
}
} else if (VT == MVT::i32) {
unsigned DestReg = 0;
@@ -15821,7 +16518,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR32RegisterClass;
+ Res.second = &X86::GR32RegClass;
}
} else if (VT == MVT::i64) {
unsigned DestReg = 0;
@@ -15838,22 +16535,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
}
if (DestReg) {
Res.first = DestReg;
- Res.second = X86::GR64RegisterClass;
+ Res.second = &X86::GR64RegClass;
}
}
- } else if (Res.second == X86::FR32RegisterClass ||
- Res.second == X86::FR64RegisterClass ||
- Res.second == X86::VR128RegisterClass) {
+ } else if (Res.second == &X86::FR32RegClass ||
+ Res.second == &X86::FR64RegClass ||
+ Res.second == &X86::VR128RegClass) {
// Handle references to XMM physical registers that got mapped into the
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
- if (VT == MVT::f32)
- Res.second = X86::FR32RegisterClass;
- else if (VT == MVT::f64)
- Res.second = X86::FR64RegisterClass;
- else if (X86::VR128RegisterClass->hasType(VT))
- Res.second = X86::VR128RegisterClass;
+
+ if (VT == MVT::f32 || VT == MVT::i32)
+ Res.second = &X86::FR32RegClass;
+ else if (VT == MVT::f64 || VT == MVT::i64)
+ Res.second = &X86::FR64RegClass;
+ else if (X86::VR128RegClass.hasType(VT))
+ Res.second = &X86::VR128RegClass;
+ else if (X86::VR256RegClass.hasType(VT))
+ Res.second = &X86::VR256RegClass;
}
return Res;