diff options
Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86ISelLowering.cpp | 3094 |
1 files changed, 1897 insertions, 1197 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 04299f3..4af12e4 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -49,6 +49,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include <bitset> +#include <cctype> using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -62,41 +63,33 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, /// simple subregister reference. Idx is an index in the 128 bits we /// want. It need not be aligned to a 128-bit bounday. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. -static SDValue Extract128BitVector(SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, - DebugLoc dl) { +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, DebugLoc dl) { EVT VT = Vec.getValueType(); - assert(VT.getSizeInBits() == 256 && "Unexpected vector size!"); + assert(VT.is256BitVector() && "Unexpected vector size!"); EVT ElVT = VT.getVectorElementType(); - int Factor = VT.getSizeInBits()/128; + unsigned Factor = VT.getSizeInBits()/128; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract from UNDEF is UNDEF. if (Vec.getOpcode() == ISD::UNDEF) - return DAG.getNode(ISD::UNDEF, dl, ResultVT); - - if (isa<ConstantSDNode>(Idx)) { - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return DAG.getUNDEF(ResultVT); - // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR - // we can match to VEXTRACTF128. - unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); + // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR + // we can match to VEXTRACTF128. + unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); - // This is the index of the first element of the 128-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) - * ElemsPerChunk); + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); - SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, - VecIdx); - - return Result; - } + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, + VecIdx); - return SDValue(); + return Result; } /// Generate a DAG to put 128-bits into a vector > 128 bits. This @@ -104,34 +97,41 @@ static SDValue Extract128BitVector(SDValue Vec, /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit bounday. That makes /// lowering INSERT_VECTOR_ELT operations easier. 
-static SDValue Insert128BitVector(SDValue Result, - SDValue Vec, - SDValue Idx, - SelectionDAG &DAG, +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, DebugLoc dl) { - if (isa<ConstantSDNode>(Idx)) { - EVT VT = Vec.getValueType(); - assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; - EVT ElVT = VT.getVectorElementType(); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - EVT ResultVT = Result.getValueType(); + EVT VT = Vec.getValueType(); + assert(VT.is128BitVector() && "Unexpected vector size!"); - // Insert the relevant 128 bits. - unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); - // This is the index of the first element of the 128-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) - * ElemsPerChunk); + // Insert the relevant 128 bits. + unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); - Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, - VecIdx); - return Result; - } + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) + * ElemsPerChunk); - return SDValue(); + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, + VecIdx); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. +static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + DebugLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { @@ -140,10 +140,12 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { if (Subtarget->isTargetEnvMacho()) { if (is64Bit) - return new X8664_MachoTargetObjectFile(); + return new X86_64MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); } + if (Subtarget->isTargetLinux()) + return new X86LinuxTargetObjectFile(); if (Subtarget->isTargetELF()) return new TargetLoweringObjectFileELF(); if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) @@ -162,7 +164,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) TD = getTargetData(); // Set up the TargetLowering object. - static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; + static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); @@ -171,11 +173,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. - // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling. - if (Subtarget->is64Bit()) + // For Atom, always use ILP scheduling. 
+ if (Subtarget->isAtom()) + setSchedulingPreference(Sched::ILP); + else if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); - else if (Subtarget->isAtom()) - setSchedulingPreference(Sched::Hybrid); else setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); @@ -215,11 +217,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Set up the register classes. - addRegisterClass(MVT::i8, X86::GR8RegisterClass); - addRegisterClass(MVT::i16, X86::GR16RegisterClass); - addRegisterClass(MVT::i32, X86::GR32RegisterClass); + addRegisterClass(MVT::i8, &X86::GR8RegClass); + addRegisterClass(MVT::i16, &X86::GR16RegClass); + addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget->is64Bit()) - addRegisterClass(MVT::i64, X86::GR64RegisterClass); + addRegisterClass(MVT::i64, &X86::GR64RegClass); setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); @@ -345,7 +347,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. - for (unsigned i = 0, e = 4; i != e; ++i) { + for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); @@ -492,7 +494,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setShouldFoldAtomicFences(true); // Expand certain atomics - for (unsigned i = 0, e = 4; i != e; ++i) { + for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); @@ -567,8 +569,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. - addRegisterClass(MVT::f32, X86::FR32RegisterClass); - addRegisterClass(MVT::f64, X86::FR64RegisterClass); + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::FR64RegClass); // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS , MVT::f64, Custom); @@ -599,8 +601,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. - addRegisterClass(MVT::f32, X86::FR32RegisterClass); - addRegisterClass(MVT::f64, X86::RFP64RegisterClass); + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); @@ -632,8 +634,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. 
- addRegisterClass(MVT::f64, X86::RFP64RegisterClass); - addRegisterClass(MVT::f32, X86::RFP32RegisterClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + addRegisterClass(MVT::f32, &X86::RFP32RegClass); setOperationAction(ISD::UNDEF, MVT::f64, Expand); setOperationAction(ISD::UNDEF, MVT::f32, Expand); @@ -641,7 +643,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f32 , Expand); setOperationAction(ISD::FSIN , MVT::f64 , Expand); + setOperationAction(ISD::FCOS , MVT::f32 , Expand); setOperationAction(ISD::FCOS , MVT::f64 , Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 @@ -660,7 +664,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Long double always uses X87. if (!TM.Options.UseSoftFloat) { - addRegisterClass(MVT::f80, X86::RFP80RegisterClass); + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { @@ -705,8 +709,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + for (int VT = MVT::FIRST_VECTOR_VALUETYPE; + VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) { setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); @@ -729,6 +733,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FMA, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); @@ -764,8 +769,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction((MVT::SimpleValueType)VT, (MVT::SimpleValueType)InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); @@ -776,7 +781,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { - addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); + addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. 
} @@ -813,7 +818,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { - addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); + addRegisterClass(MVT::v4f32, &X86::VR128RegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); @@ -826,18 +831,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); - setOperationAction(ISD::SETCC, MVT::v4f32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { - addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); + addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM // registers cannot be used even for integer operations. - addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); - addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); - addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); - addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); + addRegisterClass(MVT::v16i8, &X86::VR128RegClass); + addRegisterClass(MVT::v8i16, &X86::VR128RegClass); + addRegisterClass(MVT::v4i32, &X86::VR128RegClass); + addRegisterClass(MVT::v2i64, &X86::VR128RegClass); setOperationAction(ISD::ADD, MVT::v16i8, Legal); setOperationAction(ISD::ADD, MVT::v8i16, Legal); @@ -867,27 +871,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - // Custom lower build_vector, vector_shuffle, and extract_vector_elt. - for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { - EVT VT = (MVT::SimpleValueType)i; + for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { + MVT VT = (MVT::SimpleValueType)i; // Do not attempt to custom lower non-power-of-2 vectors if (!isPowerOf2_32(VT.getVectorNumElements())) continue; // Do not attempt to custom lower non-128-bit vectors if (!VT.is128BitVector()) continue; - setOperationAction(ISD::BUILD_VECTOR, - VT.getSimpleVT().SimpleTy, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, - VT.getSimpleVT().SimpleTy, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, - VT.getSimpleVT().SimpleTy, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); @@ -903,24 +898,23 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
- for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { - MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; - EVT VT = SVT; + for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { + MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-128-bit vectors if (!VT.is128BitVector()) continue; - setOperationAction(ISD::AND, SVT, Promote); - AddPromotedToType (ISD::AND, SVT, MVT::v2i64); - setOperationAction(ISD::OR, SVT, Promote); - AddPromotedToType (ISD::OR, SVT, MVT::v2i64); - setOperationAction(ISD::XOR, SVT, Promote); - AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); - setOperationAction(ISD::LOAD, SVT, Promote); - AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); - setOperationAction(ISD::SELECT, SVT, Promote); - AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v2i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v2i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v2i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); } setTruncStoreAction(MVT::f64, MVT::f32, Expand); @@ -1007,16 +1001,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } - if (Subtarget->hasSSE42()) - setOperationAction(ISD::SETCC, MVT::v2i64, Custom); - if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) { - addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); - addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); - addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); - addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); - addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); - addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); + addRegisterClass(MVT::v32i8, &X86::VR256RegClass); + addRegisterClass(MVT::v16i16, &X86::VR256RegClass); + addRegisterClass(MVT::v8i32, &X86::VR256RegClass); + addRegisterClass(MVT::v8f32, &X86::VR256RegClass); + addRegisterClass(MVT::v4i64, &X86::VR256RegClass); + addRegisterClass(MVT::v4f64, &X86::VR256RegClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v4f64, Legal); @@ -1040,13 +1031,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); - setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); @@ -1070,6 +1054,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + if (Subtarget->hasFMA()) { + setOperationAction(ISD::FMA, MVT::v8f32, Custom); + setOperationAction(ISD::FMA, MVT::v4f64, Custom); + setOperationAction(ISD::FMA, MVT::v4f32, Custom); + setOperationAction(ISD::FMA, MVT::v2f64, Custom); + setOperationAction(ISD::FMA, MVT::f32, Custom); + setOperationAction(ISD::FMA, MVT::f64, Custom); + } + if (Subtarget->hasAVX2()) { 
setOperationAction(ISD::ADD, MVT::v4i64, Legal); setOperationAction(ISD::ADD, MVT::v8i32, Legal); @@ -1121,60 +1114,60 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } // Custom lower several nodes for 256-bit types. - for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; - EVT VT = SVT; + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. if (VT.is128BitVector()) - setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; - setOperationAction(ISD::BUILD_VECTOR, SVT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. - for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { - MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; - EVT VT = SVT; + for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { + MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-256-bit vectors if (!VT.is256BitVector()) continue; - setOperationAction(ISD::AND, SVT, Promote); - AddPromotedToType (ISD::AND, SVT, MVT::v4i64); - setOperationAction(ISD::OR, SVT, Promote); - AddPromotedToType (ISD::OR, SVT, MVT::v4i64); - setOperationAction(ISD::XOR, SVT, Promote); - AddPromotedToType (ISD::XOR, SVT, MVT::v4i64); - setOperationAction(ISD::LOAD, SVT, Promote); - AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64); - setOperationAction(ISD::SELECT, SVT, Promote); - AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64); + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v4i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v4i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v4i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); } } // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { + for (int VT = MVT::FIRST_VECTOR_VALUETYPE; + VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); } // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't @@ -1218,17 +1211,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::FP_TO_SINT); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); - if (Subtarget->hasBMI()) - setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::XOR); computeRegisterProperties(); @@ -1243,6 +1240,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setPrefLoopAlignment(4); // 2^4 bytes. benefitFromCodePlacementOpt = true; + // Predictable cmov don't hurt on atom because it's in-order. + predictableSelectIsExpensive = !Subtarget->isAtom(); + setPrefFunctionAlignment(4); // 2^4 bytes. } @@ -1276,7 +1276,6 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { break; } } - return; } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate @@ -1411,18 +1410,19 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{ default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: - RRC = (Subtarget->is64Bit() - ? X86::GR64RegisterClass : X86::GR32RegisterClass); + RRC = Subtarget->is64Bit() ? + (const TargetRegisterClass*)&X86::GR64RegClass : + (const TargetRegisterClass*)&X86::GR32RegClass; break; case MVT::x86mmx: - RRC = X86::VR64RegisterClass; + RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: - RRC = X86::VR128RegisterClass; + RRC = &X86::VR128RegClass; break; } return std::make_pair(RRC, Cost); @@ -1457,7 +1457,7 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, bool X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, - MachineFunction &MF, bool isVarArg, + MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; @@ -1501,6 +1501,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); + // Promote values to the appropriate types + if (VA.getLocInfo() == CCValAssign::SExt) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::AExt) + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); + // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. 
if ((ValVT == MVT::f32 || ValVT == MVT::f64 || @@ -1638,7 +1648,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -1655,7 +1665,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, SDValue Val; // If this is a call to a function that returns an fp value on the floating - // point stack, we must guarantee the the value is popped from the stack, so + // point stack, we must guarantee the value is popped from the stack, so // a CopyFromReg is not good enough - the copy instruction may be eliminated // if the return value is not used. We use the FpPOP_RETVAL instruction // instead. @@ -1699,21 +1709,37 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. -static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { +enum StructReturnType { + NotStructReturn, + RegStructReturn, + StackStructReturn +}; +static StructReturnType +callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { if (Outs.empty()) - return false; + return NotStructReturn; - return Outs[0].Flags.isSRet(); + const ISD::ArgFlagsTy &Flags = Outs[0].Flags; + if (!Flags.isSRet()) + return NotStructReturn; + if (Flags.isInReg()) + return RegStructReturn; + return StackStructReturn; } /// ArgsAreStructReturn - Determines whether a function uses struct /// return semantics. -static bool -ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { +static StructReturnType +argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { if (Ins.empty()) - return false; + return NotStructReturn; - return Ins[0].Flags.isSRet(); + const ISD::ArgFlagsTy &Flags = Ins[0].Flags; + if (!Flags.isSRet()) + return NotStructReturn; + if (Flags.isInReg()) + return RegStructReturn; + return StackStructReturn; } /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified @@ -1850,19 +1876,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, EVT RegVT = VA.getLocVT(); const TargetRegisterClass *RC; if (RegVT == MVT::i32) - RC = X86::GR32RegisterClass; + RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) - RC = X86::GR64RegisterClass; + RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) - RC = X86::FR32RegisterClass; + RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) - RC = X86::FR64RegisterClass; - else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) - RC = X86::VR256RegisterClass; - else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) - RC = X86::VR128RegisterClass; + RC = &X86::FR64RegClass; + else if (RegVT.is256BitVector()) + RC = &X86::VR256RegClass; + else if (RegVT.is128BitVector()) + RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) - RC = X86::VR64RegisterClass; + RC = &X86::VR64RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2004,7 +2030,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, DAG.getIntPtrConstant(Offset)); unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], - X86::GR64RegisterClass); + &X86::GR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, 
MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, @@ -2020,7 +2046,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<SDValue, 11> SaveXMMOps; SaveXMMOps.push_back(Chain); - unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); + unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); SaveXMMOps.push_back(ALVal); @@ -2031,7 +2057,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], - X86::VR128RegisterClass); + &X86::VR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); SaveXMMOps.push_back(Val); } @@ -2054,7 +2080,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && - ArgsAreStructReturn(Ins)) + argsAreStructReturn(Ins) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } @@ -2127,19 +2153,24 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, } SDValue -X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, - CallingConv::ID CallConv, bool isVarArg, - bool doesNotRet, bool &isTailCall, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - DebugLoc dl, SelectionDAG &DAG, +X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool &isTailCall = CLI.IsTailCall; + bool isVarArg = CLI.IsVarArg; + MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isTargetWin64(); bool IsWindows = Subtarget->isTargetWindows(); - bool IsStructRet = CallIsStructReturn(Outs); + StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; if (MF.getTarget().Options.DisableTailCalls) @@ -2148,8 +2179,9 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, - isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), - Outs, OutVals, Ins, DAG); + isVarArg, SR != NotStructReturn, + MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. @@ -2231,7 +2263,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: - if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { + if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. 
Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); @@ -2282,27 +2314,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); - // Build a sequence of copy-to-reg nodes chained together with token chain - // and flag operands which copy the outgoing args into registers. - SDValue InFlag; - // Tail call byval lowering might overwrite argument registers so in case of - // tail call optimization the copies to registers are lowered later. - if (!isTailCall) - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { - Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), getPointerTy()), - InFlag); - InFlag = Chain.getValue(1); + RegsToPass.push_back(std::make_pair(unsigned(X86::EBX), + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of @@ -2340,12 +2357,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); - Chain = DAG.getCopyToReg(Chain, dl, X86::AL, - DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); - InFlag = Chain.getValue(1); + RegsToPass.push_back(std::make_pair(unsigned(X86::AL), + DAG.getConstant(NumXMMRegs, MVT::i8))); } - // For tail calls lower the arguments to the 'real' stack slot. if (isTailCall) { // Force all the incoming stack arguments to be loaded from the stack @@ -2359,8 +2374,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - // Do not flag preceding copytoreg stuff together with the following stuff. - InFlag = SDValue(); if (getTargetMachine().Options.GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -2400,19 +2413,20 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains2[0], MemOpChains2.size()); - // Copy arguments to their registers. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - InFlag =SDValue(); - // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, FPDiff, dl); } + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into registers. 
+ SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + if (getTargetMachine().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls @@ -2514,14 +2528,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); - // Add an implicit use GOT pointer in EBX. - if (!isTailCall && Subtarget->isPICStyleGOT()) - Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); - - // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. - if (Is64Bit && isVarArg && !IsWin64) - Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); - // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); @@ -2551,7 +2557,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, getTargetMachine().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows && - IsStructRet) + SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. @@ -2743,7 +2749,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -2764,7 +2770,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (Unused) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -2778,12 +2784,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); + getTargetMachine(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); + getTargetMachine(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -2810,7 +2816,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // argument is passed on the stack. 
SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) { @@ -2872,8 +2878,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } FastISel * -X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { - return X86::createFastISel(funcInfo); +X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return X86::createFastISel(funcInfo, libInfo); } @@ -2911,6 +2918,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::UNPCKH: case X86ISD::VPERMILP: case X86ISD::VPERM2X128: + case X86ISD::VPERMI: return true; } } @@ -3051,10 +3059,12 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_NS; - } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { + } + if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; - } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + } + if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_LE; @@ -3170,12 +3180,12 @@ static bool isUndefOrEqual(int Val, int CmpVal) { return false; } -/// isSequentialOrUndefInRange - Return true if every element in Mask, begining +/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (L, L+Pos]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, - int Pos, int Size, int Low) { - for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) + unsigned Pos, unsigned Size, int Low) { + for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; @@ -3194,8 +3204,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) { /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. -static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { - if (VT != MVT::v8i16) +static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) return false; // Lower quadword copied in order or undef. @@ -3204,16 +3214,27 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) { // Upper quadword shuffled. for (unsigned i = 4; i != 8; ++i) - if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) + if (!isUndefOrInRange(Mask[i], 4, 8)) return false; + if (VT == MVT::v16i16) { + // Lower quadword copied in order or undef. + if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) + return false; + + // Upper quadword shuffled. + for (unsigned i = 12; i != 16; ++i) + if (!isUndefOrInRange(Mask[i], 12, 16)) + return false; + } + return true; } /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. -static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { - if (VT != MVT::v8i16) +static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16)) return false; // Upper quadword copied in order. 
@@ -3222,9 +3243,20 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) { // Lower quadword shuffled. for (unsigned i = 0; i != 4; ++i) - if (Mask[i] >= 4) + if (!isUndefOrInRange(Mask[i], 0, 4)) + return false; + + if (VT == MVT::v16i16) { + // Upper quadword copied in order. + if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) return false; + // Lower quadword shuffled. + for (unsigned i = 8; i != 12; ++i) + if (!isUndefOrInRange(Mask[i], 8, 12)) + return false; + } + return true; } @@ -3374,11 +3406,11 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX, /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVHLPS. static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); - - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems != 4) return false; @@ -3393,11 +3425,11 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) { /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, /// <2, 3, 2, 3> static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); - - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems != 4) return false; @@ -3410,7 +3442,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) { /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); @@ -3418,11 +3450,11 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { if (NumElems != 2 && NumElems != 4) return false; - for (unsigned i = 0; i != NumElems/2; ++i) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; - for (unsigned i = NumElems/2; i != NumElems; ++i) + for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; @@ -3432,23 +3464,71 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { + if (!VT.is128BitVector()) + return false; + unsigned NumElems = VT.getVectorNumElements(); - if ((NumElems != 2 && NumElems != 4) - || VT.getSizeInBits() > 128) + if (NumElems != 2 && NumElems != 4) return false; - for (unsigned i = 0; i != NumElems/2; ++i) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = 0; i != NumElems/2; ++i) - if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems)) + for (unsigned i = 0, e = NumElems/2; i != e; ++i) + if (!isUndefOrEqual(Mask[i + e], i + NumElems)) return false; return true; } +// +// Some special combinations that can be optimized. +// +static +SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG) { + EVT VT = SVOp->getValueType(0); + DebugLoc dl = SVOp->getDebugLoc(); + + if (VT != MVT::v8i32 && VT != MVT::v8f32) + return SDValue(); + + ArrayRef<int> Mask = SVOp->getMask(); + + // These are the special masks that may be optimized. 
+ static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; + static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; + bool MatchEvenMask = true; + bool MatchOddMask = true; + for (int i=0; i<8; ++i) { + if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) + MatchEvenMask = false; + if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) + MatchOddMask = false; + } + static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1}; + static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1}; + + const int *CompactionMask; + if (MatchEvenMask) + CompactionMask = CompactionMaskEven; + else if (MatchOddMask) + CompactionMask = CompactionMaskOdd; + else + return SDValue(); + + SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); + + SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0), + UndefNode, CompactionMask); + SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1), + UndefNode, CompactionMask); + static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13}; + return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask); +} + /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, @@ -3606,7 +3686,7 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { if (VT.getVectorElementType().getSizeInBits() < 32) return false; - if (VT.getSizeInBits() == 256) + if (!VT.is128BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); @@ -3628,7 +3708,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { - if (!HasAVX || VT.getSizeInBits() != 256) + if (!HasAVX || !VT.is256BitVector()) return false; // The shuffle result is divided into half A and half B. In total the two @@ -3720,9 +3800,10 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { /// element of vector 2 and the other elements to come from vector 1 in order. static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT, bool V2IsSplat = false, bool V2IsUndef = false) { - unsigned NumOps = VT.getVectorNumElements(); - if (VT.getSizeInBits() == 256) + if (!VT.is128BitVector()) return false; + + unsigned NumOps = VT.getVectorNumElements(); if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) return false; @@ -3788,9 +3869,11 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { - unsigned NumElts = VT.getVectorNumElements(); + if (!HasAVX || !VT.is256BitVector()) + return false; - if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts != 4) return false; for (unsigned i = 0; i != NumElts/2; ++i) @@ -3806,7 +3889,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. 
static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; unsigned e = VT.getVectorNumElements() / 2; @@ -3880,9 +3963,8 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { for (unsigned i = 0; i != NumElts; ++i) { int Elt = N->getMaskElt(i); if (Elt < 0) continue; - Elt %= NumLaneElts; - unsigned ShAmt = i << Shift; - if (ShAmt >= 8) ShAmt -= 8; + Elt &= NumLaneElts - 1; + unsigned ShAmt = (i << Shift) % 8; Mask |= Elt << ShAmt; } @@ -3892,30 +3974,48 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT == MVT::v8i16 || VT == MVT::v16i16) && + "Unsupported vector type for PSHUFHW"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned Mask = 0; - // 8 nodes, but we only care about the last 4. - for (unsigned i = 7; i >= 4; --i) { - int Val = N->getMaskElt(i); - if (Val >= 0) - Mask |= (Val - 4); - if (i != 4) - Mask <<= 2; + for (unsigned l = 0; l != NumElts; l += 8) { + // 8 nodes per lane, but we only care about the last 4. + for (unsigned i = 0; i < 4; ++i) { + int Elt = N->getMaskElt(l+i+4); + if (Elt < 0) continue; + Elt &= 0x3; // only 2-bits. + Mask |= Elt << (i * 2); + } } + return Mask; } /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT == MVT::v8i16 || VT == MVT::v16i16) && + "Unsupported vector type for PSHUFHW"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned Mask = 0; - // 8 nodes, but we only care about the first 4. - for (int i = 3; i >= 0; --i) { - int Val = N->getMaskElt(i); - if (Val >= 0) - Mask |= Val; - if (i != 0) - Mask <<= 2; + for (unsigned l = 0; l != NumElts; l += 8) { + // 8 nodes per lane, but we only care about the first 4. + for (unsigned i = 0; i < 4; ++i) { + int Elt = N->getMaskElt(l+i); + if (Elt < 0) continue; + Elt &= 0x3; // only 2-bits + Mask |= Elt << (i * 2); + } } + return Mask; } @@ -4016,13 +4116,14 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SmallVector<int, 8> MaskVec; for (unsigned i = 0; i != NumElems; ++i) { - int idx = SVOp->getMaskElt(i); - if (idx < 0) - MaskVec.push_back(idx); - else if (idx < (int)NumElems) - MaskVec.push_back(idx + NumElems); - else - MaskVec.push_back(idx - NumElems); + int Idx = SVOp->getMaskElt(i); + if (Idx >= 0) { + if (Idx < (int)NumElems) + Idx += NumElems; + else + Idx -= NumElems; + } + MaskVec.push_back(Idx); } return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), SVOp->getOperand(0), &MaskVec[0]); @@ -4033,7 +4134,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) { - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; if (VT.getVectorNumElements() != 4) return false; @@ -4090,7 +4191,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) { /// MOVLP, it must be either a vector load or a scalar load to vector. 
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, ArrayRef<int> Mask, EVT VT) { - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return false; if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) @@ -4107,7 +4208,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = NumElems/2; i != NumElems; ++i) + for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i+NumElems)) return false; return true; @@ -4159,11 +4260,12 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) { static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); + unsigned Size = VT.getSizeInBits(); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. SDValue Vec; - if (VT.getSizeInBits() == 128) { // SSE + if (Size == 128) { // SSE if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); @@ -4171,7 +4273,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } - } else if (VT.getSizeInBits() == 256) { // AVX + } else if (Size == 256) { // AVX if (Subtarget->hasAVX2()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; @@ -4183,7 +4285,9 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); } - } + } else + llvm_unreachable("Unexpected vector type"); + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -4194,25 +4298,22 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); - assert((VT.is128BitVector() || VT.is256BitVector()) - && "Expected a 128-bit or 256-bit vector type"); + unsigned Size = VT.getSizeInBits(); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; - if (VT.getSizeInBits() == 256) { + if (Size == 256) { if (HasAVX2) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), - Vec, DAG.getConstant(0, MVT::i32), DAG, dl); - Vec = Insert128BitVector(InsV, Vec, - DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); } - } else { + } else if (Size == 128) { Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - } + } else + llvm_unreachable("Unexpected vector type"); return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -4255,9 +4356,8 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); - unsigned Half = NumElems/2; SmallVector<int, 8> Mask; - for (unsigned i = 0; i != Half; ++i) { + for (unsigned i = 
0, Half = NumElems/2; i != Half; ++i) { Mask.push_back(i + Half); Mask.push_back(i + NumElems + Half); } @@ -4289,15 +4389,14 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { EVT VT = V.getValueType(); DebugLoc dl = V.getDebugLoc(); - assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) - && "Vector size not supported"); + unsigned Size = VT.getSizeInBits(); - if (VT.getSizeInBits() == 128) { + if (Size == 128) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), &SplatMask[0]); - } else { + } else if (Size == 256) { // To use VPERMILPS to splat scalars, the second half of indicies must // refer to the higher part, which is a duplication of the lower one, // because VPERMILPS can only handle in-lane permutations. @@ -4307,7 +4406,8 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), &SplatMask[0]); - } + } else + llvm_unreachable("Vector size not supported"); return DAG.getNode(ISD::BITCAST, dl, VT, V); } @@ -4328,9 +4428,8 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // Extract the 128-bit part containing the splat element and update // the splat element index when it refers to the higher register. if (Size == 256) { - unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0; - V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); - if (Idx > 0) + V1 = Extract128BitVector(V1, EltNo, DAG, dl); + if (EltNo >= NumElems/2) EltNo -= NumElems/2; } @@ -4346,10 +4445,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // into the low and high part. This is necessary because we want // to use VPERM* to shuffle the vectors if (Size == 256) { - SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - V1 = Insert128BitVector(InsV, V1, - DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); + V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); } return getLegalSplat(DAG, V1, EltNo); @@ -4377,7 +4473,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the /// target specific opcode. Returns true if the Mask could be calculated. /// Sets IsUnary to true if only uses one source. 
-static bool getTargetShuffleMask(SDNode *N, EVT VT, +static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; @@ -4408,12 +4504,17 @@ static bool getTargetShuffleMask(SDNode *N, EVT VT, break; case X86ISD::PSHUFHW: ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::VPERMI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: @@ -4473,20 +4574,21 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { - unsigned NumElems = VT.getVectorNumElements(); + MVT ShufVT = V.getValueType().getSimpleVT(); + unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; SDValue ImmN; bool IsUnary; - if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt < 0) - return DAG.getUNDEF(VT.getVectorElementType()); + return DAG.getUNDEF(ShufVT.getVectorElementType()); SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) - : N->getOperand(1); + : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -4631,7 +4733,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { // Although the logic below support any bitwidth size, there are no // shift instructions which handle more than 128-bit vectors. - if (SVOp->getValueType(0).getSizeInBits() > 128) + if (!SVOp->getValueType(0).is128BitVector()) return false; if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || @@ -4726,7 +4828,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, DebugLoc dl) { - assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); + assert(VT.is128BitVector() && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? 
X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); @@ -4794,7 +4896,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, @@ -4802,7 +4904,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, false, false, false, 0); SmallVector<int, 8> Mask; - for (int i = 0; i < NumElems; ++i) + for (unsigned i = 0; i != NumElems; ++i) Mask.push_back(EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); @@ -4866,8 +4968,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment()); - } else if (NumElems == 4 && LastLoadedElt == 1 && - DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { + } + if (NumElems == 4 && LastLoadedElt == 1 && + DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = @@ -4896,6 +4999,9 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for broadcast."); + SDValue Ld; bool ConstSplatVal; @@ -4930,8 +5036,17 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { return SDValue(); SDValue Sc = Op.getOperand(0); - if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) - return SDValue(); + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && + Sc.getOpcode() != ISD::BUILD_VECTOR) { + + if (!Subtarget->hasAVX2()) + return SDValue(); + + // Use the register form of the broadcast instruction available on AVX2. + if (VT.is256BitVector()) + Sc = Extract128BitVector(Sc, 0, DAG, dl); + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); + } Ld = Sc.getOperand(0); ConstSplatVal = (Ld.getOpcode() == ISD::Constant || @@ -4946,8 +5061,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { } } - bool Is256 = VT.getSizeInBits() == 256; - bool Is128 = VT.getSizeInBits() == 128; + bool Is256 = VT.is256BitVector(); // Handle the broadcasting a single constant scalar from the constant pool // into a vector. 
On Sandybridge it is still better to load a constant vector @@ -4957,9 +5071,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { assert(!CVT.isVector() && "Must not broadcast a vector type"); unsigned ScalarSize = CVT.getSizeInBits(); - if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || - (Is128 && (ScalarSize == 32))) { - + if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) { const Constant *C = 0; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -4971,40 +5083,32 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { SDValue CP = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } - // The scalar source must be a normal load. - if (!ISD::isNormalLoad(Ld.getNode())) - return SDValue(); - - // Reject loads that have uses of the chain result - if (Ld->hasAnyUseOfValue(1)) - return SDValue(); - + bool IsLoad = ISD::isNormalLoad(Ld.getNode()); unsigned ScalarSize = Ld.getValueType().getSizeInBits(); - // VBroadcast to YMM - if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) + // Handle AVX2 in-register broadcasts. + if (!IsLoad && Subtarget->hasAVX2() && + (ScalarSize == 32 || (Is256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); - // VBroadcast to XMM - if (Is128 && (ScalarSize == 32)) + // The scalar source must be a normal load. + if (!IsLoad) + return SDValue(); + + if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match - // double since there is vbroadcastsd xmm + // double since there is no vbroadcastsd xmm if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { - // VBroadcast to YMM - if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); - - // VBroadcast to XMM - if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) + if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5012,6 +5116,82 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { return SDValue(); } +// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64 +// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the +// constraint of matching input/output vector elements. +SDValue +X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + SDNode *N = Op.getNode(); + EVT VT = Op.getValueType(); + unsigned NumElts = Op.getNumOperands(); + + // Check supported types and sub-targets. + // + // Only v2f32 -> v2f64 needs special handling. + if (VT != MVT::v2f64 || !Subtarget->hasSSE2()) + return SDValue(); + + SDValue VecIn; + EVT VecInVT; + SmallVector<int, 8> Mask; + EVT SrcVT = MVT::Other; + + // Check the patterns could be translated into X86vfpext. + for (unsigned i = 0; i < NumElts; ++i) { + SDValue In = N->getOperand(i); + unsigned Opcode = In.getOpcode(); + + // Skip if the element is undefined. 
+ if (Opcode == ISD::UNDEF) { + Mask.push_back(-1); + continue; + } + + // Quit if one of the elements is not defined from 'fpext'. + if (Opcode != ISD::FP_EXTEND) + return SDValue(); + + // Check how the source of 'fpext' is defined. + SDValue L2In = In.getOperand(0); + EVT L2InVT = L2In.getValueType(); + + // Check the original type + if (SrcVT == MVT::Other) + SrcVT = L2InVT; + else if (SrcVT != L2InVT) // Quit if non-homogenous typed. + return SDValue(); + + // Check whether the value being 'fpext'ed is extracted from the same + // source. + Opcode = L2In.getOpcode(); + + // Quit if it's not extracted with a constant index. + if (Opcode != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(L2In.getOperand(1))) + return SDValue(); + + SDValue ExtractedFromVec = L2In.getOperand(0); + + if (VecIn.getNode() == 0) { + VecIn = ExtractedFromVec; + VecInVT = ExtractedFromVec.getValueType(); + } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec. + return SDValue(); + + Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue()); + } + + // Fill the remaining mask as undef. + for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i) + Mask.push_back(-1); + + return DAG.getNode(X86ISD::VFPEXT, DL, VT, + DAG.getVectorShuffle(VecInVT, DL, + VecIn, DAG.getUNDEF(VecInVT), + &Mask[0])); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -5044,6 +5224,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (Broadcast.getNode()) return Broadcast; + SDValue FpExt = LowerVectorFpExtend(Op, DAG); + if (FpExt.getNode()) + return FpExt; + unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -5102,8 +5286,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Mask.push_back(Idx); for (unsigned i = 1; i != VecElts; ++i) Mask.push_back(i); - Item = DAG.getVectorShuffle(VecVT, dl, Item, - DAG.getUNDEF(Item.getValueType()), + Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT), &Mask[0]); } return DAG.getNode(ISD::BITCAST, dl, VT, Item); @@ -5120,12 +5303,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item, DAG.getIntPtrConstant(0)); } - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); @@ -5134,12 +5317,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); - Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32), - DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); } else { - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getNode(ISD::BITCAST, dl, VT, Item); @@ -5171,7 +5353,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i < NumElems; i++) + for (unsigned i = 0; i != NumElems; ++i) MaskVec.push_back(i == Idx ? 0 : 1); return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); } @@ -5199,7 +5381,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { SmallVector<SDValue, 32> V; for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); @@ -5212,10 +5394,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { NumElems/2); // Recreate the wider vector with the lower and upper part. - SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. @@ -5283,7 +5462,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } - if (Values.size() > 1 && VT.getSizeInBits() == 128) { + if (Values.size() > 1 && VT.is128BitVector()) { // Check for a build vector of consecutive loads. for (unsigned i = 0; i < NumElems; ++i) V[i] = Op.getOperand(i); @@ -5344,62 +5523,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place -// them in a MMX register. This is better than doing a stack convert. 
-static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { - DebugLoc dl = Op.getDebugLoc(); - EVT ResVT = Op.getValueType(); - - assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || - ResVT == MVT::v8i16 || ResVT == MVT::v16i8); - int Mask[2]; - SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); - SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); - InVec = Op.getOperand(1); - if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { - unsigned NumElts = ResVT.getVectorNumElements(); - VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); - VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, - InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); - } else { - InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); - SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); - Mask[0] = 0; Mask[1] = 2; - VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); - } - return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); -} - // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); EVT ResVT = Op.getValueType(); - assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); + assert(ResVT.is256BitVector() && "Value type must be 256-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } SDValue X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { - EVT ResVT = Op.getValueType(); - assert(Op.getNumOperands() == 2); - assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && - "Unsupported CONCAT_VECTORS for value type"); - - // We support concatenate two MMX registers and place them in a MMX register. - // This is better than doing a stack convert. - if (ResVT.is128BitVector()) - return LowerMMXCONCAT_VECTORS(Op, DAG); // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. @@ -5407,75 +5548,64 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { } // Try to lower a shuffle node into a simple blend instruction. 
-static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op, +static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = Op.getValueType(); - EVT InVT = V1.getValueType(); - int MaskSize = VT.getVectorNumElements(); - int InSize = InVT.getVectorNumElements(); + MVT VT = SVOp->getValueType(0).getSimpleVT(); + unsigned NumElems = VT.getVectorNumElements(); if (!Subtarget->hasSSE41()) return SDValue(); - if (MaskSize != InSize) - return SDValue(); - - int ISDNo = 0; + unsigned ISDNo = 0; MVT OpTy; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: return SDValue(); case MVT::v8i16: - ISDNo = X86ISD::BLENDPW; - OpTy = MVT::v8i16; - break; + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v8i16; + break; case MVT::v4i32: case MVT::v4f32: - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v4f32; - break; + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v4f32; + break; case MVT::v2i64: case MVT::v2f64: - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v2f64; - break; + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v2f64; + break; case MVT::v8i32: case MVT::v8f32: - if (!Subtarget->hasAVX()) - return SDValue(); - ISDNo = X86ISD::BLENDPS; - OpTy = MVT::v8f32; - break; + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v8f32; + break; case MVT::v4i64: case MVT::v4f64: - if (!Subtarget->hasAVX()) - return SDValue(); - ISDNo = X86ISD::BLENDPD; - OpTy = MVT::v4f64; - break; - case MVT::v16i16: - if (!Subtarget->hasAVX2()) - return SDValue(); - ISDNo = X86ISD::BLENDPW; - OpTy = MVT::v16i16; - break; + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v4f64; + break; } assert(ISDNo && "Invalid Op Number"); unsigned MaskVals = 0; - for (int i = 0; i < MaskSize; ++i) { + for (unsigned i = 0; i != NumElems; ++i) { int EltIdx = SVOp->getMaskElt(i); - if (EltIdx == i || EltIdx == -1) + if (EltIdx == (int)i || EltIdx < 0) MaskVals |= (1<<i); - else if (EltIdx == (i + MaskSize)) + else if (EltIdx == (int)(i + NumElems)) continue; // Bit is set to zero; - else return SDValue(); + else + return SDValue(); } V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); @@ -5629,13 +5759,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, bool TwoInputs = V1Used && V2Used; for (unsigned i = 0; i != 8; ++i) { int EltIdx = MaskVals[i] * 2; - if (TwoInputs && (EltIdx >= 16)) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); + int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; + int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 
0x80 : EltIdx+1; + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); } V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, @@ -5649,13 +5776,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, pshufbMask.clear(); for (unsigned i = 0; i != 8; ++i) { int EltIdx = MaskVals[i] * 2; - if (EltIdx < 16) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); + int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16; + int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15; + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); } V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, @@ -5731,10 +5855,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, int EltIdx = MaskVals[i]; if (EltIdx < 0) continue; - SDValue ExtOp = (EltIdx < 8) - ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, - DAG.getIntPtrConstant(EltIdx)) - : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, + SDValue ExtOp = (EltIdx < 8) ? + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, + DAG.getIntPtrConstant(EltIdx)) : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, DAG.getIntPtrConstant(EltIdx - 8)); NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, DAG.getIntPtrConstant(i)); @@ -5755,21 +5879,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef<int> MaskVals = SVOp->getMask(); + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. - // FIXME: kill V2Only once shuffles are canonizalized by getNode. - bool V1Only = true; - bool V2Only = true; - for (unsigned i = 0; i < 16; ++i) { - int EltIdx = MaskVals[i]; - if (EltIdx < 0) - continue; - if (EltIdx < 16) - V2Only = false; - else - V1Only = false; - } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. if (TLI.getSubtarget()->hasSSSE3()) { @@ -5781,23 +5895,16 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // Otherwise, we have elements from both input vectors, and must zero out // elements that come from V2 in the first mask, and V1 in the second mask // so that we can OR them together. - bool TwoInputs = !(V1Only || V2Only); for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; - if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } + if (EltIdx < 0 || EltIdx >= 16) + EltIdx = 0x80; pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } - // If all the elements are from V2, assign it to V1 and return after - // building the first pshufb. 
- if (V2Only) - V1 = V2; V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); - if (!TwoInputs) + if (V2IsUndef) return V1; // Calculate the shuffle mask for the second input, shuffle it, and @@ -5805,11 +5912,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, pshufbMask.clear(); for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; - if (EltIdx < 16) { - pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); - continue; - } - pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); + EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, @@ -5822,7 +5926,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // the 16 different words that comprise the two doublequadword input vectors. V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); - SDValue NewV = V2Only ? V2 : V1; + SDValue NewV = V1; for (int i = 0; i != 8; ++i) { int Elt0 = MaskVals[i*2]; int Elt1 = MaskVals[i*2+1]; @@ -5832,9 +5936,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, continue; // This word of the result is already in the correct place, skip it. - if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) - continue; - if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) + if ((Elt0 == i*2) && (Elt1 == i*2+1)) continue; SDValue Elt0Src = Elt0 < 16 ? V1 : V2; @@ -5896,41 +5998,37 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, DebugLoc dl) { - EVT VT = SVOp->getValueType(0); - SDValue V1 = SVOp->getOperand(0); - SDValue V2 = SVOp->getOperand(1); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned NumElems = VT.getVectorNumElements(); - unsigned NewWidth = (NumElems == 4) ? 
2 : 4; - EVT NewVT; - switch (VT.getSimpleVT().SimpleTy) { + MVT NewVT; + unsigned Scale; + switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected!"); - case MVT::v4f32: NewVT = MVT::v2f64; break; - case MVT::v4i32: NewVT = MVT::v2i64; break; - case MVT::v8i16: NewVT = MVT::v4i32; break; - case MVT::v16i8: NewVT = MVT::v4i32; break; + case MVT::v4f32: NewVT = MVT::v2f64; Scale = 2; break; + case MVT::v4i32: NewVT = MVT::v2i64; Scale = 2; break; + case MVT::v8i16: NewVT = MVT::v4i32; Scale = 2; break; + case MVT::v16i8: NewVT = MVT::v4i32; Scale = 4; break; + case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break; + case MVT::v32i8: NewVT = MVT::v8i32; Scale = 4; break; } - int Scale = NumElems / NewWidth; SmallVector<int, 8> MaskVec; - for (unsigned i = 0; i < NumElems; i += Scale) { + for (unsigned i = 0; i != NumElems; i += Scale) { int StartIdx = -1; - for (int j = 0; j < Scale; ++j) { + for (unsigned j = 0; j != Scale; ++j) { int EltIdx = SVOp->getMaskElt(i+j); if (EltIdx < 0) continue; - if (StartIdx == -1) - StartIdx = EltIdx - (EltIdx % Scale); - if (EltIdx != StartIdx + j) + if (StartIdx < 0) + StartIdx = (EltIdx / Scale); + if (EltIdx != (int)(StartIdx*Scale + j)) return SDValue(); } - if (StartIdx == -1) - MaskVec.push_back(-1); - else - MaskVec.push_back(StartIdx / Scale); + MaskVec.push_back(StartIdx); } - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0)); + SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1)); return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); } @@ -5973,6 +6071,11 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, /// which could not be matched by any known target speficic shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { + + SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG); + if (NewOp.getNode()) + return NewOp; + EVT VT = SVOp->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); @@ -5981,14 +6084,15 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { DebugLoc dl = SVOp->getDebugLoc(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); - SDValue Shufs[2]; + SDValue Output[2]; SmallVector<int, 16> Mask; for (unsigned l = 0; l < 2; ++l) { // Build a shuffle mask for the output, discovering on the fly which // input vectors to use as shuffle operands (recorded in InputUsed). // If building a suitable shuffle vector proves too hard, then bail - // out with useBuildVector set. + // out with UseBuildVector set. + bool UseBuildVector = false; int InputUsed[2] = { -1, -1 }; // Not yet discovered. unsigned LaneStart = l * NumLaneElems; for (unsigned i = 0; i != NumLaneElems; ++i) { @@ -6020,38 +6124,61 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } if (OpNo >= array_lengthof(InputUsed)) { - // More than two input vectors used! Give up. - return SDValue(); + // More than two input vectors used! Give up on trying to create a + // shuffle vector. Insert all elements into a BUILD_VECTOR instead. + UseBuildVector = true; + break; } // Add the mask index for the new shuffle vector. Mask.push_back(Idx + OpNo * NumLaneElems); } - if (InputUsed[0] < 0) { + if (UseBuildVector) { + SmallVector<SDValue, 16> SVOps; + for (unsigned i = 0; i != NumLaneElems; ++i) { + // The mask element. This indexes into the input. 
+ int Idx = SVOp->getMaskElt(i+LaneStart); + if (Idx < 0) { + SVOps.push_back(DAG.getUNDEF(EltVT)); + continue; + } + + // The input vector this mask element indexes into. + int Input = Idx / NumElems; + + // Turn the index into an offset from the start of the input vector. + Idx -= Input * NumElems; + + // Extract the vector element by hand. + SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, + SVOp->getOperand(Input), + DAG.getIntPtrConstant(Idx))); + } + + // Construct the output using a BUILD_VECTOR. + Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0], + SVOps.size()); + } else if (InputUsed[0] < 0) { // No input vectors were used! The result is undefined. - Shufs[l] = DAG.getUNDEF(NVT); + Output[l] = DAG.getUNDEF(NVT); } else { SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), - DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32), - DAG, dl); + (InputUsed[0] % 2) * NumLaneElems, + DAG, dl); // If only one input was used, use an undefined vector for the other. SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), - DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32), - DAG, dl); + (InputUsed[1] % 2) * NumLaneElems, DAG, dl); // At least one input vector was used. Create a new shuffle vector. - Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); + Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); } Mask.clear(); } // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0], - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32), - DAG, dl); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]); } /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with @@ -6063,7 +6190,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { DebugLoc dl = SVOp->getDebugLoc(); EVT VT = SVOp->getValueType(0); - assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); + assert(VT.is128BitVector() && "Unsupported vector size"); std::pair<int, int> Locs[4]; int Mask1[] = { -1, -1, -1, -1 }; @@ -6107,7 +6234,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); - } else if (NumLo == 3 || NumHi == 3) { + } + + if (NumLo == 3 || NumHi == 3) { // Otherwise, we must have three elements from one vector, call it X, and // one element from the other, call it Y. First, use a shufps to build an // intermediate vector with the one element from Y and the element from X @@ -6143,17 +6272,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { Mask1[2] = HiIndex & 1 ? 6 : 4; Mask1[3] = HiIndex & 1 ? 4 : 6; return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - } else { - Mask1[0] = HiIndex & 1 ? 2 : 0; - Mask1[1] = HiIndex & 1 ? 0 : 2; - Mask1[2] = PermMask[2]; - Mask1[3] = PermMask[3]; - if (Mask1[2] >= 0) - Mask1[2] += 4; - if (Mask1[3] >= 0) - Mask1[3] += 4; - return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); } + + Mask1[0] = HiIndex & 1 ? 2 : 0; + Mask1[1] = HiIndex & 1 ? 0 : 2; + Mask1[2] = PermMask[2]; + Mask1[3] = PermMask[3]; + if (Mask1[2] >= 0) + Mask1[2] += 4; + if (Mask1[3] >= 0) + Mask1[3] += 4; + return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); } // Break it into (shuffle shuffle_hi, shuffle_lo). 
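The RewriteAsNarrowerShuffle change above generalizes the mask check: each group of Scale narrow indices must either be entirely undef or form a consecutive run that starts at a multiple of Scale, in which case the group collapses to the single wide index StartIdx. Indices that point into the second source operand behave the same way, since the concatenated index space scales uniformly. A minimal standalone sketch of that check, written over plain std::vector<int> instead of the SelectionDAG types (the function name and the small driver are illustrative, not part of the patch):

#include <cassert>
#include <cstdio>
#include <vector>

// Illustrative stand-in for the widening check above: returns true and fills
// WideMask when every group of Scale narrow mask entries can be collapsed into
// one wide entry (-1 means undef).
static bool widenShuffleMask(const std::vector<int> &Mask, unsigned Scale,
                             std::vector<int> &WideMask) {
  assert(Scale && Mask.size() % Scale == 0 && "mask size must be a multiple of Scale");
  WideMask.clear();
  for (unsigned i = 0, e = Mask.size(); i != e; i += Scale) {
    int StartIdx = -1;
    for (unsigned j = 0; j != Scale; ++j) {
      int EltIdx = Mask[i + j];
      if (EltIdx < 0)
        continue;                              // undef element matches anything
      if (StartIdx < 0)
        StartIdx = EltIdx / Scale;             // first defined entry picks the wide index
      if (EltIdx != (int)(StartIdx * Scale + j))
        return false;                          // not a consecutive, Scale-aligned run
    }
    WideMask.push_back(StartIdx);              // stays -1 if the whole group was undef
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  // A v8i16 mask {2,3,6,7,-1,-1,0,1} widens to the v4i32 mask {1,3,-1,0}.
  bool OK = widenShuffleMask({2,3,6,7,-1,-1,0,1}, 2, Wide);
  std::printf("%d: %d %d %d %d\n", OK, Wide[0], Wide[1], Wide[2], Wide[3]);
  // {1,2,...} cannot be widened: the first pair does not start at an even index.
  std::printf("%d\n", (int)widenShuffleMask({1,2,4,5,0,1,2,3}, 2, Wide));
}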
@@ -6302,7 +6431,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) - // If we don't care about the second element, procede to use movss. + // If we don't care about the second element, proceed to use movss. if (SVOp->getMaskElt(1) != -1) return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); } @@ -6360,7 +6489,8 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! - if (VT == MVT::v8i16 || VT == MVT::v16i8) { + if (VT == MVT::v8i16 || VT == MVT::v16i8 || + VT == MVT::v16i16 || VT == MVT::v32i8) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); @@ -6564,11 +6694,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // new vector_shuffle with the corrected mask.p SmallVector<int, 8> NewMask(M.begin(), M.end()); NormalizeMask(NewMask, NumElems); - if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) { + if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) { + if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); - } } if (Commuted) { @@ -6605,12 +6734,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } - if (isPSHUFHWMask(M, VT)) + if (isPSHUFHWMask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, getShufflePSHUFHWImmediate(SVOp), DAG); - if (isPSHUFLWMask(M, VT)) + if (isPSHUFLWMask(M, VT, HasAVX2)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, getShufflePSHUFLWImmediate(SVOp), DAG); @@ -6647,7 +6776,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; @@ -6689,7 +6818,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Handle all 128-bit wide vectors with 4 elements, and match them with // several different shuffle types. - if (NumElems == 4 && VT.getSizeInBits() == 128) + if (NumElems == 4 && VT.is128BitVector()) return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); // Handle general 256-bit shuffles @@ -6705,7 +6834,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); - if (Op.getOperand(0).getValueType().getSizeInBits() != 128) + if (!Op.getOperand(0).getValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { @@ -6714,7 +6843,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT.getSizeInBits() == 16) { + } + + if (VT.getSizeInBits() == 16) { unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
if (Idx == 0) @@ -6729,7 +6860,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT == MVT::f32) { + } + + if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in @@ -6749,7 +6882,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); - } else if (VT == MVT::i32 || VT == MVT::i64) { + } + + if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa<ConstantSDNode>(Op.getOperand(1))) return Op; @@ -6769,22 +6904,22 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. - if (VecVT.getSizeInBits() == 256) { + if (VecVT.is256BitVector()) { DebugLoc dl = Op.getNode()->getDebugLoc(); unsigned NumElems = VecVT.getVectorNumElements(); SDValue Idx = Op.getOperand(1); unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); // Get the 128-bit vector. - bool Upper = IdxVal >= NumElems/2; - Vec = Extract128BitVector(Vec, - DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl); + Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + if (IdxVal >= NumElems/2) + IdxVal -= NumElems/2; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, - Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); + DAG.getConstant(IdxVal, MVT::i32)); } - assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); + assert(VecVT.is128BitVector() && "Unexpected vector length"); if (Subtarget->hasSSE41()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); @@ -6811,7 +6946,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); - } else if (VT.getSizeInBits() == 32) { + } + + if (VT.getSizeInBits() == 32) { unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); if (Idx == 0) return Op; @@ -6823,7 +6960,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0)); - } else if (VT.getSizeInBits() == 64) { + } + + if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. @@ -6856,7 +6995,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); - if (VT.getSizeInBits() == 256) + if (!VT.is128BitVector()) return SDValue(); if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && @@ -6876,7 +7015,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); return DAG.getNode(Opc, dl, VT, N0, N1, N2); - } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { + } + + if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { // Bits [7:6] of the constant are the source select. 
This will always be // zero here. The DAG Combiner may combine an extract_elt index into these // bits. For example (insert (extract, 3), 2) could be matched by putting @@ -6889,8 +7030,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && - isa<ConstantSDNode>(N2)) { + } + + if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { // PINSR* works with constant index. return Op; } @@ -6909,23 +7051,22 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // If this is a 256-bit vector result, first extract the 128-bit vector, // insert the element into the extracted half and then place it back. - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { if (!isa<ConstantSDNode>(N2)) return SDValue(); // Get the desired 128-bit vector half. unsigned NumElems = VT.getVectorNumElements(); unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); - bool Upper = IdxVal >= NumElems/2; - SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); - SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); + SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, - N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); + bool Upper = IdxVal >= NumElems/2; + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, + DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32)); // Insert the changed part back to the 256-bit vector - return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); + return Insert128BitVector(N0, V, IdxVal, DAG, dl); } if (Subtarget->hasSSE41()) @@ -6954,7 +7095,7 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. - if (OpVT.getSizeInBits() > 128) { + if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. EVT VT128 = EVT::getVectorVT(*Context, OpVT.getVectorElementType(), @@ -6963,19 +7104,16 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. 
- return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, - DAG.getConstant(0, MVT::i32), - DAG, dl); + return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } - if (Op.getValueType() == MVT::v1i64 && + if (OpVT == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); - assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && - "Expected an SSE type!"); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), + assert(OpVT.is128BitVector() && "Expected an SSE type!"); + return DAG.getNode(ISD::BITCAST, dl, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); } @@ -6989,9 +7127,11 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getNode()->getOperand(0); SDValue Idx = Op.getNode()->getOperand(1); - if (Op.getNode()->getValueType(0).getSizeInBits() == 128 - && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { - return Extract128BitVector(Vec, Idx, DAG, dl); + if (Op.getNode()->getValueType(0).is128BitVector() && + Vec.getNode()->getValueType(0).is256BitVector() && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Extract128BitVector(Vec, IdxVal, DAG, dl); } } return SDValue(); @@ -7008,9 +7148,11 @@ X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue SubVec = Op.getNode()->getOperand(1); SDValue Idx = Op.getNode()->getOperand(2); - if (Op.getNode()->getValueType(0).getSizeInBits() == 256 - && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { - return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); + if (Op.getNode()->getValueType(0).is256BitVector() && + SubVec.getNode()->getValueType(0).is128BitVector() && + isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); } } return SDValue(); @@ -7219,7 +7361,7 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, - unsigned char OperandFlags) { + unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); DebugLoc dl = GA->getDebugLoc(); @@ -7227,12 +7369,16 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, GA->getValueType(0), GA->getOffset(), OperandFlags); + + X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR + : X86ISD::TLSADDR; + if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); + Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. @@ -7264,11 +7410,49 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, X86::RAX, X86II::MO_TLSGD); } -// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or -// "local exec" model. 
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + const EVT PtrVT, + bool is64Bit) { + DebugLoc dl = GA->getDebugLoc(); + + // Get the start address of the TLS block for this module. + X86MachineFunctionInfo* MFI = DAG.getMachineFunction() + .getInfo<X86MachineFunctionInfo>(); + MFI->incNumLocalDynamicTLSAccesses(); + + SDValue Base; + if (is64Bit) { + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, + X86II::MO_TLSLD, /*LocalDynamic=*/true); + } else { + SDValue InFlag; + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); + InFlag = Chain.getValue(1); + Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, + X86II::MO_TLSLDM, /*LocalDynamic=*/true); + } + + // Note: the CleanupLocalDynamicTLSPass will remove redundant computations + // of Base. + + // Build x@dtpoff. + unsigned char OperandFlags = X86II::MO_DTPOFF; + unsigned WrapperKind = X86ISD::Wrapper; + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), OperandFlags); + SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); + + // Add x@dtpoff with the base. + return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, - bool is64Bit) { + bool is64Bit, bool isPIC) { DebugLoc dl = GA->getDebugLoc(); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). @@ -7286,25 +7470,36 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; - } else if (is64Bit) { - assert(model == TLSModel::InitialExec); - OperandFlags = X86II::MO_GOTTPOFF; - WrapperKind = X86ISD::WrapperRIP; + } else if (model == TLSModel::InitialExec) { + if (is64Bit) { + OperandFlags = X86II::MO_GOTTPOFF; + WrapperKind = X86ISD::WrapperRIP; + } else { + OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; + } } else { - assert(model == TLSModel::InitialExec); - OperandFlags = X86II::MO_INDNTPOFF; + llvm_unreachable("Unexpected model"); } - // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial - // exec) + // emit "addl x@ntpoff,%eax" (local exec) + // or "addl x@indntpoff,%eax" (initial exec) + // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); - if (model == TLSModel::InitialExec) + if (model == TLSModel::InitialExec) { + if (isPIC && !is64Bit) { + Offset = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), + Offset); + } + Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, + 0); + } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. 
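The new LowerToTLSLocalDynamicModel and the extended LowerToTLSExecModel above fill in the remaining ELF TLS models (local dynamic, plus an initial-exec flavour for 32-bit PIC). As a rough orientation, this is how the models map onto source-level situations; the summary is illustrative only, since the model actually chosen also depends on the relocation model, symbol visibility and any explicit tls_model attribute, which is exactly what the switch in LowerGlobalTLSAddress decides:

#include <cstdio>

// Rough mapping from source-level situation to ELF TLS model (illustrative only):
//
//   general-dynamic : PIC code, definition may live in any module
//                     -> __tls_get_addr-style call per variable (MO_TLSGD)
//   local-dynamic   : PIC code, definition known to be in this module
//                     -> one TLSBASEADDR call for the module base, then
//                        base + x@dtpoff per variable (MO_TLSLD / MO_TLSLDM)
//   initial-exec    : module present at program start
//                     -> load the offset from the GOT (x@gottpoff on 64-bit,
//                        x@gotntpoff(%ebx) for 32-bit PIC, x@indntpoff otherwise)
//                        and add the thread pointer %fs:0 / %gs:0, no call
//   local-exec      : variable lives in the executable itself
//                     -> constant x@tpoff / x@ntpoff from the thread pointer

static thread_local int counter = 0;  // internal linkage: typically local-dynamic in a
                                      // PIC shared object, local-exec in an executable

int bump() { return ++counter; }

int main() {
  std::printf("%d\n", bump());  // 1
  std::printf("%d\n", bump());  // 2
}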
@@ -7318,29 +7513,26 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = GA->getGlobal(); if (Subtarget->isTargetELF()) { - // TODO: implement the "local dynamic" model - // TODO: implement the "initial exec"model for pic executables - - // If GV is an alias then use the aliasee for determining - // thread-localness. - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) - GV = GA->resolveAliasedGlobal(false); - TLSModel::Model model = getTargetMachine().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: - case TLSModel::LocalDynamic: // not implemented if (Subtarget->is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); - + case TLSModel::LocalDynamic: + return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(), + Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, - Subtarget->is64Bit()); + Subtarget->is64Bit(), + getTargetMachine().getRelocationModel() == Reloc::PIC_); } - } else if (Subtarget->isTargetDarwin()) { + llvm_unreachable("Unknown TLS model."); + } + + if (Subtarget->isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? @@ -7383,7 +7575,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), Chain.getValue(1)); - } else if (Subtarget->isTargetWindows()) { + } + + if (Subtarget->isTargetWindows()) { // Just use the implicit TLS architecture // Need to generate someting similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage @@ -7429,7 +7623,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { false, false, false, 0); SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), - getPointerTy()); + getPointerTy()); IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); @@ -7600,9 +7794,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } #ifdef __SSE3__ - haddpd %xmm0, %xmm0 + haddpd %xmm0, %xmm0 #else - pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm0, %xmm1 addpd %xmm1, %xmm0 #endif */ @@ -7693,12 +7887,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, // Handle final rounding. EVT DestVT = Op.getValueType(); - if (DestVT.bitsLT(MVT::f64)) { + if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, DAG.getIntPtrConstant(0)); - } else if (DestVT.bitsGT(MVT::f64)) { + if (DestVT.bitsGT(MVT::f64)) return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); - } // Handle final rounding. 
return Sub; @@ -7719,10 +7912,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, EVT DstVT = Op.getValueType(); if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); - else if (SrcVT == MVT::i32 && X86ScalarSSEf64) + if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); - else if (Subtarget->is64Bit() && - SrcVT == MVT::i64 && DstVT == MVT::f32) + if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. @@ -7899,9 +8091,9 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); - else - // The node is the result. - return FIST; + + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, @@ -7916,9 +8108,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), FIST, StackSlot, MachinePointerInfo(), false, false, false, 0); - else - // The node is the result. - return FIST; + + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -7931,7 +8123,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, EltVT = VT.getVectorElementType(); Constant *C; if (EltVT == MVT::f64) { - C = ConstantVector::getSplat(2, + C = ConstantVector::getSplat(2, ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); } else { C = ConstantVector::getSplat(4, @@ -7965,15 +8157,15 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo::getConstantPool(), false, false, false, 16); if (VT.isVector()) { - MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; + MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::XOR, dl, XORVT, - DAG.getNode(ISD::BITCAST, dl, XORVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); - } else { - return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); + DAG.getNode(ISD::BITCAST, dl, XORVT, + Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } + + return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { @@ -8172,7 +8364,9 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, // Otherwise use a regular EFLAGS-setting instruction. switch (Op.getNode()->getOpcode()) { default: llvm_unreachable("unexpected operator!"); - case ISD::SUB: Opcode = X86ISD::SUB; break; + case ISD::SUB: + Opcode = X86ISD::SUB; + break; case ISD::OR: Opcode = X86ISD::OR; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; @@ -8198,6 +8392,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); + if (Opcode == X86ISD::CMP) { + SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0), + Op.getOperand(1)); + // We can't replace usage of SUB with CMP. + // The SUB node will be removed later because there is no use of it. 
+ return SDValue(New.getNode(), 0); + } + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector<SDValue, 4> Ops; for (unsigned i = 0; i != NumOperands; ++i) @@ -8217,9 +8419,41 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, return EmitTest(Op0, X86CC, DAG); DebugLoc dl = Op0.getDebugLoc(); + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || + Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { + // Use SUB instead of CMP to enable CSE between SUB and CMP. + SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, + Op0, Op1); + return SDValue(Sub.getNode(), 1); + } return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } +/// Convert a comparison if required by the subtarget. +SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, + SelectionDAG &DAG) const { + // If the subtarget does not support the FUCOMI instruction, floating-point + // comparisons have to be converted. + if (Subtarget->hasCMov() || + Cmp.getOpcode() != X86ISD::CMP || + !Cmp.getOperand(0).getValueType().isFloatingPoint() || + !Cmp.getOperand(1).getValueType().isFloatingPoint()) + return Cmp; + + // The instruction selector will select an FUCOM instruction instead of + // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence + // build an SDNode sequence that transfers the result from FPSW into EFLAGS: + // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) + DebugLoc dl = Cmp.getDebugLoc(); + SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); + SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); + SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, + DAG.getConstant(8, MVT::i8)); + SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); +} + /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. 
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, @@ -8341,6 +8575,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86CC, MVT::i8), EFLAGS); } @@ -8350,24 +8585,22 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC && + assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); DebugLoc dl = Op.getDebugLoc(); SDValue CC = Op.getOperand(2); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); - SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); - SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType().getSimpleVT(); @@ -8389,10 +8622,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); if (isFP) { - unsigned SSECC = 8; +#ifndef NDEBUG EVT EltVT = Op0.getValueType().getVectorElementType(); - assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT; + assert(EltVT == MVT::f32 || EltVT == MVT::f64); +#endif + unsigned SSECC; bool Swap = false; // SSE Condition code mapping: @@ -8405,7 +8640,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // 6 - NLE // 7 - ORD switch (SetCCOpcode) { - default: break; + default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: @@ -8419,33 +8654,33 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; - case ISD::SETULE: Swap = true; + case ISD::SETULE: Swap = true; // Fallthrough case ISD::SETUGE: SSECC = 5; break; - case ISD::SETULT: Swap = true; + case ISD::SETULT: Swap = true; // Fallthrough case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; + case ISD::SETUEQ: + case ISD::SETONE: SSECC = 8; break; } if (Swap) std::swap(Op0, Op1); // In the two special cases we can't handle, emit two comparisons. 
if (SSECC == 8) { + unsigned CC0, CC1; + unsigned CombineOpc; if (SetCCOpcode == ISD::SETUEQ) { - SDValue UNORD, EQ; - UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, - DAG.getConstant(3, MVT::i8)); - EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, - DAG.getConstant(0, MVT::i8)); - return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); - } else if (SetCCOpcode == ISD::SETONE) { - SDValue ORD, NEQ; - ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, - DAG.getConstant(7, MVT::i8)); - NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, - DAG.getConstant(4, MVT::i8)); - return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); + CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; + } else { + assert(SetCCOpcode == ISD::SETONE); + CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; } - llvm_unreachable("Illegal FP comparison"); + + SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(CC0, MVT::i8)); + SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(CC1, MVT::i8)); + return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } // Handle all other FP comparisons here. return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, @@ -8453,17 +8688,17 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { } // Break 256-bit integer vector compare into smaller ones. - if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) + if (VT.is256BitVector() && !Subtarget->hasAVX2()) return Lower256IntVSETCC(Op, DAG); // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. - unsigned Opc = 0; + unsigned Opc; bool Swap = false, Invert = false, FlipSigns = false; switch (SetCCOpcode) { - default: break; + default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETNE: Invert = true; case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; @@ -8480,10 +8715,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). - if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42()) - return SDValue(); - if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41()) - return SDValue(); + if (VT == MVT::v2i64) { + if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) + return SDValue(); + if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) + return SDValue(); + } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. @@ -8510,7 +8747,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
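// Standalone sketches of two tricks from the code above, illustration only
// (scalar C++ stands in for the packed lanes):
//  * SETUEQ / SETONE have no single CMPP immediate, so the lowering issues two
//    compares and combines them: ueq = unordered OR equal (cc 3, cc 0),
//    one = ordered AND not-equal (cc 7, cc 4).
//  * SSE has no unsigned integer compares, so unsigned order is recovered by
//    flipping the sign bit of both operands and doing a signed compare.
#include <cmath>
#include <cstdint>
#include <cstdio>

static bool ueq(double a, double b) { return std::isunordered(a, b) || a == b; }
static bool one(double a, double b) { return !std::isunordered(a, b) && a != b; }

static bool ugt(uint32_t a, uint32_t b) {
  // Signed compare after XORing the sign bits behaves like an unsigned compare.
  return static_cast<int32_t>(a ^ 0x80000000u) >
         static_cast<int32_t>(b ^ 0x80000000u);
}

int main() {
  double n = std::nan("");
  std::printf("ueq(1,NaN)=%d one(1,NaN)=%d one(1,2)=%d\n",
              ueq(1.0, n), one(1.0, n), one(1.0, 2.0));
  std::printf("ugt(0x80000000,1)=%d\n", ugt(0x80000000u, 1u));
  return 0;
}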
static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); - if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || + Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || @@ -8542,6 +8780,16 @@ static bool isAllOnes(SDValue V) { return C && C->isAllOnesValue(); } +static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { + if (V.getOpcode() != ISD::TRUNCATE) + return false; + + SDValue VOp0 = V.getOperand(0); + unsigned InBits = VOp0.getValueSizeInBits(); + unsigned Bits = V.getValueSizeInBits(); + return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); +} + SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Cond = Op.getOperand(0); @@ -8572,8 +8820,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Y = isAllOnes(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases + // (select (x != 0), -1, 0) -> neg & sbb + // (select (x == 0), 0, -1) -> neg & sbb + if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) + if (YC->isNullValue() && + (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, + DAG.getConstant(0, CmpOp0.getValueType()), + CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } + Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), @@ -8654,9 +8919,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - // Look pass the truncate. - if (Cond.getOpcode() == ISD::TRUNCATE) - Cond = Cond.getOperand(0); + // Look pass the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. @@ -8679,7 +8944,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 
0 : -1 -> RES = ~setcc_carry - if (Cond.getOpcode() == X86ISD::CMP) { + if (Cond.getOpcode() == X86ISD::SUB) { + Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && @@ -8918,6 +9184,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); @@ -8947,6 +9214,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); @@ -8960,9 +9228,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - // Look pass the truncate. - if (Cond.getOpcode() == ISD::TRUNCATE) - Cond = Cond.getOperand(0); + // Look pass the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. @@ -8980,6 +9248,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { CC = DAG.getConstant(X86::COND_NE, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DAG); } + Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } @@ -9018,7 +9287,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const Function *F = MF.getFunction(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; I++) + I != E; ++I) if (I->hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); @@ -9201,12 +9470,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); if (isa<ConstantSDNode>(ShAmt)) { + // Constant may be a TargetConstant. Use a regular constant. + uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue(); switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: - return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, + DAG.getConstant(ShiftAmt, MVT::i32)); } } @@ -9223,10 +9495,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, SDValue ShOps[4]; ShOps[0] = ShAmt; ShOps[1] = DAG.getConstant(0, MVT::i32); - ShOps[2] = DAG.getUNDEF(MVT::i32); - ShOps[3] = DAG.getUNDEF(MVT::i32); + ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); - ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); + + // The return type has to be a 128-bit type with the same element + // type as the input type. 
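// A standalone sketch of the NEG + SBB idiom used in LowerSELECT above for
// (select (x != 0), -1, 0): NEG sets the carry flag exactly when x is
// non-zero, and SBB of a register with itself then produces 0 or all-ones.
// Plain C++ models the carry explicitly; none of this is LLVM API.
#include <cstdint>
#include <cstdio>

static uint32_t selectAllOnesIfNonZero(uint32_t x) {
  uint32_t carry = (x != 0);  // carry out of "neg x" (0 - x borrows iff x != 0)
  return 0u - carry;          // "sbb r, r": 0 - 0 - carry = 0 or 0xffffffff
}

int main() {
  std::printf("%08x %08x\n", selectAllOnesIfNonZero(0),
              selectAllOnesIfNonZero(42));
  return 0;
}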
+ MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + + ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } @@ -9261,8 +9538,8 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_sse2_ucomigt_sd: case Intrinsic::x86_sse2_ucomige_sd: case Intrinsic::x86_sse2_ucomineq_sd: { - unsigned Opc = 0; - ISD::CondCode CC = ISD::SETCC_INVALID; + unsigned Opc; + ISD::CondCode CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_comieq_ss: @@ -9336,245 +9613,102 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // XOP comparison intrinsics - case Intrinsic::x86_xop_vpcomltb: - case Intrinsic::x86_xop_vpcomltw: - case Intrinsic::x86_xop_vpcomltd: - case Intrinsic::x86_xop_vpcomltq: - case Intrinsic::x86_xop_vpcomltub: - case Intrinsic::x86_xop_vpcomltuw: - case Intrinsic::x86_xop_vpcomltud: - case Intrinsic::x86_xop_vpcomltuq: - case Intrinsic::x86_xop_vpcomleb: - case Intrinsic::x86_xop_vpcomlew: - case Intrinsic::x86_xop_vpcomled: - case Intrinsic::x86_xop_vpcomleq: - case Intrinsic::x86_xop_vpcomleub: - case Intrinsic::x86_xop_vpcomleuw: - case Intrinsic::x86_xop_vpcomleud: - case Intrinsic::x86_xop_vpcomleuq: - case Intrinsic::x86_xop_vpcomgtb: - case Intrinsic::x86_xop_vpcomgtw: - case Intrinsic::x86_xop_vpcomgtd: - case Intrinsic::x86_xop_vpcomgtq: - case Intrinsic::x86_xop_vpcomgtub: - case Intrinsic::x86_xop_vpcomgtuw: - case Intrinsic::x86_xop_vpcomgtud: - case Intrinsic::x86_xop_vpcomgtuq: - case Intrinsic::x86_xop_vpcomgeb: - case Intrinsic::x86_xop_vpcomgew: - case Intrinsic::x86_xop_vpcomged: - case Intrinsic::x86_xop_vpcomgeq: - case Intrinsic::x86_xop_vpcomgeub: - case Intrinsic::x86_xop_vpcomgeuw: - case Intrinsic::x86_xop_vpcomgeud: - case Intrinsic::x86_xop_vpcomgeuq: - case Intrinsic::x86_xop_vpcomeqb: - case Intrinsic::x86_xop_vpcomeqw: - case Intrinsic::x86_xop_vpcomeqd: - case Intrinsic::x86_xop_vpcomeqq: - case Intrinsic::x86_xop_vpcomequb: - case Intrinsic::x86_xop_vpcomequw: - case Intrinsic::x86_xop_vpcomequd: - case Intrinsic::x86_xop_vpcomequq: - case Intrinsic::x86_xop_vpcomneb: - case Intrinsic::x86_xop_vpcomnew: - case Intrinsic::x86_xop_vpcomned: - case Intrinsic::x86_xop_vpcomneq: - case Intrinsic::x86_xop_vpcomneub: - case Intrinsic::x86_xop_vpcomneuw: - case Intrinsic::x86_xop_vpcomneud: - case Intrinsic::x86_xop_vpcomneuq: - case Intrinsic::x86_xop_vpcomfalseb: - case Intrinsic::x86_xop_vpcomfalsew: - case Intrinsic::x86_xop_vpcomfalsed: - case Intrinsic::x86_xop_vpcomfalseq: - case Intrinsic::x86_xop_vpcomfalseub: - case Intrinsic::x86_xop_vpcomfalseuw: - case Intrinsic::x86_xop_vpcomfalseud: - case Intrinsic::x86_xop_vpcomfalseuq: - case Intrinsic::x86_xop_vpcomtrueb: - case Intrinsic::x86_xop_vpcomtruew: - case Intrinsic::x86_xop_vpcomtrued: - case Intrinsic::x86_xop_vpcomtrueq: - case Intrinsic::x86_xop_vpcomtrueub: - case Intrinsic::x86_xop_vpcomtrueuw: - case Intrinsic::x86_xop_vpcomtrueud: - case Intrinsic::x86_xop_vpcomtrueuq: { - unsigned CC = 0; - unsigned Opc = 0; - - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
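// A standalone sketch of what getTargetVShiftNode arranges above: a
// non-constant scalar shift amount is placed in lane 0 of a 128-bit value
// (the BUILD_VECTOR of {amt, 0, undef, undef}) and PSLL/PSRL/PSRA then shift
// every element by that one count. Arrays stand in for vectors; illustration
// only.
#include <cstdint>
#include <cstdio>

static void shiftLeftAllLanes(uint32_t lanes[4], uint32_t amt) {
  for (int i = 0; i != 4; ++i)
    lanes[i] <<= amt;          // one scalar count applied to every lane
}

int main() {
  uint32_t v[4] = {1, 2, 3, 4};
  shiftLeftAllLanes(v, 3);
  std::printf("%u %u %u %u\n", v[0], v[1], v[2], v[3]);
  return 0;
}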
- case Intrinsic::x86_xop_vpcomltb: - case Intrinsic::x86_xop_vpcomltw: - case Intrinsic::x86_xop_vpcomltd: - case Intrinsic::x86_xop_vpcomltq: - CC = 0; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomltub: - case Intrinsic::x86_xop_vpcomltuw: - case Intrinsic::x86_xop_vpcomltud: - case Intrinsic::x86_xop_vpcomltuq: - CC = 0; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomleb: - case Intrinsic::x86_xop_vpcomlew: - case Intrinsic::x86_xop_vpcomled: - case Intrinsic::x86_xop_vpcomleq: - CC = 1; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomleub: - case Intrinsic::x86_xop_vpcomleuw: - case Intrinsic::x86_xop_vpcomleud: - case Intrinsic::x86_xop_vpcomleuq: - CC = 1; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomgtb: - case Intrinsic::x86_xop_vpcomgtw: - case Intrinsic::x86_xop_vpcomgtd: - case Intrinsic::x86_xop_vpcomgtq: - CC = 2; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomgtub: - case Intrinsic::x86_xop_vpcomgtuw: - case Intrinsic::x86_xop_vpcomgtud: - case Intrinsic::x86_xop_vpcomgtuq: - CC = 2; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomgeb: - case Intrinsic::x86_xop_vpcomgew: - case Intrinsic::x86_xop_vpcomged: - case Intrinsic::x86_xop_vpcomgeq: - CC = 3; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomgeub: - case Intrinsic::x86_xop_vpcomgeuw: - case Intrinsic::x86_xop_vpcomgeud: - case Intrinsic::x86_xop_vpcomgeuq: - CC = 3; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomeqb: - case Intrinsic::x86_xop_vpcomeqw: - case Intrinsic::x86_xop_vpcomeqd: - case Intrinsic::x86_xop_vpcomeqq: - CC = 4; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomequb: - case Intrinsic::x86_xop_vpcomequw: - case Intrinsic::x86_xop_vpcomequd: - case Intrinsic::x86_xop_vpcomequq: - CC = 4; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomneb: - case Intrinsic::x86_xop_vpcomnew: - case Intrinsic::x86_xop_vpcomned: - case Intrinsic::x86_xop_vpcomneq: - CC = 5; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomneub: - case Intrinsic::x86_xop_vpcomneuw: - case Intrinsic::x86_xop_vpcomneud: - case Intrinsic::x86_xop_vpcomneuq: - CC = 5; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomfalseb: - case Intrinsic::x86_xop_vpcomfalsew: - case Intrinsic::x86_xop_vpcomfalsed: - case Intrinsic::x86_xop_vpcomfalseq: - CC = 6; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomfalseub: - case Intrinsic::x86_xop_vpcomfalseuw: - case Intrinsic::x86_xop_vpcomfalseud: - case Intrinsic::x86_xop_vpcomfalseuq: - CC = 6; - Opc = X86ISD::VPCOMU; - break; - case Intrinsic::x86_xop_vpcomtrueb: - case Intrinsic::x86_xop_vpcomtruew: - case Intrinsic::x86_xop_vpcomtrued: - case Intrinsic::x86_xop_vpcomtrueq: - CC = 7; - Opc = X86ISD::VPCOM; - break; - case Intrinsic::x86_xop_vpcomtrueub: - case Intrinsic::x86_xop_vpcomtrueuw: - case Intrinsic::x86_xop_vpcomtrueud: - case Intrinsic::x86_xop_vpcomtrueuq: - CC = 7; - Opc = X86ISD::VPCOMU; - break; - } - - SDValue LHS = Op.getOperand(1); - SDValue RHS = Op.getOperand(2); - return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS, - DAG.getConstant(CC, MVT::i8)); - } // Arithmetic intrinsics. 
case Intrinsic::x86_sse2_pmulu_dq: case Intrinsic::x86_avx2_pmulu_dq: return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + + // SSE3/AVX horizontal add/sub intrinsics case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: case Intrinsic::x86_avx_hadd_ps_256: case Intrinsic::x86_avx_hadd_pd_256: - return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse3_hsub_ps: case Intrinsic::x86_sse3_hsub_pd: case Intrinsic::x86_avx_hsub_ps_256: case Intrinsic::x86_avx_hsub_pd_256: - return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_ssse3_phadd_w_128: case Intrinsic::x86_ssse3_phadd_d_128: case Intrinsic::x86_avx2_phadd_w: case Intrinsic::x86_avx2_phadd_d: - return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_ssse3_phsub_w_128: case Intrinsic::x86_ssse3_phsub_d_128: case Intrinsic::x86_avx2_phsub_w: - case Intrinsic::x86_avx2_phsub_d: - return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(), + case Intrinsic::x86_avx2_phsub_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse3_hadd_ps: + case Intrinsic::x86_sse3_hadd_pd: + case Intrinsic::x86_avx_hadd_ps_256: + case Intrinsic::x86_avx_hadd_pd_256: + Opcode = X86ISD::FHADD; + break; + case Intrinsic::x86_sse3_hsub_ps: + case Intrinsic::x86_sse3_hsub_pd: + case Intrinsic::x86_avx_hsub_ps_256: + case Intrinsic::x86_avx_hsub_pd_256: + Opcode = X86ISD::FHSUB; + break; + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_avx2_phadd_w: + case Intrinsic::x86_avx2_phadd_d: + Opcode = X86ISD::HADD; + break; + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_avx2_phsub_w: + case Intrinsic::x86_avx2_phsub_d: + Opcode = X86ISD::HSUB; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + } + + // AVX2 variable shift intrinsics case Intrinsic::x86_avx2_psllv_d: case Intrinsic::x86_avx2_psllv_q: case Intrinsic::x86_avx2_psllv_d_256: case Intrinsic::x86_avx2_psllv_q_256: - return DAG.getNode(ISD::SHL, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_avx2_psrlv_d: case Intrinsic::x86_avx2_psrlv_q: case Intrinsic::x86_avx2_psrlv_d_256: case Intrinsic::x86_avx2_psrlv_q_256: - return DAG.getNode(ISD::SRL, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - return DAG.getNode(ISD::SRA, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrav_d_256: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
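// A standalone sketch of the horizontal-add semantics behind the FHADD/HADD
// mapping above: hadd(a, b) adds adjacent element pairs of each source, so for
// four lanes the result is { a0+a1, a2+a3, b0+b1, b2+b3 } (hsub subtracts the
// second element of each pair instead). Illustration only.
#include <array>
#include <cstdio>

static std::array<float, 4> hadd(const std::array<float, 4> &a,
                                 const std::array<float, 4> &b) {
  return { a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3] };
}

int main() {
  auto r = hadd({1, 2, 3, 4}, {10, 20, 30, 40});
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
  return 0;
}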
+ case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + Opcode = ISD::SHL; + break; + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + Opcode = ISD::SRL; + break; + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + Opcode = ISD::SRA; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::x86_ssse3_pshuf_b_128: case Intrinsic::x86_avx2_pshuf_b: return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: case Intrinsic::x86_ssse3_psign_w_128: case Intrinsic::x86_ssse3_psign_d_128: @@ -9583,15 +9717,18 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx2_psign_d: return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_insertps: return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vperm2f128_ps_256: case Intrinsic::x86_avx_vperm2f128_pd_256: case Intrinsic::x86_avx_vperm2f128_si_256: case Intrinsic::x86_avx2_vperm2i128: return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, @@ -9621,7 +9758,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { bool IsTestPacked = false; - unsigned X86CC = 0; + unsigned X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx_vtestz_ps: @@ -9672,44 +9809,93 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx2_psll_w: case Intrinsic::x86_avx2_psll_d: case Intrinsic::x86_avx2_psll_q: - return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse2_psrl_w: case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_avx2_psrl_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: - return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse2_psra_w: case Intrinsic::x86_sse2_psra_d: case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: - return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(), + case Intrinsic::x86_avx2_psra_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
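// A standalone sketch of the flag semantics behind the vtestz/vtestc/vtestnzc
// intrinsics handled above: for the integer form the test sets ZF when
// (a & b) == 0 and CF when (~a & b) == 0 (the PS/PD forms look only at sign
// bits), and each intrinsic is lowered to one test node plus a SETCC reading
// back the appropriate flag. 64-bit scalars stand in for the 128/256-bit
// vectors; illustration only.
#include <cstdint>
#include <cstdio>

static bool testz(uint64_t a, uint64_t b)   { return (a & b) == 0; }   // ZF
static bool testc(uint64_t a, uint64_t b)   { return (~a & b) == 0; }  // CF
static bool testnzc(uint64_t a, uint64_t b) { return !testz(a, b) && !testc(a, b); }

int main() {
  std::printf("%d %d %d\n", testz(0x0f, 0xf0), testc(0xff, 0x0f),
              testnzc(0x0f, 0x1f));
  return 0;
}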
+ case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + Opcode = X86ISD::VSHL; + break; + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + Opcode = X86ISD::VSRL; + break; + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: + Opcode = X86ISD::VSRA; + break; + } + return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + } + + // SSE/AVX immediate shift intrinsics case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_avx2_pslli_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: - return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: - return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_sse2_psrai_d: case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(), + case Intrinsic::x86_avx2_psrai_d: { + unsigned Opcode; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + Opcode = X86ISD::VSHLI; + break; + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + Opcode = X86ISD::VSRLI; + break; + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + Opcode = X86ISD::VSRAI; + break; + } + return getTargetVShiftNode(Opcode, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), DAG); + } + // Fix vector shift instructions where the last operand is a non-immediate // i32 value. case Intrinsic::x86_mmx_pslli_w: @@ -9724,8 +9910,9 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const if (isa<ConstantSDNode>(ShAmt)) return SDValue(); - unsigned NewIntNo = 0; + unsigned NewIntNo; switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_mmx_pslli_w: NewIntNo = Intrinsic::x86_mmx_psll_w; break; @@ -9750,7 +9937,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_mmx_psrai_d: NewIntNo = Intrinsic::x86_mmx_psra_d; break; - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
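// A standalone sketch distinguishing the VSRLI and VSRAI forms consolidated
// above: a logical right shift fills with zeros, an arithmetic right shift
// replicates the sign bit, which is why the immediate-shift intrinsics map to
// different opcodes. Illustration only; pre-C++20 the sign-propagating
// behaviour of >> on negative values is implementation-defined, though it
// matches SRA on mainstream x86 compilers.
#include <cstdint>
#include <cstdio>

static uint32_t srl(uint32_t x, unsigned n) { return x >> n; }
static int32_t  sra(int32_t x, unsigned n)  { return x >> n; }  // sign-extending

int main() {
  uint32_t bits = 0x80000010u;
  std::printf("srl: %08x  sra: %08x\n", srl(bits, 4),
              static_cast<uint32_t>(sra(static_cast<int32_t>(bits), 4)));
  return 0;
}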
} // The vector shift intrinsics with scalars uses 32b shift amounts but @@ -9766,6 +9952,116 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(NewIntNo, MVT::i32), Op.getOperand(1), ShAmt); } + case Intrinsic::x86_sse42_pcmpistria128: + case Intrinsic::x86_sse42_pcmpestria128: + case Intrinsic::x86_sse42_pcmpistric128: + case Intrinsic::x86_sse42_pcmpestric128: + case Intrinsic::x86_sse42_pcmpistrio128: + case Intrinsic::x86_sse42_pcmpestrio128: + case Intrinsic::x86_sse42_pcmpistris128: + case Intrinsic::x86_sse42_pcmpestris128: + case Intrinsic::x86_sse42_pcmpistriz128: + case Intrinsic::x86_sse42_pcmpestriz128: { + unsigned Opcode; + unsigned X86CC; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_sse42_pcmpistria128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_A; + break; + case Intrinsic::x86_sse42_pcmpestria128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_A; + break; + case Intrinsic::x86_sse42_pcmpistric128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_sse42_pcmpestric128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_sse42_pcmpistrio128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_O; + break; + case Intrinsic::x86_sse42_pcmpestrio128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_O; + break; + case Intrinsic::x86_sse42_pcmpistris128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_S; + break; + case Intrinsic::x86_sse42_pcmpestris128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_S; + break; + case Intrinsic::x86_sse42_pcmpistriz128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_E; + break; + case Intrinsic::x86_sse42_pcmpestriz128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_E; + break; + } + SmallVector<SDValue, 5> NewOps; + NewOps.append(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), + SDValue(PCMP.getNode(), 1)); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + case Intrinsic::x86_sse42_pcmpistri128: + case Intrinsic::x86_sse42_pcmpestri128: { + unsigned Opcode; + if (IntNo == Intrinsic::x86_sse42_pcmpistri128) + Opcode = X86ISD::PCMPISTRI; + else + Opcode = X86ISD::PCMPESTRI; + + SmallVector<SDValue, 5> NewOps; + NewOps.append(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); + } + } +} + +SDValue +X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + + // RDRAND intrinsics. + case Intrinsic::x86_rdrand_16: + case Intrinsic::x86_rdrand_32: + case Intrinsic::x86_rdrand_64: { + // Emit the node with the right value type. + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); + SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0)); + + // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise + // return the value from Rand, which is always 0, casted to i32. 
+ SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, Op->getValueType(1)), + DAG.getConstant(X86::COND_B, MVT::i32), + SDValue(Result.getNode(), 1) }; + SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, + DAG.getVTList(Op->getValueType(1), MVT::Glue), + Ops, 4); + + // Return { result, isValid, chain }. + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, + SDValue(Result.getNode(), 2)); + } } } @@ -9816,7 +10112,6 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); @@ -9833,7 +10128,6 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); - MF.getRegInfo().addLiveOut(StoreAddrReg); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, @@ -10149,23 +10443,21 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - assert(VT.getSizeInBits() == 256 && VT.isInteger() && + assert(VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); DebugLoc dl = Op.getDebugLoc(); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); - SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); - SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -10176,14 +10468,14 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { } SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType().getSizeInBits() == 256 && + assert(Op.getValueType().is256BitVector() && Op.getValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType().getSizeInBits() == 256 && + assert(Op.getValueType().is256BitVector() && Op.getValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); @@ -10193,7 +10485,7 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. 
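// A standalone sketch of how the {value, isValid} pair produced by the RDRAND
// lowering above is typically consumed: retry while the carry-derived isValid
// bit is 0. The generator below is a hypothetical stand-in stub, not the real
// instruction or any LLVM/compiler intrinsic.
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for RDRAND: occasionally reports failure to model CF=0.
static bool rdrandStub(uint32_t &out) {
  static uint32_t state = 0x12345678u;
  state = state * 1664525u + 1013904223u;   // LCG, illustration only
  out = state;
  return (state & 3u) != 0;                 // pretend ~1/4 of attempts fail
}

int main() {
  uint32_t value = 0;
  int attempts = 0;
  while (!rdrandStub(value) && ++attempts < 16) {
    // CF was 0: no random data available yet, try again.
  }
  std::printf("value=%08x retries=%d\n", value, attempts);
  return 0;
}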
- if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) + if (VT.is256BitVector() && !Subtarget->hasAVX2()) return Lower256IntArith(Op, DAG); assert((VT == MVT::v2i64 || VT == MVT::v4i64) && @@ -10310,6 +10602,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } + llvm_unreachable("Unknown shift opcode."); } if (Subtarget->hasAVX2() && VT == MVT::v32i8) { @@ -10353,6 +10646,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } + llvm_unreachable("Unknown shift opcode."); } } } @@ -10421,15 +10715,14 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { } // Decompose 256-bit shifts into smaller 128-bit shifts. - if (VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors - SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); - SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + SDValue V1 = Extract128BitVector(R, 0, DAG, dl); + SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); // Recreate the shift amount vectors SDValue Amt1, Amt2; @@ -10448,9 +10741,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { &Amt2Csts[0], NumElems/2); } else { // Variable shift amount - Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); - Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + Amt1 = Extract128BitVector(Amt, 0, DAG, dl); + Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); } // Issue new vector shifts for the smaller types @@ -10560,20 +10852,18 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, return SDValue(); if (!Subtarget->hasAVX2()) { // needs to be split - int NumElems = VT.getVectorNumElements(); - SDValue Idx0 = DAG.getConstant(0, MVT::i32); - SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + unsigned NumElems = VT.getVectorNumElements(); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); - SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); - SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); EVT ExtraEltVT = ExtraVT.getVectorElementType(); - int ExtraNumElems = ExtraVT.getVectorNumElements(); + unsigned ExtraNumElems = ExtraVT.getVectorNumElements(); ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT, ExtraNumElems/2); SDValue Extra = DAG.getValueType(ExtraVT); @@ -10859,6 +11149,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: @@ -10913,9 +11204,9 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, Results.push_back(Swap.getValue(1)); } -void X86TargetLowering:: +static void 
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, - SelectionDAG &DAG, unsigned NewOp) const { + SelectionDAG &DAG, unsigned NewOp) { DebugLoc dl = Node->getDebugLoc(); assert (Node->getValueType(0) == MVT::i64 && "Only know how to expand i64 atomics"); @@ -11013,7 +11304,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? X86::RBX : X86::EBX, swapInL, cpInH.getValue(1)); swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, - Regs64bit ? X86::RCX : X86::ECX, + Regs64bit ? X86::RCX : X86::ECX, swapInH, swapInL.getValue(1)); SDValue Ops[] = { swapInH.getValue(0), N->getOperand(1), @@ -11036,26 +11327,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::ATOMIC_LOAD_ADD: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); - return; case ISD::ATOMIC_LOAD_AND: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); - return; case ISD::ATOMIC_LOAD_NAND: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); - return; case ISD::ATOMIC_LOAD_OR: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); - return; case ISD::ATOMIC_LOAD_SUB: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); - return; case ISD::ATOMIC_LOAD_XOR: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); - return; - case ISD::ATOMIC_SWAP: - ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); + case ISD::ATOMIC_SWAP: { + unsigned Opc; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::ATOMIC_LOAD_ADD: + Opc = X86ISD::ATOMADD64_DAG; + break; + case ISD::ATOMIC_LOAD_AND: + Opc = X86ISD::ATOMAND64_DAG; + break; + case ISD::ATOMIC_LOAD_NAND: + Opc = X86ISD::ATOMNAND64_DAG; + break; + case ISD::ATOMIC_LOAD_OR: + Opc = X86ISD::ATOMOR64_DAG; + break; + case ISD::ATOMIC_LOAD_SUB: + Opc = X86ISD::ATOMSUB64_DAG; + break; + case ISD::ATOMIC_LOAD_XOR: + Opc = X86ISD::ATOMXOR64_DAG; + break; + case ISD::ATOMIC_SWAP: + Opc = X86ISD::ATOMSWAP64_DAG; + break; + } + ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc); return; + } case ISD::ATOMIC_LOAD: ReplaceATOMIC_LOAD(N, Results, DAG); } @@ -11118,10 +11423,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; + case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; @@ -11131,7 +11438,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; + case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -11190,6 +11499,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const 
{ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; + case X86ISD::SAHF: return "X86ISD::SAHF"; + case X86ISD::RDRAND: return "X86ISD::RDRAND"; + case X86ISD::FMADD: return "X86ISD::FMADD"; + case X86ISD::FMSUB: return "X86ISD::FMSUB"; + case X86ISD::FNMADD: return "X86ISD::FNMADD"; + case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; + case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; + case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; } } @@ -11258,6 +11575,15 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { return true; } +bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return Imm == (int32_t)Imm; +} + +bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { + // Can also use sub to handle negated immediates. + return Imm == (int32_t)Imm; +} + bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; @@ -11300,8 +11626,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isMOVLMask(M, VT) || isSHUFPMask(M, VT, Subtarget->hasAVX()) || isPSHUFDMask(M, VT) || - isPSHUFHWMask(M, VT) || - isPSHUFLWMask(M, VT) || + isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) || + isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) || isPALIGNRMask(M, VT, Subtarget) || isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || @@ -11316,7 +11642,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, // FIXME: This collection of masks seems suspect. if (NumElts == 2) return true; - if (NumElts == 4 && VT.getSizeInBits() == 128) { + if (NumElts == 4 && VT.is128BitVector()) { return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || @@ -11460,7 +11786,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, // result in out1, out2 // fallthrough -->nextMBB - const TargetRegisterClass *RC = X86::GR32RegisterClass; + const TargetRegisterClass *RC = &X86::GR32RegClass; const unsigned LoadOpc = X86::MOV32rm; const unsigned NotOpc = X86::NOT32r; const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -11662,7 +11988,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] int valArgIndx = lastAddrIndx + 1; - unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); @@ -11672,7 +11998,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, argOpers[valArgIndx]->isImm()) && "invalid operand"); - unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); if (argOpers[valArgIndx]->isReg()) MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); else @@ -11687,7 +12013,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MIB.addReg(t2); // Generate movc - unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); MIB.addReg(t2); MIB.addReg(t1); @@ 
-11742,8 +12068,7 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, MIB.addOperand(Op); } BuildMI(*BB, MI, dl, - TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr), - MI->getOperand(0).getReg()) + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) .addReg(X86::XMM0); MI->eraseFromParent(); @@ -11776,24 +12101,6 @@ X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { } MachineBasicBlock * -X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { - DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - - // First arg in ECX, the second in EAX. - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) - .addReg(MI->getOperand(0).getReg()); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) - .addReg(MI->getOperand(1).getReg()); - - // The instruction doesn't actually take any operands though. - BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); - - MI->eraseFromParent(); // The pseudo is gone now. - return BB; -} - -MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr *MI, MachineBasicBlock *MBB) const { @@ -12306,8 +12613,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI) + .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) + .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) @@ -12517,7 +12825,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // Load the old value of the high byte of the control word... unsigned OldCW = - F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); + F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); @@ -12582,22 +12890,35 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // String/text processing lowering. case X86::PCMPISTRM128REG: case X86::VPCMPISTRM128REG: - return EmitPCMP(MI, BB, 3, false /* in-mem */); case X86::PCMPISTRM128MEM: case X86::VPCMPISTRM128MEM: - return EmitPCMP(MI, BB, 3, true /* in-mem */); case X86::PCMPESTRM128REG: case X86::VPCMPESTRM128REG: - return EmitPCMP(MI, BB, 5, false /* in mem */); case X86::PCMPESTRM128MEM: - case X86::VPCMPESTRM128MEM: - return EmitPCMP(MI, BB, 5, true /* in mem */); + case X86::VPCMPESTRM128MEM: { + unsigned NumArgs; + bool MemArg; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRM128REG: + case X86::VPCMPISTRM128REG: + NumArgs = 3; MemArg = false; break; + case X86::PCMPISTRM128MEM: + case X86::VPCMPISTRM128MEM: + NumArgs = 3; MemArg = true; break; + case X86::PCMPESTRM128REG: + case X86::VPCMPESTRM128REG: + NumArgs = 5; MemArg = false; break; + case X86::PCMPESTRM128MEM: + case X86::VPCMPESTRM128MEM: + NumArgs = 5; MemArg = true; break; + } + return EmitPCMP(MI, BB, NumArgs, MemArg); + } // Thread synchronization. case X86::MONITOR: return EmitMonitor(MI, BB); - case X86::MWAIT: - return EmitMwait(MI, BB); // Atomic Lowering. 
case X86::ATOMAND32: @@ -12605,25 +12926,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMOR32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, X86::OR32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMXOR32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, X86::XOR32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass); + &X86::GR32RegClass); case X86::ATOMNAND32: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, X86::AND32ri, X86::MOV32rm, X86::LCMPXCHG32, X86::NOT32r, X86::EAX, - X86::GR32RegisterClass, true); + &X86::GR32RegClass, true); case X86::ATOMMIN32: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); case X86::ATOMMAX32: @@ -12638,25 +12959,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMOR16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, X86::OR16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMXOR16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, X86::XOR16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass); + &X86::GR16RegClass); case X86::ATOMNAND16: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, X86::AND16ri, X86::MOV16rm, X86::LCMPXCHG16, X86::NOT16r, X86::AX, - X86::GR16RegisterClass, true); + &X86::GR16RegClass, true); case X86::ATOMMIN16: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); case X86::ATOMMAX16: @@ -12671,25 +12992,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMOR8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, X86::OR8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMXOR8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, X86::XOR8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass); + &X86::GR8RegClass); case X86::ATOMNAND8: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, X86::AND8ri, X86::MOV8rm, X86::LCMPXCHG8, X86::NOT8r, X86::AL, - X86::GR8RegisterClass, true); + &X86::GR8RegClass, true); // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. // This group is for 64-bit host. 
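// A standalone sketch of the compare-exchange loop that the ATOM* custom
// inserters above expand to: load the old value, compute the new one with
// AND/OR/XOR (plus a NOT for the NAND forms), attempt LCMPXCHG, and retry on
// failure. std::atomic stands in for the pseudo instructions; illustration
// only.
#include <atomic>
#include <cstdint>
#include <cstdio>

static uint32_t atomicNand(std::atomic<uint32_t> &mem, uint32_t val) {
  uint32_t old = mem.load();
  // Retry until the cmpxchg succeeds, just like the loop block built around
  // LCMPXCHG32/LCMPXCHG64.
  while (!mem.compare_exchange_weak(old, ~(old & val))) {
  }
  return old;   // the original value, which the atomic pseudo also returns
}

int main() {
  std::atomic<uint32_t> m(0xffffffffu);
  uint32_t prev = atomicNand(m, 0x0f0f0f0fu);
  std::printf("prev=%08x now=%08x\n", prev, m.load());
  return 0;
}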
case X86::ATOMAND64: @@ -12697,25 +13018,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, X86::AND64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMOR64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, X86::OR64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMXOR64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, X86::XOR64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass); + &X86::GR64RegClass); case X86::ATOMNAND64: return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, X86::AND64ri32, X86::MOV64rm, X86::LCMPXCHG64, X86::NOT64r, X86::RAX, - X86::GR64RegisterClass, true); + &X86::GR64RegClass, true); case X86::ATOMMIN64: return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); case X86::ATOMMAX64: @@ -12870,10 +13191,10 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, /// inserting the result into the low part of a new 256-bit vector static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) + for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || SVOp->getMaskElt(j) >= 0) return false; @@ -12886,10 +13207,10 @@ static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { /// inserting the result into the high part of a new 256-bit vector static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) + for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || SVOp->getMaskElt(j) >= 0) return false; @@ -12906,7 +13227,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && V2.getOpcode() == ISD::CONCAT_VECTORS) { @@ -12931,30 +13252,31 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, // To match the shuffle mask, the first half of the mask should // be exactly the first vector, and all the rest a splat with the // first element of the second one. - for (int i = 0; i < NumElems/2; ++i) + for (unsigned i = 0; i != NumElems/2; ++i) if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) return SDValue(); // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. 
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { - SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); - SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; - SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, - Ld->getMemoryVT(), - Ld->getPointerInfo(), - Ld->getAlignment(), - false/*isVolatile*/, true/*ReadMem*/, - false/*WriteMem*/); - return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); - } + if (Ld->hasNUsesOfValue(1, 0)) { + SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); + SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, + Ld->getMemoryVT(), + Ld->getPointerInfo(), + Ld->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); + } + } // Emit a zeroed vector and insert the desired subvector on its // first half. SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), - DAG.getConstant(0, MVT::i32), DAG, dl); + SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); return DCI.CombineTo(N, InsV); } @@ -12964,18 +13286,15 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> if (isShuffleHigh128VectorInsertLow(SVOp)) { - SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), - V, DAG.getConstant(0, MVT::i32), DAG, dl); + SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); + SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); return DCI.CombineTo(N, InsV); } // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> if (isShuffleLow128VectorInsertHigh(SVOp)) { - SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), - V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); + SDValue V = Extract128BitVector(V1, 0, DAG, dl); + SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); return DCI.CombineTo(N, InsV); } @@ -12995,12 +13314,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && + if (Subtarget->hasAVX() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); // Only handle 128 wide vector from here on. - if (VT.getSizeInBits() != 128) + if (!VT.is128BitVector()) return SDValue(); // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, @@ -13014,16 +13333,17 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, } -/// PerformTruncateCombine - Converts truncate operation to +/// DCI, PerformTruncateCombine - Converts truncate operation to /// a sequence of vector shuffle operations. 
/// It is possible when we truncate 256-bit vector to 128-bit vector -SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, +SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const { if (!DCI.isBeforeLegalizeOps()) return SDValue(); - if (!Subtarget->hasAVX()) return SDValue(); + if (!Subtarget->hasAVX()) + return SDValue(); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); @@ -13032,55 +13352,102 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) { + if (Subtarget->hasAVX2()) { + // AVX2: v4i64 -> v4i32 + + // VPERMD + static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op); + Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32), + ShufMask); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, + DAG.getIntPtrConstant(0)); + } + + // AVX: v4i64 -> v4i32 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, - DAG.getIntPtrConstant(2)); + DAG.getIntPtrConstant(2)); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); // PSHUFD - int ShufMask1[] = {0, 2, 0, 0}; + static const int ShufMask1[] = {0, 2, 0, 0}; - OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), - ShufMask1); - OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), - ShufMask1); + SDValue Undef = DAG.getUNDEF(VT); + OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1); + OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1); // MOVLHPS - int ShufMask2[] = {0, 1, 4, 5}; + static const int ShufMask2[] = {0, 1, 4, 5}; return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2); } + if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) { + if (Subtarget->hasAVX2()) { + // AVX2: v8i32 -> v8i16 + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op); + + // PSHUFB + SmallVector<SDValue,32> pshufbMask; + for (unsigned i = 0; i < 2; ++i) { + pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8)); + for (unsigned j = 0; j < 8; ++j) + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8, + &pshufbMask[0], 32); + Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV); + + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op); + + static const int ShufMask[] = {0, 2, -1, -1}; + Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64), + &ShufMask[0]); + + Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op, + DAG.getIntPtrConstant(0)); + + return DAG.getNode(ISD::BITCAST, dl, VT, Op); + } + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, - DAG.getIntPtrConstant(0)); + DAG.getIntPtrConstant(0)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op, - DAG.getIntPtrConstant(4)); + DAG.getIntPtrConstant(4)); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo); OpHi = 
DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi); // PSHUFB - int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, - -1, -1, -1, -1, -1, -1, -1, -1}; + static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1}; - OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, - DAG.getUNDEF(MVT::v16i8), - ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, - DAG.getUNDEF(MVT::v16i8), - ShufMask1); + SDValue Undef = DAG.getUNDEF(MVT::v16i8); + OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1); OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo); OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi); // MOVLHPS - int ShufMask2[] = {0, 1, 4, 5}; + static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2); return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res); @@ -13127,7 +13494,8 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle)) + if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, + UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. @@ -13276,8 +13644,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - - DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. @@ -13559,9 +13925,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // to simplify previous instructions. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && - !DCI.isBeforeLegalize() && - TLI.isOperationLegal(ISD::VSELECT, VT)) { + !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + + // Don't optimize vector selects that map to mask-registers. + if (BitWidth == 1) + return SDValue(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -13576,6 +13946,88 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Check whether a boolean test is testing a boolean value generated by +// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition +// code. +// +// Simplify the following patterns: +// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or +// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) +// to (Op EFLAGS Cond) +// +// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or +// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) +// to (Op EFLAGS !Cond) +// +// where Op could be BRCOND or CMOV. +// +static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { + // Quit if not CMP and SUB with its value result used. + if (Cmp.getOpcode() != X86ISD::CMP && + (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) + return SDValue(); + + // Quit if not used as a boolean value. + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + // Check CMP operands. One of them should be 0 or 1 and the other should be + // an SetCC or extended from it. 
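The PerformTruncateCombine hunks above lower a v4i64-to-v4i32 truncate entirely with shuffles: on AVX2 a single cross-lane VPERMD collects the even 32-bit elements, while on plain AVX the vector is split into two 128-bit halves that are compacted with PSHUFD and rejoined with a MOVLHPS-style shuffle (the v8i32-to-v8i16 case follows the same plan with PSHUFB). A minimal intrinsics sketch of the AVX2 path, assuming little-endian lane numbering; the function name and the use of intrinsics are mine, not part of the patch:

    #include <immintrin.h>

    // Truncate four 64-bit lanes to four 32-bit lanes: VPERMD gathers dwords
    // 0, 2, 4, 6 (the low halves of the qwords), then the low 128 bits are kept.
    static inline __m128i trunc_v4i64_to_v4i32(__m256i v) {
      const __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
      __m256i packed = _mm256_permutevar8x32_epi32(v, idx);
      return _mm256_castsi256_si128(packed);
    }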
+ SDValue Op1 = Cmp.getOperand(0); + SDValue Op2 = Cmp.getOperand(1); + + SDValue SetCC; + const ConstantSDNode* C = 0; + bool needOppositeCond = (CC == X86::COND_E); + + if ((C = dyn_cast<ConstantSDNode>(Op1))) + SetCC = Op2; + else if ((C = dyn_cast<ConstantSDNode>(Op2))) + SetCC = Op1; + else // Quit if all operands are not constants. + return SDValue(); + + if (C->getZExtValue() == 1) + needOppositeCond = !needOppositeCond; + else if (C->getZExtValue() != 0) + // Quit if the constant is neither 0 or 1. + return SDValue(); + + // Skip 'zext' node. + if (SetCC.getOpcode() == ISD::ZERO_EXTEND) + SetCC = SetCC.getOperand(0); + + // Quit if not SETCC. + // FIXME: So far we only handle the boolean value generated from SETCC. If + // there is other ways to generate boolean values, we need handle them here + // as well. + if (SetCC.getOpcode() != X86ISD::SETCC) + return SDValue(); + + // Set the condition code or opposite one if necessary. + CC = X86::CondCode(SetCC.getConstantOperandVal(0)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + + return SetCC.getOperand(1); +} + +static bool IsValidFCMOVCondition(X86::CondCode CC) { + switch (CC) { + default: + return false; + case X86::COND_B: + case X86::COND_BE: + case X86::COND_E: + case X86::COND_P: + case X86::COND_AE: + case X86::COND_A: + case X86::COND_NE: + case X86::COND_NP: + return true; + } +} + /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { @@ -13589,6 +14041,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, SDValue TrueOp = N->getOperand(1); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); + if (CC == X86::COND_E || CC == X86::COND_NE) { switch (Cond.getOpcode()) { default: break; @@ -13600,6 +14053,18 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, } } + SDValue Flags; + + Flags = BoolTestSetCCCombine(Cond, CC); + if (Flags.getNode() && + // Extra check as FCMOV only supports a subset of X86 cond. + (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) { + SDValue Ops[] = { FalseOp, TrueOp, + DAG.getConstant(CC, MVT::i8), Flags }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), + Ops, array_lengthof(Ops)); + } + // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. @@ -14022,7 +14487,7 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) { // Sometimes the operand may come from a insert_subvector building a 256-bit // allones vector - if (VT.getSizeInBits() == 256 && + if (VT.is256BitVector() && N->getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); @@ -14260,6 +14725,41 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Generate NEG and CMOV for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Since X86 does not have CMOV for 8-bit integer, we don't convert + // 8-bit integer abs to NEG and CMOV. + if (VT.isInteger() && VT.getSizeInBits() == 8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CMOV. 
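BoolTestSetCCCombine above strips a redundant compare from a boolean that X86ISD::SETCC already produced: comparing that 0/1 value against 0 or 1 with EQ or NE is equivalent to using the original EFLAGS directly, inverting the condition where needed, and the helper is then reused by the CMOV, SETCC and BRCOND combines below. A contrived source-level example of my own that tends to produce this pattern (the assembly in the comments is approximate):

    int clamp_positive(int x) {
      bool positive = (x > 0);   // test/cmp followed by SETG into a byte register
      if (positive)              // naive lowering re-tests that byte and branches;
        return x;                // with the combine the branch reuses the flags of
      return 0;                  // the original comparison (a single JG).
    }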
+ if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && + N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && + N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { + // Generate SUB & CMOV. + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), + DAG.getConstant(0, VT), N0.getOperand(0)); + + SDValue Ops[] = { N0.getOperand(0), Neg, + DAG.getConstant(X86::COND_GE, MVT::i8), + SDValue(Neg.getNode(), 1) }; + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), + Ops, array_lengthof(Ops)); + } + return SDValue(); +} + // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -14267,6 +14767,16 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (Subtarget->hasCMov()) { + SDValue RV = performIntegerAbsCombine(N, DAG); + if (RV.getNode()) + return RV; + } + + // Try forming BMI if it is available. + if (!Subtarget->hasBMI()) + return SDValue(); + EVT VT = N->getValueType(0); if (VT != MVT::i32 && VT != MVT::i64) @@ -14292,7 +14802,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { LoadSDNode *Ld = cast<LoadSDNode>(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); @@ -14314,63 +14825,94 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, unsigned RegSz = RegVT.getSizeInBits(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); - // All sizes must be a power of two - if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue(); - // Attempt to load the original value using a single load op. - // Find a scalar type which is equal to the loaded word size. + // All sizes must be a power of two. + if (!isPowerOf2_32(RegSz * MemSz * NumElems)) + return SDValue(); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; - break; } } - // Proceed if a load word is found. - if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue(); + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, RegSz/SclrLoadTy.getSizeInBits()); + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. 
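performIntegerAbsCombine recognizes the classic branchless absolute-value idiom, XOR(ADD(X, SRA(X, width-1)), SRA(X, width-1)), and rewrites it to a negate plus CMOV for every integer width except i8, which has no CMOV form. The matched idiom looks like this in C++; the example is mine, assumes arithmetic right shift for signed int, and, like the idiom itself, leaves INT_MIN unchanged:

    // m is 0 when x >= 0 and all-ones when x < 0, so
    // (x + m) ^ m yields x for non-negative x and -x otherwise.
    int abs_branchless(int x) {
      int m = x >> 31;      // SRA(X, size-1)
      return (x + m) ^ m;   // XOR(ADD(X, m), m), the pattern matched above
    }
    // After the combine the backend can emit roughly a SUB from zero followed
    // by a conditional move on the resulting flags instead of the shift/add/xor.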
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), RegSz/MemVT.getScalarType().getSizeInBits()); - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); - // Perform a single load. - SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), - Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->isInvariant(), - Ld->getAlignment()); + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); - // Insert the word loaded into a vector. - SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - LoadUnitVecVT, ScalarLoad); + // We can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + for (unsigned i = 0; i < NumLoads; ++i) { + // Perform a single load. + SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), + Ptr, Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], + Chains.size()); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, - ScalarInVector); + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); unsigned SizeRatio = RegSz/MemSz; // Redistribute the loaded elements into the different locations. SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i; + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i*SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(SlicedVec.getValueType()), - ShuffleVec.data()); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); // Bitcast to the requested type. Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); // Replace the original load with the new sequence // and return the new chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff); - return SDValue(ScalarLoad.getNode(), 1); + return DCI.CombineTo(N, Shuff, TF, true); } return SDValue(); @@ -14387,13 +14929,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we are saving a concatenation of two XMM registers, perform two stores. - // This is better in Sandy Bridge cause one 256-bit mem op is done via two - // 128-bit ones. If in the future the cost becomes only one memory access the - // first version would be better. - if (VT.getSizeInBits() == 256 && - StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && - StoredVal.getNumOperands() == 2) { - + // On Sandy Bridge, 256-bit memory operations are executed by two + // 128-bit ports. 
However, on Haswell it is better to issue a single 256-bit + // memory operation. + if (VT.is256BitVector() && !Subtarget->hasAVX2() && + StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && + StoredVal.getNumOperands() == 2) { SDValue Value0 = StoredVal.getOperand(0); SDValue Value1 = StoredVal.getOperand(1); @@ -14438,14 +14979,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio; + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; - // Can't shuffle using an illegal type - if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVec.getValueType()), - ShuffleVec.data()); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. @@ -14454,13 +14997,18 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && + (64 <= NumElems * ToSz)) + StoreType = MVT::f64; + // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); SmallVector<SDValue, 8> Chains; @@ -14469,7 +15017,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. 
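The PerformSTORECombine change above splits a 256-bit store whose value is a CONCAT_VECTORS of two 128-bit halves into two 128-bit stores, but only when AVX2 is absent: Sandy Bridge issues a 256-bit memory operation as two 128-bit halves anyway, while Haswell-class parts prefer the single wide store. In intrinsic form the two shapes look like this (illustrative only; the names are mine):

    #include <immintrin.h>

    // What the combine emits on AVX(1)-only targets when both halves are in hand.
    void store_as_two_halves(__m128i lo, __m128i hi, __m128i *dst) {
      _mm_storeu_si128(dst, lo);
      _mm_storeu_si128(dst + 1, hi);
    }

    // The single wide store preferred once hasAVX2() is true.
    void store_as_one(__m256i v, __m256i *dst) {
      _mm256_storeu_si256(dst, v);
    }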
- for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { + for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i)); @@ -14818,18 +15366,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) return SDValue(); - if (!Subtarget->hasAVX()) + if (!Subtarget->hasAVX()) return SDValue(); - // Optimize vectors in AVX mode - // Sign extend v8i16 to v8i32 and - // v4i32 to v4i64 - // - // Divide input vector into two parts - // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} - // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 - // concat the vectors to original VT - EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); EVT OpVT = Op.getValueType(); @@ -14838,23 +15377,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + unsigned NumElems = OpVT.getVectorNumElements(); + SDValue Undef = DAG.getUNDEF(OpVT); + SmallVector<int,8> ShufMask1(NumElems, -1); - for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i; + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask1[i] = i; - SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), - ShufMask1.data()); + SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]); SmallVector<int,8> ShufMask2(NumElems, -1); - for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2; + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask2[i] = i + NumElems/2; - SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), - ShufMask2.data()); + SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), VT.getVectorNumElements()/2); - OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); + OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); @@ -14862,7 +15415,42 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + EVT ScalarVT = VT.getScalarType(); + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA()) + return SDValue(); + + SDValue A = N->getOperand(0); + SDValue B = N->getOperand(1); + SDValue C = N->getOperand(2); + + bool NegA = (A.getOpcode() == ISD::FNEG); + bool NegB = (B.getOpcode() == ISD::FNEG); + bool NegC = (C.getOpcode() == ISD::FNEG); + + // Negative multiplication when NegA xor NegB + bool NegMul = (NegA != NegB); + if (NegA) + A = A.getOperand(0); + if (NegB) + B = B.getOperand(0); + if (NegC) + C = C.getOperand(0); + + unsigned Opcode; + if (!NegMul) + Opcode = 
(!NegC)? X86ISD::FMADD : X86ISD::FMSUB; + else + Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB; + return DAG.getNode(Opcode, dl, VT, A, B, C); +} + static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) @@ -14887,6 +15475,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, VT)); } + // Optimize vectors in AVX mode: // // v8i16 -> v8i32 @@ -14899,50 +15488,139 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // - if (Subtarget->hasAVX()) { + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (!Subtarget->hasAVX()) + return SDValue(); - if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || + if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) || ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) { - SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); - SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG); - SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG); + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0); - EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements()/2); + SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl); + SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec); + SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec); - OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); - OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements()/2); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); - } + OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); + OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } + return SDValue(); +} + +// Optimize x == -y --> x+y == 0 +// x != -y --> x+y != 0 +static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) + if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + LHS.getValueType(), RHS, LHS.getOperand(1)); + return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + addV, DAG.getConstant(0, addV.getValueType()), CC); + } + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) + if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(), + RHS.getValueType(), LHS, RHS.getOperand(1)); + return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0), + addV, DAG.getConstant(0, addV.getValueType()), CC); + } return SDValue(); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { - unsigned X86CC = N->getConstantOperandVal(0); - SDValue EFLAG = N->getOperand(1); DebugLoc DL = N->getDebugLoc(); + X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); + 
SDValue EFLAGS = N->getOperand(1); // Materialize "setb reg" as "sbb reg,reg", since it can be extended without // a zext and produces an all-ones bit which is more useful than 0/1 in some // cases. - if (X86CC == X86::COND_B) + if (CC == X86::COND_B) return DAG.getNode(ISD::AND, DL, MVT::i8, DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), EFLAG), + DAG.getConstant(CC, MVT::i8), EFLAGS), DAG.getConstant(1, MVT::i8)); + SDValue Flags; + + Flags = BoolTestSetCCCombine(EFLAGS, CC); + if (Flags.getNode()) { + SDValue Cond = DAG.getConstant(CC, MVT::i8); + return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); + } + + return SDValue(); +} + +// Optimize branch condition evaluation. +// +static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + DebugLoc DL = N->getDebugLoc(); + SDValue Chain = N->getOperand(0); + SDValue Dest = N->getOperand(1); + SDValue EFLAGS = N->getOperand(3); + X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); + + SDValue Flags; + + Flags = BoolTestSetCCCombine(EFLAGS, CC); + if (Flags.getNode()) { + SDValue Cond = DAG.getConstant(CC, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, + Flags); + } + + return SDValue(); +} + +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) { + SDValue Op0 = N->getOperand(0); + EVT InVT = Op0->getValueType(0); + + // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) + if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + // Notice that we use SINT_TO_FP because we know that the high bits + // are zero and SINT_TO_FP is better supported by the hardware. + return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + } + return SDValue(); } static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { SDValue Op0 = N->getOperand(0); + EVT InVT = Op0->getValueType(0); + + // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) + if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; + SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + } + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (Op0.getOpcode() == ISD::LOAD) { @@ -14961,6 +15639,20 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() + if (VT == MVT::v8i8 || VT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = VT == MVT::v4i8 ? 
MVT::v4i32 : MVT::v8i32; + SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, I); + } + + return SDValue(); +} + // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -15095,9 +15787,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); - case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); + case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); + case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -15105,10 +15799,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); - case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget); + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); + case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); + case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: case X86ISD::UNPCKH: @@ -15123,6 +15820,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERMILP: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); + case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); } return SDValue(); @@ -15652,55 +16350,55 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // in the normal allocation? case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
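The new UINT_TO_FP, SINT_TO_FP and FP_TO_SINT combines above handle v4i8/v8i8 by going through 32-bit elements: the unsigned case zero-extends and then deliberately uses a signed conversion, since a zero-extended byte always fits the signed 32-bit range and the signed form (CVTDQ2PS and friends) is the one the hardware supports well, while the FP_TO_SINT case converts to 32-bit integers and truncates. The same reasoning in scalar form (my own illustration, assuming the float is in range for the narrowing case):

    // u8 -> float via a signed conversion after zero-extension; the value is in
    // [0, 255], comfortably inside the signed 32-bit range.
    float u8_to_float(unsigned char b) {
      int widened = static_cast<int>(b);     // ZERO_EXTEND to i32
      return static_cast<float>(widened);    // SINT_TO_FP
    }

    // float -> i8 through a 32-bit conversion followed by a truncate, mirroring
    // the FP_TO_SINT combine.
    signed char float_to_i8(float f) {
      int wide = static_cast<int>(f);          // FP_TO_SINT at i32 width
      return static_cast<signed char>(wide);   // TRUNCATE to i8
    }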
if (Subtarget->is64Bit()) { - if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, X86::GR32RegisterClass); - else if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16RegisterClass); - else if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8RegisterClass); - else if (VT == MVT::i64 || VT == MVT::f64) - return std::make_pair(0U, X86::GR64RegisterClass); - break; + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i64 || VT == MVT::f64) + return std::make_pair(0U, &X86::GR64RegClass); + break; } // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, X86::GR32_ABCDRegisterClass); - else if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16_ABCDRegisterClass); - else if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass); - else if (VT == MVT::i64) - return std::make_pair(0U, X86::GR64_ABCDRegisterClass); + return std::make_pair(0U, &X86::GR32_ABCDRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_ABCDRegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); + if (VT == MVT::i64) + return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8RegisterClass); + return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16RegisterClass); + return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) - return std::make_pair(0U, X86::GR32RegisterClass); - return std::make_pair(0U, X86::GR64RegisterClass); + return std::make_pair(0U, &X86::GR32RegClass); + return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) - return std::make_pair(0U, X86::GR8_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) - return std::make_pair(0U, X86::GR16_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget->is64Bit()) - return std::make_pair(0U, X86::GR32_NOREXRegisterClass); - return std::make_pair(0U, X86::GR64_NOREXRegisterClass); + return std::make_pair(0U, &X86::GR32_NOREXRegClass); + return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) - return std::make_pair(0U, X86::RFP32RegisterClass); + return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) - return std::make_pair(0U, X86::RFP64RegisterClass); - return std::make_pair(0U, X86::RFP80RegisterClass); + return std::make_pair(0U, &X86::RFP64RegClass); + return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget->hasMMX()) break; - return std::make_pair(0U, X86::VR64RegisterClass); + return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget->hasSSE2()) break; // FALL THROUGH. 
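The getRegForInlineAsmConstraint hunk above, besides switching to pointer-returning RegClass accessors, is the table that maps GCC-style constraint letters to register classes by value type: 'q' means the byte-addressable a/b/c/d registers in 32-bit mode (any GPR in 64-bit mode), 'r' and 'l' any general-purpose register, 'R' the legacy non-REX registers, 'f' the x87 stack, 'y' MMX, and 'Y' SSE registers when SSE2 is present. A small user-level example of those constraints in GCC/Clang extended asm; the function and its body are my own illustration:

    // "+q" keeps 'a' in a byte-addressable GPR (eax/ebx/ecx/edx on 32-bit x86),
    // the class the GR*_ABCD entries above describe; "r" allows any GPR for 'b'.
    static inline unsigned add_via_asm(unsigned a, unsigned b) {
      __asm__("addl %1, %0" : "+q"(a) : "r"(b));
      return a;
    }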
@@ -15712,10 +16410,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // Scalar SSE types. case MVT::f32: case MVT::i32: - return std::make_pair(0U, X86::FR32RegisterClass); + return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: - return std::make_pair(0U, X86::FR64RegisterClass); + return std::make_pair(0U, &X86::FR64RegClass); // Vector types. case MVT::v16i8: case MVT::v8i16: @@ -15723,7 +16421,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: - return std::make_pair(0U, X86::VR128RegisterClass); + return std::make_pair(0U, &X86::VR128RegClass); // AVX types. case MVT::v32i8: case MVT::v16i16: @@ -15731,8 +16429,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: - return std::make_pair(0U, X86::VR256RegisterClass); - + return std::make_pair(0U, &X86::VR256RegClass); } break; } @@ -15755,28 +16452,28 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Constraint[6] == '}') { Res.first = X86::ST0+Constraint[4]-'0'; - Res.second = X86::RFP80RegisterClass; + Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { Res.first = X86::ST0; - Res.second = X86::RFP80RegisterClass; + Res.second = &X86::RFP80RegClass; return Res; } // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) { Res.first = X86::EFLAGS; - Res.second = X86::CCRRegisterClass; + Res.second = &X86::CCRRegClass; return Res; } // 'A' means EAX + EDX. if (Constraint == "A") { Res.first = X86::EAX; - Res.second = X86::GR32_ADRegisterClass; + Res.second = &X86::GR32_ADRegClass; return Res; } return Res; @@ -15792,7 +16489,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we // really want an 8-bit or 32-bit register, map to the appropriate register // class and return the appropriate register. - if (Res.second == X86::GR16RegisterClass) { + if (Res.second == &X86::GR16RegClass) { if (VT == MVT::i8) { unsigned DestReg = 0; switch (Res.first) { @@ -15804,7 +16501,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR8RegisterClass; + Res.second = &X86::GR8RegClass; } } else if (VT == MVT::i32) { unsigned DestReg = 0; @@ -15821,7 +16518,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR32RegisterClass; + Res.second = &X86::GR32RegClass; } } else if (VT == MVT::i64) { unsigned DestReg = 0; @@ -15838,22 +16535,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, } if (DestReg) { Res.first = DestReg; - Res.second = X86::GR64RegisterClass; + Res.second = &X86::GR64RegClass; } } - } else if (Res.second == X86::FR32RegisterClass || - Res.second == X86::FR64RegisterClass || - Res.second == X86::VR128RegisterClass) { + } else if (Res.second == &X86::FR32RegClass || + Res.second == &X86::FR64RegClass || + Res.second == &X86::VR128RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. 
This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. - if (VT == MVT::f32) - Res.second = X86::FR32RegisterClass; - else if (VT == MVT::f64) - Res.second = X86::FR64RegisterClass; - else if (X86::VR128RegisterClass->hasType(VT)) - Res.second = X86::VR128RegisterClass; + + if (VT == MVT::f32 || VT == MVT::i32) + Res.second = &X86::FR32RegClass; + else if (VT == MVT::f64 || VT == MVT::i64) + Res.second = &X86::FR64RegClass; + else if (X86::VR128RegClass.hasType(VT)) + Res.second = &X86::VR128RegClass; + else if (X86::VR256RegClass.hasType(VT)) + Res.second = &X86::VR256RegClass; } return Res;
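This last hunk widens the re-typing that happens when a constraint names a physical SSE register outright (an "{xmm0}"-style constraint at the IR level): the generic matcher just picks the first class containing that register, so the code re-selects FR32, FR64, VR128, and now also the integer-scalar and VR256 cases, from the actual value type. The closest C-level analogue is the "x" constraint, which places a value in some SSE register and lets the type decide the class; the example below is mine and only an analogue, since explicit per-register constraints are normally generated by the front end rather than written by hand:

    #include <immintrin.h>

    // A float output and a __m128 input both live in XMM registers, but they
    // correspond to different register classes (FR32 vs. VR128) in the backend.
    static inline float first_lane(__m128 v) {
      float r;
      __asm__("movss %1, %0" : "=x"(r) : "x"(v));
      return r;
    }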