| field | value |
|---|---|
| author | dim <dim@FreeBSD.org>, 2012-04-14 13:54:10 +0000 |
| committer | dim <dim@FreeBSD.org>, 2012-04-14 13:54:10 +0000 |
| commit | 1fc08f5e9ef733ef1ce6f363fecedc2260e78974 |
| tree | 19c69a04768629f2d440944b71cbe90adae0b615 /lib/Target/X86/X86ISelLowering.cpp |
| parent | 07637c87f826cdf411f0673595e9bc92ebd793f2 |
Vendor import of llvm trunk r154661:
http://llvm.org/svn/llvm-project/llvm/trunk@r154661
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')

| | file | lines changed |
|---|---|---|
| -rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 5371 |

1 file changed, 3109 insertions(+), 2262 deletions(-)
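Most of the mechanical churn in the diff below comes from two API changes in this import: the X86Subtarget feature predicates hasXMM()/hasXMMInt() were renamed to hasSSE1()/hasSSE2(), and codegen flags such as UseSoftFloat, UnsafeFPMath, and GuaranteedTailCallOpt moved from global variables into TargetOptions, reachable through the TargetMachine. A minimal sketch of the new-style queries, assuming the r154661-era headers (the helper function itself is hypothetical; the calls it makes are the ones this diff introduces):

```cpp
#include "X86Subtarget.h"
#include "X86TargetMachine.h"

// Hypothetical helper: decide whether f64 can be handled in SSE registers.
// Post-import style: feature tests via hasSSE2() (formerly hasXMMInt()),
// and -soft-float via TM.Options.UseSoftFloat (formerly a global flag).
static bool useSSE2ForF64(const llvm::X86TargetMachine &TM) {
  const llvm::X86Subtarget &ST = TM.getSubtarget<llvm::X86Subtarget>();
  if (TM.Options.UseSoftFloat)
    return false;
  return ST.hasSSE2();
}
```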
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7c8ce17..9b83aad 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13,9 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "x86-isel"
+#include "X86ISelLowering.h"
 #include "X86.h"
 #include "X86InstrBuilder.h"
-#include "X86ISelLowering.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "Utils/X86ShuffleDecode.h"
@@ -35,25 +35,21 @@
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/VariadicFunction.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
+#include <bitset>
 
 using namespace llvm;
-using namespace dwarf;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
@@ -61,17 +57,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
 
-static SDValue Insert128BitVector(SDValue Result,
-                                  SDValue Vec,
-                                  SDValue Idx,
-                                  SelectionDAG &DAG,
-                                  DebugLoc dl);
-
-static SDValue Extract128BitVector(SDValue Vec,
-                                   SDValue Idx,
-                                   SelectionDAG &DAG,
-                                   DebugLoc dl);
-
 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
 /// sets things up to match to an AVX VEXTRACTF128 instruction or a
 /// simple subregister reference. Idx is an index in the 128 bits we
@@ -169,8 +154,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   : TargetLowering(TM, createTLOF(TM)) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
-  X86ScalarSSEf64 = Subtarget->hasXMMInt();
-  X86ScalarSSEf32 = Subtarget->hasXMM();
+  X86ScalarSSEf64 = Subtarget->hasSSE2();
+  X86ScalarSSEf32 = Subtarget->hasSSE1();
   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
 
   RegInfo = TM.getRegisterInfo();
@@ -186,8 +171,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // For 64-bit since we have so many registers use the ILP scheduler, for
   // 32-bit code use the register pressure specific scheduling.
+  // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling.
   if (Subtarget->is64Bit())
     setSchedulingPreference(Sched::ILP);
+  else if (Subtarget->isAtom())
+    setSchedulingPreference(Sched::Hybrid);
   else
     setSchedulingPreference(Sched::RegPressure);
   setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -199,15 +187,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setLibcallName(RTLIB::SREM_I64, "_allrem");
     setLibcallName(RTLIB::UREM_I64, "_aullrem");
     setLibcallName(RTLIB::MUL_I64, "_allmul");
-    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
-    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
+
+    // The _ftol2 runtime function has an unusual calling conv, which
+    // is modeled by a special pseudo-instruction.
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
   }
 
   if (Subtarget->isTargetDarwin()) {
@@ -256,8 +247,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
-    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
-  } else if (!UseSoftFloat) {
+    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+  } else if (!TM.Options.UseSoftFloat) {
     // We have an algorithm for SSE2->double, and we turn this into a
     // 64-bit FILD followed by conditional FADD for other targets.
     setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
@@ -271,7 +262,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
   setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
 
-  if (!UseSoftFloat) {
+  if (!TM.Options.UseSoftFloat) {
     // SSE has no i16 to fp conversion, only i32
     if (X86ScalarSSEf32) {
       setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
@@ -314,7 +305,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
     setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
-  } else if (!UseSoftFloat) {
+  } else if (!TM.Options.UseSoftFloat) {
     // Since AVX is a superset of SSE3, only check for SSE here.
     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
       // Expand FP_TO_UINT into a select.
@@ -327,6 +318,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
   }
 
+  if (isTargetFTOL()) {
+    // Use the _ftol2 runtime function, which has a pseudo-instruction
+    // to handle its weird calling convention.
+    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+  }
+
   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
   if (!X86ScalarSSEf64) {
     setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
@@ -379,10 +376,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::FREM , MVT::f80 , Expand);
   setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
 
+  // Promote the i8 variants and force them on up to i32 which has a shorter
+  // encoding.
+  setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+  AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
+  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
   if (Subtarget->hasBMI()) {
-    setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
   } else {
-    setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
     setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
     setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
     if (Subtarget->is64Bit())
@@ -390,13 +395,27 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   }
 
   if (Subtarget->hasLZCNT()) {
+    // When promoting the i8 variants, force them to i32 for a shorter
+    // encoding.
     setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+    AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
+    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
   } else {
     setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
     setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
     setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
-    if (Subtarget->is64Bit())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
+    if (Subtarget->is64Bit()) {
       setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+    }
   }
 
   if (Subtarget->hasPOPCNT()) {
@@ -459,7 +478,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
   }
 
-  if (Subtarget->hasXMM())
+  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
 
   setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
@@ -538,14 +557,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Custom);
-  else if (EnableSegmentedStacks)
+  else if (TM.Options.EnableSegmentedStacks)
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Custom);
   else
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Expand);
 
-  if (!UseSoftFloat && X86ScalarSSEf64) {
+  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
     // f32 and f64 use SSE.
     // Set up the FP register classes.
     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
@@ -577,7 +596,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // cases we handle.
     addLegalFPImmediate(APFloat(+0.0)); // xorpd
     addLegalFPImmediate(APFloat(+0.0f)); // xorps
-  } else if (!UseSoftFloat && X86ScalarSSEf32) {
+  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
     // Use SSE for f32, x87 for f64.
     // Set up the FP register classes.
     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
@@ -606,11 +625,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 
-    if (!UnsafeFPMath) {
+    if (!TM.Options.UnsafeFPMath) {
       setOperationAction(ISD::FSIN , MVT::f64 , Expand);
       setOperationAction(ISD::FCOS , MVT::f64 , Expand);
     }
-  } else if (!UseSoftFloat) {
+  } else if (!TM.Options.UseSoftFloat) {
     // f32 and f64 in x87.
     // Set up the FP register classes.
     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
@@ -621,7 +640,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 
-    if (!UnsafeFPMath) {
+    if (!TM.Options.UnsafeFPMath) {
       setOperationAction(ISD::FSIN , MVT::f64 , Expand);
       setOperationAction(ISD::FCOS , MVT::f64 , Expand);
     }
@@ -640,7 +659,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::FMA, MVT::f32, Expand);
 
   // Long double always uses X87.
-  if (!UseSoftFloat) {
+  if (!TM.Options.UseSoftFloat) {
     addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
     setOperationAction(ISD::UNDEF, MVT::f80, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -659,11 +678,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
       addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
     }
 
-    if (!UnsafeFPMath) {
+    if (!TM.Options.UnsafeFPMath) {
       setOperationAction(ISD::FSIN , MVT::f80 , Expand);
       setOperationAction(ISD::FCOS , MVT::f80 , Expand);
     }
 
+    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+    setOperationAction(ISD::FRINT, MVT::f80, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
     setOperationAction(ISD::FMA, MVT::f80, Expand);
   }
 
@@ -715,7 +739,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
@@ -749,7 +775,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
   // with -msoft-float, disable use of MMX as well.
-  if (!UseSoftFloat && Subtarget->hasMMX()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
     addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
     // No operations on x86mmx supported, everything uses intrinsics.
   }
 
@@ -786,7 +812,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
   setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
 
-  if (!UseSoftFloat && Subtarget->hasXMM()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
     addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
 
     setOperationAction(ISD::FADD, MVT::v4f32, Legal);
@@ -803,7 +829,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
   }
 
-  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
     addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
 
     // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
@@ -909,7 +935,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
       setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
   }
 
-  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
+  if (Subtarget->hasSSE41()) {
     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
@@ -924,10 +950,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL, MVT::v4i32, Legal);
 
-    // Can turn SHL into an integer multiply.
-    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
-    setOperationAction(ISD::SHL, MVT::v16i8, Custom);
-
     setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
     setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
     setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
@@ -948,30 +970,47 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 
+    // FIXME: these should be Legal but thats only for the case where
+    // the index is constant. For now custom expand to deal with that.
     if (Subtarget->is64Bit()) {
-      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
     }
   }
 
-  if (Subtarget->hasXMMInt()) {
-    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
-    setOperationAction(ISD::SRL, MVT::v4i32, Custom);
-    setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+  if (Subtarget->hasSSE2()) {
     setOperationAction(ISD::SRL, MVT::v8i16, Custom);
+    setOperationAction(ISD::SRL, MVT::v16i8, Custom);
 
-    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
-    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
     setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+    setOperationAction(ISD::SHL, MVT::v16i8, Custom);
 
-    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
     setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+    setOperationAction(ISD::SRA, MVT::v16i8, Custom);
+
+    if (Subtarget->hasAVX2()) {
+      setOperationAction(ISD::SRL, MVT::v2i64, Legal);
+      setOperationAction(ISD::SRL, MVT::v4i32, Legal);
+
+      setOperationAction(ISD::SHL, MVT::v2i64, Legal);
+      setOperationAction(ISD::SHL, MVT::v4i32, Legal);
+
+      setOperationAction(ISD::SRA, MVT::v4i32, Legal);
+    } else {
+      setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+      setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+
+      setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+      setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+
+      setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+    }
   }
 
-  if (Subtarget->hasSSE42() || Subtarget->hasAVX())
+  if (Subtarget->hasSSE42())
     setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
 
-  if (!UseSoftFloat && Subtarget->hasAVX()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
     addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
     addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
     addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
@@ -1008,18 +1047,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
 
-    setOperationAction(ISD::SRL, MVT::v4i64, Custom);
-    setOperationAction(ISD::SRL, MVT::v8i32, Custom);
     setOperationAction(ISD::SRL, MVT::v16i16, Custom);
     setOperationAction(ISD::SRL, MVT::v32i8, Custom);
 
-    setOperationAction(ISD::SHL, MVT::v4i64, Custom);
-    setOperationAction(ISD::SHL, MVT::v8i32, Custom);
     setOperationAction(ISD::SHL, MVT::v16i16, Custom);
     setOperationAction(ISD::SHL, MVT::v32i8, Custom);
 
-    setOperationAction(ISD::SRA, MVT::v8i32, Custom);
     setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+    setOperationAction(ISD::SRA, MVT::v32i8, Custom);
 
     setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
     setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
@@ -1030,25 +1065,60 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
 
-    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+
+    if (Subtarget->hasAVX2()) {
+      setOperationAction(ISD::ADD, MVT::v4i64, Legal);
+      setOperationAction(ISD::ADD, MVT::v8i32, Legal);
+      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
+      setOperationAction(ISD::ADD, MVT::v32i8, Legal);
+
+      setOperationAction(ISD::SUB, MVT::v4i64, Legal);
+      setOperationAction(ISD::SUB, MVT::v8i32, Legal);
+      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
+      setOperationAction(ISD::SUB, MVT::v32i8, Legal);
+
+      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+      setOperationAction(ISD::MUL, MVT::v8i32, Legal);
+      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
+      // Don't lower v32i8 because there is no 128-bit byte mul
+
+      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+      setOperationAction(ISD::SRL, MVT::v4i64, Legal);
+      setOperationAction(ISD::SRL, MVT::v8i32, Legal);
+
+      setOperationAction(ISD::SHL, MVT::v4i64, Legal);
+      setOperationAction(ISD::SHL, MVT::v8i32, Legal);
+
+      setOperationAction(ISD::SRA, MVT::v8i32, Legal);
+    } else {
+      setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+      setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+      setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+      setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+      setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+      setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+      setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+      setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+      setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+      setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+      // Don't lower v32i8 because there is no 128-bit byte mul
 
-    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
-    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
-    setOperationAction(ISD::ADD, MVT::v16i16, Custom);
-    setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+      setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+      setOperationAction(ISD::SRL, MVT::v8i32, Custom);
 
-    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
-    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
-    setOperationAction(ISD::SUB, MVT::v16i16, Custom);
-    setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+      setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+      setOperationAction(ISD::SHL, MVT::v8i32, Custom);
 
-    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
-    setOperationAction(ISD::MUL, MVT::v8i32, Custom);
-    setOperationAction(ISD::MUL, MVT::v16i16, Custom);
-    // Don't lower v32i8 because there is no 128-bit byte mul
+      setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+    }
 
     // Custom lower several nodes for 256-bit types.
     for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
@@ -1099,7 +1169,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   // of this type with custom code.
   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
        VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
-    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+                       Custom);
   }
 
   // We want to custom lower some of our intrinsics.
@@ -1137,7 +1208,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
-  setTargetDAGCombine(ISD::BUILD_VECTOR);
   setTargetDAGCombine(ISD::VSELECT);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SHL);
@@ -1152,9 +1222,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
+  if (Subtarget->hasBMI())
+    setTargetDAGCombine(ISD::XOR);
 
   computeRegisterProperties();
 
@@ -1166,10 +1240,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
-  setPrefLoopAlignment(16);
+  setPrefLoopAlignment(4); // 2^4 bytes.
   benefitFromCodePlacementOpt = true;
 
-  setPrefFunctionAlignment(4);
+  setPrefFunctionAlignment(4); // 2^4 bytes.
 }
 
@@ -1219,7 +1293,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   }
 
   unsigned Align = 4;
-  if (Subtarget->hasXMM())
+  if (Subtarget->hasSSE1())
     getMaxByValAlign(Ty, Align);
   return Align;
 }
@@ -1230,7 +1304,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
 /// means there isn't a need to check it against alignment requirement,
 /// probably because the source does not need to be loaded. If
-/// 'NonScalarIntSafe' is true, that means it's safe to return a
+/// 'IsZeroVal' is true, that means it's safe to return a
 /// non-scalar-integer type, e.g. empty string source, constant, or loaded
 /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
 /// constant so it does not need to be loaded.
@@ -1239,31 +1313,34 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
 EVT
 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                        unsigned DstAlign, unsigned SrcAlign,
-                                       bool NonScalarIntSafe,
+                                       bool IsZeroVal,
                                        bool MemcpyStrSrc,
                                        MachineFunction &MF) const {
   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
   // linux. This is because the stack realignment code can't handle certain
   // cases like PR2962. This should be removed when PR2962 is fixed.
   const Function *F = MF.getFunction();
-  if (NonScalarIntSafe &&
+  if (IsZeroVal &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
-      if (Subtarget->hasAVX() &&
-          Subtarget->getStackAlignment() >= 32)
-        return MVT::v8f32;
-      if (Subtarget->hasXMMInt())
+      if (Subtarget->getStackAlignment() >= 32) {
+        if (Subtarget->hasAVX2())
+          return MVT::v8i32;
+        if (Subtarget->hasAVX())
+          return MVT::v8f32;
+      }
+      if (Subtarget->hasSSE2())
        return MVT::v4i32;
-      if (Subtarget->hasXMM())
+      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
-              Subtarget->hasXMMInt()) {
+              Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
@@ -1428,14 +1505,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
     // or SSE or MMX vectors.
     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
-        (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
+        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
     // Likewise we can't return F64 values with SSE1 only. gcc does so, but
     // llvm-gcc has never done it right and no one has noticed, so this
     // should be OK for now.
     if (ValVT == MVT::f64 &&
-        (Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
+        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
       report_fatal_error("SSE2 register return with SSE2 disabled");
 
     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
@@ -1461,7 +1538,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                                   ValToCopy);
           // If we don't have SSE2 available, convert to v4f32 so the generated
           // register is legal.
-          if (!Subtarget->hasXMMInt())
+          if (!Subtarget->hasSSE2())
             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
         }
       }
@@ -1501,15 +1578,21 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                      MVT::Other, &RetOps[0], RetOps.size());
 }
 
-bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   if (N->getNumValues() != 1)
     return false;
   if (!N->hasNUsesOfValue(1, 0))
     return false;
 
+  SDValue TCChain = Chain;
   SDNode *Copy = *N->use_begin();
-  if (Copy->getOpcode() != ISD::CopyToReg &&
-      Copy->getOpcode() != ISD::FP_EXTEND)
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
     return false;
 
   bool HasRet = false;
@@ -1520,7 +1603,11 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
       HasRet = true;
     }
 
-  return HasRet;
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
 }
 
 EVT
@@ -1561,7 +1648,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
-        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
 
@@ -1651,7 +1738,7 @@ static bool IsTailCallConvention(CallingConv::ID CC) {
 }
 
 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
-  if (!CI->isTailCall())
+  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
    return false;
 
   CallSite CS(CI);
@@ -1664,7 +1751,8 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
 
 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
 /// a tailcall target by changing its ABI.
-static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
+                                   bool GuaranteedTailCallOpt) {
   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
 }
 
@@ -1678,7 +1766,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
                                     unsigned i) const {
   // Create the nodes corresponding to a load from this parameter slot.
   ISD::ArgFlagsTy Flags = Ins[i].Flags;
-  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
+  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
+                              getTargetMachine().Options.GuaranteedTailCallOpt);
   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   EVT ValVT;
@@ -1704,7 +1793,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
     return DAG.getLoad(ValVT, dl, Chain, FIN,
                        MachinePointerInfo::getFixedStack(FI),
-                       false, false, 0);
+                       false, false, false, 0);
   }
 }
 
@@ -1728,6 +1817,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   MachineFrameInfo *MFI = MF.getFrameInfo();
   bool Is64Bit = Subtarget->is64Bit();
+  bool IsWindows = Subtarget->isTargetWindows();
   bool IsWin64 = Subtarget->isTargetWin64();
 
   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
@@ -1758,7 +1848,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
-      TargetRegisterClass *RC = NULL;
+      const TargetRegisterClass *RC;
       if (RegVT == MVT::i32)
         RC = X86::GR32RegisterClass;
       else if (Is64Bit && RegVT == MVT::i64)
@@ -1807,7 +1897,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       // If value is passed via pointer - do a load.
       if (VA.getLocInfo() == CCValAssign::Indirect)
         ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
-                               MachinePointerInfo(), false, false, 0);
+                               MachinePointerInfo(), false, false, false, 0);
 
     InVals.push_back(ArgValue);
   }
@@ -1828,7 +1918,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   unsigned StackSize = CCInfo.getNextStackOffset();
   // Align stack specially for tail calls.
-  if (FuncIsMadeTailCallSafe(CallConv))
+  if (FuncIsMadeTailCallSafe(CallConv,
+                             MF.getTarget().Options.GuaranteedTailCallOpt))
     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
 
   // If the function takes variable number of arguments, make a frame index for
@@ -1842,17 +1933,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
 
     // FIXME: We should really autogenerate these arrays
-    static const unsigned GPR64ArgRegsWin64[] = {
+    static const uint16_t GPR64ArgRegsWin64[] = {
       X86::RCX, X86::RDX, X86::R8, X86::R9
     };
-    static const unsigned GPR64ArgRegs64Bit[] = {
+    static const uint16_t GPR64ArgRegs64Bit[] = {
      X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
    };
-    static const unsigned XMMArgRegs64Bit[] = {
+    static const uint16_t XMMArgRegs64Bit[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
-    const unsigned *GPR64ArgRegs;
+    const uint16_t *GPR64ArgRegs;
    unsigned NumXMMRegs = 0;
 
    if (IsWin64) {
@@ -1865,17 +1956,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
       GPR64ArgRegs = GPR64ArgRegs64Bit;
-      NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs);
+      NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
+                                              TotalNumXMMRegs);
     }
     unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                      TotalNumIntRegs);
 
     bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
-    assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
+    assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");
-    assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
+    assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
+             NoImplicitFloatOps) &&
          "SSE register cannot be used when SSE is disabled!");
-    if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
+    if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+        !Subtarget->hasSSE1())
      // Kernel mode asks for SSE to be disabled, so don't push them
      // on the stack.
      TotalNumXMMRegs = 0;
@@ -1892,8 +1986,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
     } else {
       // For X86-64, if there are vararg parameters that are passed via
-      // registers, then we must store them to their spots on the stack so they
-      // may be loaded by deferencing the result of va_next.
+      // registers, then we must store them to their spots on the stack so
+      // they may be loaded by deferencing the result of va_next.
       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
       FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
       FuncInfo->setRegSaveFrameIndex(
@@ -1953,12 +2047,14 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   }
 
   // Some CCs need callee pop.
-  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) {
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   } else {
     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
     // If this is an sret function, the return should pop the hidden pointer.
-    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
+    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+        ArgsAreStructReturn(Ins))
       FuncInfo->setBytesToPopOnReturn(4);
   }
 
@@ -2006,7 +2102,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   // Load the "old" Return address.
   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
-                           false, false, 0);
+                           false, false, false, 0);
   return SDValue(OutRetAddr.getNode(), 1);
 }
 
@@ -2033,7 +2129,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
 SDValue
 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                              CallingConv::ID CallConv, bool isVarArg,
-                             bool &isTailCall,
+                             bool doesNotRet, bool &isTailCall,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -2042,9 +2138,13 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   MachineFunction &MF = DAG.getMachineFunction();
   bool Is64Bit = Subtarget->is64Bit();
   bool IsWin64 = Subtarget->isTargetWin64();
+  bool IsWindows = Subtarget->isTargetWindows();
   bool IsStructRet = CallIsStructReturn(Outs);
   bool IsSibcall = false;
 
+  if (MF.getTarget().Options.DisableTailCalls)
+    isTailCall = false;
+
   if (isTailCall) {
     // Check if it's really possible to do a tail call.
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
@@ -2053,7 +2153,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
     // Sibcalls are automatically detected tailcalls which do not require
     // ABI changes.
-    if (!GuaranteedTailCallOpt && isTailCall)
+    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
       IsSibcall = true;
 
     if (isTailCall)
@@ -2081,7 +2181,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // This is a sibcall. The memory operands are available in caller's
     // own caller's stack.
     NumBytes = 0;
-  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
+  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+           IsTailCallConvention(CallConv))
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
   int FPDiff = 0;
@@ -2231,12 +2332,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // registers used and is in the range 0 - 8 inclusive.
 
     // Count the number of XMM registers allocated.
-    static const unsigned XMMArgRegs[] = {
+    static const uint16_t XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
-    assert((Subtarget->hasXMM() || !NumXMMRegs)
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
            && "SSE registers cannot be used when SSE is disabled");
 
     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
@@ -2260,7 +2361,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   int FI = 0;
   // Do not flag preceding copytoreg stuff together with the following stuff.
   InFlag = SDValue();
-  if (GuaranteedTailCallOpt) {
+  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
       CCValAssign &VA = ArgLocs[i];
       if (VA.isRegLoc())
@@ -2368,7 +2469,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       if (ExtraLoad)
         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
                              MachinePointerInfo::getGOT(),
-                             false, false, 0);
+                             false, false, false, 0);
     }
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     unsigned char OpFlags = 0;
@@ -2421,6 +2522,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   if (Is64Bit && isVarArg && !IsWin64)
     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
 
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
@@ -2440,12 +2547,15 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Create the CALLSEQ_END node.
   unsigned NumBytesForCalleeToPush;
-  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt))
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       getTargetMachine().Options.GuaranteedTailCallOpt))
     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
-  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
+  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+           IsStructRet)
     // If this is a call to a struct-return function, the callee
     // pops the hidden struct pointer, so we have to push it back.
     // This is common for Darwin/X86, Linux & Mingw32 targets.
+    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
     NumBytesForCalleeToPush = 4;
   else
     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
@@ -2598,7 +2708,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   CallingConv::ID CallerCC = CallerF->getCallingConv();
   bool CCMatch = CallerCC == CalleeCC;
 
-  if (GuaranteedTailCallOpt) {
+  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
     if (IsTailCallConvention(CalleeCC) && CCMatch)
       return true;
     return false;
@@ -2641,9 +2751,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
       return false;
   }
 
-  // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
-  // Therefore if it's not used by the call it is not safe to optimize this into
-  // a sibcall.
+  // If the call result is in ST0 / ST1, it needs to be popped off the x87
+  // stack. Therefore, if it's not used by the call it is not safe to optimize
+  // this into a sibcall.
   bool Unused = false;
   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
     if (!Ins[i].Used) {
@@ -2785,9 +2895,8 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
-  case X86ISD::SHUFPD:
+  case X86ISD::SHUFP:
   case X86ISD::PALIGN:
-  case X86ISD::SHUFPS:
   case X86ISD::MOVLHPS:
   case X86ISD::MOVLHPD:
   case X86ISD::MOVHLPS:
@@ -2798,34 +2907,16 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::MOVDDUP:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
-  case X86ISD::UNPCKLPS:
-  case X86ISD::UNPCKLPD:
-  case X86ISD::VUNPCKLPSY:
-  case X86ISD::VUNPCKLPDY:
-  case X86ISD::PUNPCKLWD:
-  case X86ISD::PUNPCKLBW:
-  case X86ISD::PUNPCKLDQ:
-  case X86ISD::PUNPCKLQDQ:
-  case X86ISD::UNPCKHPS:
-  case X86ISD::UNPCKHPD:
-  case X86ISD::VUNPCKHPSY:
-  case X86ISD::VUNPCKHPDY:
-  case X86ISD::PUNPCKHWD:
-  case X86ISD::PUNPCKHBW:
-  case X86ISD::PUNPCKHDQ:
-  case X86ISD::PUNPCKHQDQ:
-  case X86ISD::VPERMILPS:
-  case X86ISD::VPERMILPSY:
-  case X86ISD::VPERMILPD:
-  case X86ISD::VPERMILPDY:
-  case X86ISD::VPERM2F128:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERM2X128:
     return true;
   }
-  return false;
 }
 
 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-                                    SDValue V1, SelectionDAG &DAG) {
+                                    SDValue V1, SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
   case X86ISD::MOVSHDUP:
@@ -2833,39 +2924,32 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::MOVDDUP:
     return DAG.getNode(Opc, dl, VT, V1);
   }
-
-  return SDValue();
 }
 
 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-                                    SDValue V1, unsigned TargetMask, SelectionDAG &DAG) {
+                                    SDValue V1, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
-  case X86ISD::VPERMILPS:
-  case X86ISD::VPERMILPSY:
-  case X86ISD::VPERMILPD:
-  case X86ISD::VPERMILPDY:
+  case X86ISD::VPERMILP:
    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
  }
-
-  return SDValue();
 }
 
 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-                                    SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) {
+                                    SDValue V1, SDValue V2, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
   case X86ISD::PALIGN:
-  case X86ISD::SHUFPD:
-  case X86ISD::SHUFPS:
-  case X86ISD::VPERM2F128:
+  case X86ISD::SHUFP:
+  case X86ISD::VPERM2X128:
    return DAG.getNode(Opc, dl, VT, V1, V2,
                       DAG.getConstant(TargetMask, MVT::i8));
  }
-  return SDValue();
}
 
 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
@@ -2879,25 +2963,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::MOVLPD:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
-  case X86ISD::UNPCKLPS:
-  case X86ISD::UNPCKLPD:
-  case X86ISD::VUNPCKLPSY:
-  case X86ISD::VUNPCKLPDY:
-  case X86ISD::PUNPCKLWD:
-  case X86ISD::PUNPCKLBW:
-  case X86ISD::PUNPCKLDQ:
-  case X86ISD::PUNPCKLQDQ:
-  case X86ISD::UNPCKHPS:
-  case X86ISD::UNPCKHPD:
-  case X86ISD::VUNPCKHPSY:
-  case X86ISD::VUNPCKHPDY:
-  case X86ISD::PUNPCKHWD:
-  case X86ISD::PUNPCKHBW:
-  case X86ISD::PUNPCKHDQ:
-  case X86ISD::PUNPCKHQDQ:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
    return DAG.getNode(Opc, dl, VT, V1, V2);
  }
-  return SDValue();
 }
 
 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
@@ -3092,17 +3161,6 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
   return (Val < 0) || (Val >= Low && Val < Hi);
 }
 
-/// isUndefOrInRange - Return true if every element in Mask, begining
-/// from position Pos and ending in Pos+Size, falls within the specified
-/// range (L, L+Pos]. or is undef.
-static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
-                             int Pos, int Size, int Low, int Hi) {
-  for (int i = Pos, e = Pos+Size; i != e; ++i)
-    if (!isUndefOrInRange(Mask[i], Low, Hi))
-      return false;
-  return true;
-}
-
 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
 /// specified value.
 static bool isUndefOrEqual(int Val, int CmpVal) {
@@ -3114,7 +3172,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
 /// isSequentialOrUndefInRange - Return true if every element in Mask, begining
 /// from position Pos and ending in Pos+Size, falls within the specified
 /// sequential range (L, L+Pos]. or is undef.
-static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                        int Pos, int Size, int Low) {
   for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
     if (!isUndefOrEqual(Mask[i], Low))
@@ -3125,7 +3183,7 @@ static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
 /// the second operand.
-static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -3133,302 +3191,188 @@ static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
   return false;
 }
 
-bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isPSHUFDMask(M, N->getValueType(0));
-}
-
 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
   if (VT != MVT::v8i16)
     return false;
 
   // Lower quadword copied in order or undef.
-  for (int i = 0; i != 4; ++i)
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return false;
+  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
+    return false;
 
   // Upper quadword shuffled.
-  for (int i = 4; i != 8; ++i)
+  for (unsigned i = 4; i != 8; ++i)
     if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
       return false;
 
   return true;
 }
 
-bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isPSHUFHWMask(M, N->getValueType(0));
-}
-
 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
   if (VT != MVT::v8i16)
     return false;
 
   // Upper quadword copied in order.
-  for (int i = 4; i != 8; ++i)
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return false;
+  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
+    return false;
 
   // Lower quadword shuffled.
-  for (int i = 0; i != 4; ++i)
+  for (unsigned i = 0; i != 4; ++i)
     if (Mask[i] >= 4)
       return false;
 
   return true;
 }
 
-bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isPSHUFLWMask(M, N->getValueType(0));
-}
-
 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                          bool hasSSSE3OrAVX) {
-  int i, e = VT.getVectorNumElements();
-  if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
+static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+                          const X86Subtarget *Subtarget) {
+  if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
+      (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
     return false;
 
-  // Do not handle v2i64 / v2f64 shuffles with palignr.
-  if (e < 4 || !hasSSSE3OrAVX)
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  // Do not handle 64-bit element shuffles with palignr.
+  if (NumLaneElts == 2)
     return false;
 
-  for (i = 0; i != e; ++i)
-    if (Mask[i] >= 0)
-      break;
+  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
+    unsigned i;
+    for (i = 0; i != NumLaneElts; ++i) {
+      if (Mask[i+l] >= 0)
+        break;
+    }
 
-  // All undef, not a palignr.
-  if (i == e)
-    return false;
+    // Lane is all undef, go to next lane
+    if (i == NumLaneElts)
+      continue;
 
-  // Make sure we're shifting in the right direction.
-  if (Mask[i] <= i)
-    return false;
+    int Start = Mask[i+l];
 
-  int s = Mask[i] - i;
+    // Make sure its in this lane in one of the sources
+    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
+        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
+      return false;
 
-  // Check the rest of the elements to see if they are consecutive.
-  for (++i; i != e; ++i) {
-    int m = Mask[i];
-    if (m >= 0 && m != s+i)
+    // If not lane 0, then we must match lane 0
+    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
       return false;
-  }
-  return true;
-}
 
-/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// VSHUFPSY.
-static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                           const X86Subtarget *Subtarget) {
-  int NumElems = VT.getVectorNumElements();
+    // Correct second source to be contiguous with first source
+    if (Start >= (int)NumElts)
+      Start -= NumElts - NumLaneElts;
 
-  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
-    return false;
+    // Make sure we're shifting in the right direction.
+    if (Start <= (int)(i+l))
+      return false;
 
-  if (NumElems != 8)
-    return false;
+    Start -= i;
 
-  // VSHUFPSY divides the resulting vector into 4 chunks.
-  // The sources are also splitted into 4 chunks, and each destination
-  // chunk must come from a different source chunk.
-  //
-  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
-  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
-  //
-  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
-  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
-  //
-  int QuarterSize = NumElems/4;
-  int HalfSize = QuarterSize*2;
-  for (int i = 0; i < QuarterSize; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
-      return false;
-  for (int i = QuarterSize; i < QuarterSize*2; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
-      return false;
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != NumLaneElts; ++i) {
+      int Idx = Mask[i+l];
 
-  // The mask of the second half must be the same as the first but with
-  // the appropriate offsets. This works in the same way as VPERMILPS
-  // works with masks.
-  for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
-    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
-      return false;
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
-  }
-  for (int i = QuarterSize*3; i < NumElems; ++i) {
-    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
-      return false;
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
+      // Make sure its in this lane
+      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
+          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
+        return false;
+
+      // If not lane 0, then we must match lane 0
+      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
+        return false;
+
+      if (Idx >= (int)NumElts)
+        Idx -= NumElts - NumLaneElts;
+      if (!isUndefOrEqual(Idx, Start+i))
+        return false;
+
+    }
   }
 
   return true;
 }
 
-/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VSHUFPSY instruction.
-static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  EVT VT = SVOp->getValueType(0);
-  int NumElems = VT.getVectorNumElements();
-
-  assert(NumElems == 8 && VT.getSizeInBits() == 256 &&
-         "Only supports v8i32 and v8f32 types");
-
-  int HalfSize = NumElems/2;
-  unsigned Mask = 0;
-  for (int i = 0; i != NumElems ; ++i) {
-    if (SVOp->getMaskElt(i) < 0)
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
+                                     unsigned NumElems) {
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int idx = Mask[i];
+    if (idx < 0)
       continue;
-    // The mask of the first half must be equal to the second one.
-    unsigned Shamt = (i%HalfSize)*2;
-    unsigned Elt = SVOp->getMaskElt(i) % HalfSize;
-    Mask |= Elt << Shamt;
+    else if (idx < (int)NumElems)
+      Mask[i] = idx + NumElems;
+    else
+      Mask[i] = idx - NumElems;
  }
-
-  return Mask;
 }
 
-/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS
-/// version and the mask of the second half isn't binded with the first
-/// one.
-static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                           const X86Subtarget *Subtarget) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128/256-bit
+/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
+/// reverse of what x86 shuffles want.
+static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
+                        bool Commuted = false) {
+  if (!HasAVX && VT.getSizeInBits() == 256)
     return false;
 
-  if (NumElems != 4)
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElems = NumElems/NumLanes;
+
+  if (NumLaneElems != 2 && NumLaneElems != 4)
     return false;
 
   // VSHUFPSY divides the resulting vector into 4 chunks.
   // The sources are also splitted into 4 chunks, and each destination
   // chunk must come from a different source chunk.
   //
+  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
+  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
+  //
+  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
+  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
+  //
+  // VSHUFPDY divides the resulting vector into 4 chunks.
+  // The sources are also splitted into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
   //  SRC1 =>      X3       X2       X1       X0
   //  SRC2 =>      Y3       Y2       Y1       Y0
   //
-  //  DST  =>  Y2..Y3,  X2..X3,  Y1..Y0,  X1..X0
+  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   //
-  int QuarterSize = NumElems/4;
-  int HalfSize = QuarterSize*2;
-  for (int i = 0; i < QuarterSize; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
-      return false;
-  for (int i = QuarterSize; i < QuarterSize*2; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
-      return false;
-  for (int i = QuarterSize*2; i < QuarterSize*3; ++i)
-    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
-      return false;
-  for (int i = QuarterSize*3; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
-      return false;
-
-  return true;
-}
-
-/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VSHUFPDY instruction.
-static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  EVT VT = SVOp->getValueType(0);
-  int NumElems = VT.getVectorNumElements();
-
-  assert(NumElems == 4 && VT.getSizeInBits() == 256 &&
-         "Only supports v4i64 and v4f64 types");
-
-  int HalfSize = NumElems/2;
-  unsigned Mask = 0;
-  for (int i = 0; i != NumElems ; ++i) {
-    if (SVOp->getMaskElt(i) < 0)
-      continue;
-    int Elt = SVOp->getMaskElt(i) % HalfSize;
-    Mask |= Elt << i;
+  unsigned HalfLaneElems = NumLaneElems/2;
+  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
+    for (unsigned i = 0; i != NumLaneElems; ++i) {
+      int Idx = Mask[i+l];
+      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
+      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
+        return false;
+      // For VSHUFPSY, the mask of the second half must be the same as the
+      // first but with the appropriate offsets. This works in the same way as
+      // VPERMILPS works with masks.
+      if (NumElems != 8 || l == 0 || Mask[i] < 0)
+        continue;
+      if (!isUndefOrEqual(Idx, Mask[i]+l))
+        return false;
+    }
   }
 
-  return Mask;
-}
-
-/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128-bit
-/// SHUFPS and SHUFPD.
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (VT.getSizeInBits() != 128)
-    return false;
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-
-  int Half = NumElems / 2;
-  for (int i = 0; i < Half; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, NumElems))
-      return false;
-  for (int i = Half; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
-      return false;
-
-  return true;
-}
-
-bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isSHUFPMask(M, N->getValueType(0));
-}
-
-/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
-/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
-/// half elements to come from vector 1 (which would equal the dest.) and
-/// the upper half to come from vector 2.
-static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-
-  int Half = NumElems / 2;
-  for (int i = 0; i < Half; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
-      return false;
-  for (int i = Half; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, NumElems))
-      return false;
   return true;
 }
 
-static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return isCommutedSHUFPMask(M, N->getValueType(0));
-}
-
 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
-  EVT VT = N->getValueType(0);
+static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
   unsigned NumElems = VT.getVectorNumElements();
 
   if (VT.getSizeInBits() != 128)
@@ -3438,17 +3382,16 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
     return false;
 
   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
-  return isUndefOrEqual(N->getMaskElt(0), 6) &&
-         isUndefOrEqual(N->getMaskElt(1), 7) &&
-         isUndefOrEqual(N->getMaskElt(2), 2) &&
-         isUndefOrEqual(N->getMaskElt(3), 3);
+  return isUndefOrEqual(Mask[0], 6) &&
+         isUndefOrEqual(Mask[1], 7) &&
+         isUndefOrEqual(Mask[2], 2) &&
+         isUndefOrEqual(Mask[3], 3);
 }
 
 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
 /// <2, 3, 2, 3>
-bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
-  EVT VT = N->getValueType(0);
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
   unsigned NumElems = VT.getVectorNumElements();
 
   if (VT.getSizeInBits() != 128)
@@ -3457,26 +3400,29 @@ bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
   if (NumElems != 4)
     return false;
 
-  return isUndefOrEqual(N->getMaskElt(0), 2) &&
-         isUndefOrEqual(N->getMaskElt(1), 3) &&
-         isUndefOrEqual(N->getMaskElt(2), 2) &&
-         isUndefOrEqual(N->getMaskElt(3), 3);
+  return isUndefOrEqual(Mask[0], 2) &&
+         isUndefOrEqual(Mask[1], 3) &&
+         isUndefOrEqual(Mask[2], 2) &&
+         isUndefOrEqual(Mask[3], 3);
 }
 
 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); +static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { + if (VT.getSizeInBits() != 128) + return false; + + unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; - for (unsigned i = NumElems/2; i < NumElems; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + for (unsigned i = NumElems/2; i != NumElems; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; return true; @@ -3484,19 +3430,19 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. -bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); +static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { + unsigned NumElems = VT.getVectorNumElements(); if ((NumElems != 2 && NumElems != 4) - || N->getValueType(0).getSizeInBits() > 128) + || VT.getSizeInBits() > 128) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems)) return false; return true; @@ -3504,14 +3450,15 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. -static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool V2IsSplat = false) { - int NumElts = VT.getVectorNumElements(); +static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, + bool HasAVX2, bool V2IsSplat = false) { + unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3519,11 +3466,9 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - unsigned Start = 0; - unsigned End = NumLaneElts; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = Start, j = s * NumLaneElts; - i != End; + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; @@ -3537,30 +3482,22 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, return false; } } - // Process the next 128 bits. 
- Start += NumLaneElts;
- End += NumLaneElts;
 }
 return true;
}
-bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
-}
-
/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool V2IsSplat = false) {
- int NumElts = VT.getVectorNumElements();
+static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
+ bool HasAVX2, bool V2IsSplat = false) {
+ unsigned NumElts = VT.getVectorNumElements();
 assert((VT.is128BitVector() || VT.is256BitVector()) &&
 "Unsupported vector type for unpckh");
- if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
 return false;
 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -3568,11 +3505,9 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
 unsigned NumLanes = VT.getSizeInBits()/128;
 unsigned NumLaneElts = NumElts/NumLanes;
- unsigned Start = 0;
- unsigned End = NumLaneElts;
 for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
- i != End; i += 2, ++j) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
 int BitI = Mask[i];
 int BitI1 = Mask[i+1];
 if (!isUndefOrEqual(BitI, j))
 return false;
@@ -3585,42 +3520,39 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
 return false;
 }
 }
- // Process the next 128 bits.
- Start += NumLaneElts;
- End += NumLaneElts;
 }
 return true;
}
-bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
-}
-
/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
- int NumElems = VT.getVectorNumElements();
- if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
+ bool HasAVX2) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
 return false;
 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
 // FIXME: Need a better way to get rid of this, there's no latency difference
 // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
 // the former later. We should also remove the "_undef" special mask.
- if (NumElems == 4 && VT.getSizeInBits() == 256)
+ if (NumElts == 4 && VT.getSizeInBits() == 256)
 return false;
 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
 // independently on 128-bit lanes.
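For reference, the per-lane interleave pattern that isUNPCKLMask/isUNPCKHMask accept (undefs aside) can be generated directly. A hypothetical standalone generator, not LLVM code:

#include <cstdio>
#include <vector>

// Builds the reference unpack mask: within each 128-bit lane, alternate
// element j of V1 with element j of V2, starting at the low (or high)
// half of the lane.
static std::vector<int> makeUnpackMask(unsigned NumElts, unsigned NumLanes,
                                       bool High) {
  std::vector<int> Mask;
  unsigned NumLaneElts = NumElts / NumLanes;
  for (unsigned l = 0; l != NumLanes; ++l) {
    unsigned j = l * NumLaneElts + (High ? NumLaneElts / 2 : 0);
    for (unsigned i = 0; i != NumLaneElts; i += 2, ++j) {
      Mask.push_back(j);            // element j of V1
      Mask.push_back(j + NumElts);  // element j of V2
    }
  }
  return Mask;
}

int main() {
  // v8i32 on AVX: two 128-bit lanes handled independently.
  for (int M : makeUnpackMask(8, 2, /*High=*/false)) printf("%d ", M);
  printf("\n"); // 0 8 1 9 4 12 5 13
  for (int M : makeUnpackMask(8, 2, /*High=*/true)) printf("%d ", M);
  printf("\n"); // 2 10 3 11 6 14 7 15
}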
- unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElems / NumLanes; + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; - i != NumLaneElts * (s + 1); + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; @@ -3635,81 +3567,77 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { return true; } -bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); -} - /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, /// <2, 2, 3, 3> -static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) +static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; - for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; + i != (l+1)*NumLaneElts; i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } } return true; } -bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); -} - /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. -static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { if (VT.getVectorElementType().getSizeInBits() < 32) return false; + if (VT.getSizeInBits() == 256) + return false; - int NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorNumElements(); if (!isUndefOrEqual(Mask[0], NumElts)) return false; - for (int i = 1; i < NumElts; ++i) + for (unsigned i = 1; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; return true; } -bool X86::isMOVLMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isMOVLMask(M, N->getValueType(0)); -} - -/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. 
As an example, the
/// shuffle below:
/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// The first half comes from the second half of V1 and the second half from
/// the second half of V2.
-static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
+ if (!HasAVX || VT.getSizeInBits() != 256)
 return false;
 // The shuffle result is divided into half A and half B. In total the two
 // sources have 4 halves, namely: C, D, E, F. The final values of A and
 // B must come from C, D, E or F.
- int HalfSize = VT.getVectorNumElements()/2;
+ unsigned HalfSize = VT.getVectorNumElements()/2;
 bool MatchA = false, MatchB = false;
 // Check if A comes from one of C, D, E, F.
- for (int Half = 0; Half < 4; ++Half) {
+ for (unsigned Half = 0; Half != 4; ++Half) {
 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
 MatchA = true;
 break;
@@ -3717,7 +3645,7 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
 }
 // Check if B comes from one of C, D, E, F.
- for (int Half = 0; Half < 4; ++Half) {
+ for (unsigned Half = 0; Half != 4; ++Half) {
 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
 MatchB = true;
 break;
@@ -3727,22 +3655,21 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
 return MatchA && MatchB;
}
-/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
-static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
 EVT VT = SVOp->getValueType(0);
- int HalfSize = VT.getVectorNumElements()/2;
+ unsigned HalfSize = VT.getVectorNumElements()/2;
- int FstHalf = 0, SndHalf = 0;
- for (int i = 0; i < HalfSize; ++i) {
+ unsigned FstHalf = 0, SndHalf = 0;
+ for (unsigned i = 0; i < HalfSize; ++i) {
 if (SVOp->getMaskElt(i) > 0) {
 FstHalf = SVOp->getMaskElt(i)/HalfSize;
 break;
 }
 }
- for (int i = HalfSize; i < HalfSize*2; ++i) {
+ for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
 if (SVOp->getMaskElt(i) > 0) {
 SndHalf = SVOp->getMaskElt(i)/HalfSize;
 break;
@@ -3752,141 +3679,56 @@ static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
 return (FstHalf | (SndHalf << 4));
}
-/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching is different depending on whether the underlying
/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed.
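A worked example of the immediate computation above, as a standalone sketch with plain ints in place of the SDNode accessors:

#include <cassert>
#include <cstdio>
#include <vector>

// Each result half is described by which of the four source halves
// (V1 low = 0, V1 high = 1, V2 low = 2, V2 high = 3) its first defined
// element falls into; the two nibbles are packed into one immediate.
static unsigned vperm2x128Imm(const std::vector<int> &Mask) {
  unsigned HalfSize = Mask.size() / 2;
  unsigned FstHalf = 0, SndHalf = 0;
  for (unsigned i = 0; i != HalfSize; ++i)
    if (Mask[i] > 0) { FstHalf = Mask[i] / HalfSize; break; }
  for (unsigned i = HalfSize; i != 2 * HalfSize; ++i)
    if (Mask[i] > 0) { SndHalf = Mask[i] / HalfSize; break; }
  return FstHalf | (SndHalf << 4);
}

int main() {
  // The v8i32 shuffle <4,5,6,7,12,13,14,15> from the comment above:
  // low result half = V1's high half (1), high half = V2's high half (3).
  unsigned Imm = vperm2x128Imm({4, 5, 6, 7, 12, 13, 14, 15});
  assert(Imm == 0x31);
  printf("imm = 0x%x\n", Imm);
}

Note that, like the source, the sketch skips mask element 0 via the "> 0" test and falls back to half 0 when a result half is entirely undef.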
-static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) - return false; - - // Only match 256-bit with 64-bit types - if (VT.getSizeInBits() != 256 || NumElts != 4) +/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. +static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { + if (!HasAVX) return false; - // The mask on the high lane is independent of the low. Both can match - // any element in inside its own lane, but can't cross. - int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int LaneStart = l*LaneSize; - if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) - return false; - } - - return true; -} - -/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. -/// Note that VPERMIL mask matching is different depending whether theunderlying -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point -/// to the same elements of the low, but to the higher half of the source. -/// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. -static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) + // Only match 256-bit with 32/64-bit types + if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) return false; - // Only match 256-bit with 32-bit types - if (VT.getSizeInBits() != 256 || NumElts != 8) - return false; - - // The mask on the high lane should be the same as the low. Actually, - // they can differ if any of the corresponding index in a lane is undef - // and the other stays in range. - int LaneSize = NumElts/NumLanes; - for (int i = 0; i < LaneSize; ++i) { - int HighElt = i+LaneSize; - bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); - bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); - - if (!HighValid || !LowValid) - return false; - if (Mask[i] < 0 || Mask[HighElt] < 0) - continue; - if (Mask[HighElt]-Mask[i] != LaneSize) - return false; - } - - return true; -} - -/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPS* instructions. -static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - int LaneSize = NumElts/NumLanes; - - // Although the mask is equal for both lanes do it twice to get the cases - // where a mask will match because the same mask element is undef on the - // first half but valid on the second. This would get pathological cases - // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. 
- unsigned Mask = 0;
- for (int l = 0; l < NumLanes; ++l) {
- for (int i = 0; i < LaneSize; ++i) {
- int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
- if (MaskElt < 0)
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned LaneSize = NumElts/NumLanes;
+ for (unsigned l = 0; l != NumElts; l += LaneSize) {
+ for (unsigned i = 0; i != LaneSize; ++i) {
+ if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+ return false;
+ if (NumElts != 8 || l == 0)
 continue;
- if (MaskElt >= LaneSize)
- MaskElt -= LaneSize;
- Mask |= MaskElt << (i*2);
- }
- }
-
- return Mask;
-}
-
-/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
-static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- EVT VT = SVOp->getValueType(0);
-
- int NumElts = VT.getVectorNumElements();
- int NumLanes = VT.getSizeInBits()/128;
-
- unsigned Mask = 0;
- int LaneSize = NumElts/NumLanes;
- for (int l = 0; l < NumLanes; ++l)
- for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
- int MaskElt = SVOp->getMaskElt(i);
- if (MaskElt < 0)
+ // VPERMILPS handling
+ if (Mask[i] < 0)
 continue;
- Mask |= (MaskElt-l*LaneSize) << i;
+ if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
+ return false;
 }
+ }
- return Mask;
+ return true;
}
-/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
+/// isCommutedMOVLMask - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants. X86 movss requires the lowest element to be the
/// lowest element of vector 2 and the other elements to come from vector 1 in
/// order.
-static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
 bool V2IsSplat = false, bool V2IsUndef = false) {
- int NumOps = VT.getVectorNumElements();
+ unsigned NumOps = VT.getVectorNumElements();
+ if (VT.getSizeInBits() == 256)
+ return false;
 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
 return false;
 if (!isUndefOrEqual(Mask[0], 0))
 return false;
- for (int i = 1; i < NumOps; ++i)
+ for (unsigned i = 1; i != NumOps; ++i)
 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
@@ -3895,26 +3737,14 @@ static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
 return true;
}
-static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
- bool V2IsUndef = false) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
-}
-
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> -bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) - return false; - - // The second vector must be undef - if (N->getOperand(1).getOpcode() != ISD::UNDEF) +static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) return false; - EVT VT = N->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if ((VT.getSizeInBits() == 128 && NumElems != 4) || @@ -3922,9 +3752,9 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, return false; // "i+1" is the value the indexed mask element must have - for (unsigned i = 0; i < NumElems; i += 2) - if (!isUndefOrEqual(N->getMaskElt(i), i+1) || - !isUndefOrEqual(N->getMaskElt(i+1), i+1)) + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i+1) || + !isUndefOrEqual(Mask[i+1], i+1)) return false; return true; @@ -3933,16 +3763,11 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> -bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) - return false; - - // The second vector must be undef - if (N->getOperand(1).getOpcode() != ISD::UNDEF) +static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) return false; - EVT VT = N->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if ((VT.getSizeInBits() == 128 && NumElems != 4) || @@ -3950,9 +3775,9 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, return false; // "i" is the value the indexed mask element must have - for (unsigned i = 0; i < NumElems; i += 2) - if (!isUndefOrEqual(N->getMaskElt(i), i) || - !isUndefOrEqual(N->getMaskElt(i+1), i)) + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i) || + !isUndefOrEqual(Mask[i+1], i)) return false; return true; @@ -3961,21 +3786,17 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. 
-static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); - int NumElts = VT.getVectorNumElements(); - bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; +static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { + unsigned NumElts = VT.getVectorNumElements(); - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || - !V2IsUndef || NumElts != 4) + if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) return false; - for (int i = 0; i != NumElts/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), 0)) + for (unsigned i = 0; i != NumElts/2; ++i) + if (!isUndefOrEqual(Mask[i], 0)) return false; - for (int i = NumElts/2; i != NumElts; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) + for (unsigned i = NumElts/2; i != NumElts; ++i) + if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; } @@ -3983,18 +3804,16 @@ static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. -bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); - +static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { if (VT.getSizeInBits() != 128) return false; - int e = VT.getVectorNumElements() / 2; - for (int i = 0; i < e; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + unsigned e = VT.getVectorNumElements() / 2; + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; - for (int i = 0; i < e; ++i) - if (!isUndefOrEqual(N->getMaskElt(e+i), i)) + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[e+i], i)) return false; return true; } @@ -4039,31 +3858,43 @@ bool X86::isVINSERTF128Index(SDNode *N) { /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. -unsigned X86::getShuffleSHUFImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - int NumOperands = SVOp->getValueType(0).getVectorNumElements(); +/// Handles 128-bit and 256-bit. +static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for PSHUF/SHUFP"); - unsigned Shift = (NumOperands == 4) ? 2 : 1; + // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate + // independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + assert((NumLaneElts == 2 || NumLaneElts == 4) && + "Only supports 2 or 4 elements per lane"); + + unsigned Shift = (NumLaneElts == 4) ? 1 : 0; unsigned Mask = 0; - for (int i = 0; i < NumOperands; ++i) { - int Val = SVOp->getMaskElt(NumOperands-i-1); - if (Val < 0) Val = 0; - if (Val >= NumOperands) Val -= NumOperands; - Mask |= Val; - if (i != NumOperands - 1) - Mask <<= Shift; + for (unsigned i = 0; i != NumElts; ++i) { + int Elt = N->getMaskElt(i); + if (Elt < 0) continue; + Elt %= NumLaneElts; + unsigned ShAmt = i << Shift; + if (ShAmt >= 8) ShAmt -= 8; + Mask |= Elt << ShAmt; } + return Mask; } /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 
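The lane-replicated immediate built by getShuffleSHUFImmediate above can be illustrated in isolation. A minimal sketch (mask as a vector of ints; undefs contribute nothing):

#include <cstdio>
#include <vector>

static unsigned shufImm(const std::vector<int> &Mask, unsigned NumLaneElts) {
  unsigned Shift = (NumLaneElts == 4) ? 1 : 0; // 2 bits/elt vs 1 bit/elt
  unsigned Imm = 0;
  for (unsigned i = 0; i != Mask.size(); ++i) {
    int Elt = Mask[i];
    if (Elt < 0) continue;        // undef: leave those bits zero
    Elt %= NumLaneElts;           // indices repeat per 128-bit lane
    unsigned ShAmt = i << Shift;
    if (ShAmt >= 8) ShAmt -= 8;   // second lane reuses the same 8 bits
    Imm |= Elt << ShAmt;
  }
  return Imm;
}

int main() {
  // PSHUFD v4i32 <3,1,0,2>: 3 | 1<<2 | 0<<4 | 2<<6 = 0x87.
  printf("0x%x\n", shufImm({3, 1, 0, 2}, 4));
  // Lane-repeated v8f32 <1,0,3,2, 5,4,7,6>: both lanes encode 0xb1.
  printf("0x%x\n", shufImm({1, 0, 3, 2, 5, 4, 7, 6}, 4));
}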
-unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { unsigned Mask = 0; // 8 nodes, but we only care about the last 4. for (unsigned i = 7; i >= 4; --i) { - int Val = SVOp->getMaskElt(i); + int Val = N->getMaskElt(i); if (Val >= 0) Mask |= (Val - 4); if (i != 4) @@ -4074,12 +3905,11 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. -unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { unsigned Mask = 0; // 8 nodes, but we only care about the first 4. for (int i = 3; i >= 0; --i) { - int Val = SVOp->getMaskElt(i); + int Val = N->getMaskElt(i); if (Val >= 0) Mask |= Val; if (i != 0) @@ -4090,18 +3920,24 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. -unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VVT = N->getValueType(0); - unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; - int Val = 0; +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; - unsigned i, e; - for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { + int Val = 0; + unsigned i; + for (i = 0; i != NumElts; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; } + if (Val >= (int)NumElts) + Val -= NumElts - NumLaneElts; + assert(Val - i > 0 && "PALIGNR imm should be positive"); return (Val - i) * EltSize; } @@ -4170,36 +4006,20 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SVOp->getOperand(0), &MaskVec[0]); } -/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming -/// the two vector operands have swapped position. -static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); - for (unsigned i = 0; i != NumElems; ++i) { - int idx = Mask[i]; - if (idx < 0) - continue; - else if (idx < (int)NumElems) - Mask[i] = idx + NumElems; - else - Mask[i] = idx - NumElems; - } -} - /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). 
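The MOVHLPS transform test described above accepts exactly the v4 pattern <2,3,6,7>, modulo undefs: the low result half comes from the high half of V1 and the high result half from the high half of V2. A standalone sketch of that check, not the LLVM helper:

#include <cstdio>
#include <vector>

static bool isUndefOrEqual(int Val, int CmpVal) {
  return Val < 0 || Val == CmpVal;
}

static bool shouldXformToMOVHLPS(const std::vector<int> &Mask) {
  if (Mask.size() != 4)
    return false;
  for (unsigned i = 0; i != 2; ++i)        // low half <- high half of V1
    if (!isUndefOrEqual(Mask[i], i + 2))
      return false;
  for (unsigned i = 2; i != 4; ++i)        // high half <- high half of V2
    if (!isUndefOrEqual(Mask[i], i + 4))
      return false;
  return true;
}

int main() {
  printf("%d\n", shouldXformToMOVHLPS({2, 3, 6, 7}));   // 1
  printf("%d\n", shouldXformToMOVHLPS({2, -1, 6, 7}));  // 1 (undef is fine)
  printf("%d\n", shouldXformToMOVHLPS({0, 1, 6, 7}));   // 0
}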
-static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
- EVT VT = Op->getValueType(0);
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
 if (VT.getSizeInBits() != 128)
 return false;
 if (VT.getVectorNumElements() != 4)
 return false;
 for (unsigned i = 0, e = 2; i != e; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
+ if (!isUndefOrEqual(Mask[i], i+2))
 return false;
 for (unsigned i = 2; i != 4; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
+ if (!isUndefOrEqual(Mask[i], i+4))
 return false;
 return true;
}
@@ -4218,14 +4038,36 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
 return true;
}
+// Test whether the given value is a vector value which will be legalized
+// into a load.
+static bool WillBeConstantPoolLoad(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ // Check for any non-constant elements.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ switch (N->getOperand(i).getNode()->getOpcode()) {
+ case ISD::UNDEF:
+ case ISD::ConstantFP:
+ case ISD::Constant:
+ break;
+ default:
+ return false;
+ }
+
+ // Vectors of all-zeros and all-ones are materialized with special
+ // instructions rather than being loaded.
+ return !ISD::isBuildVectorAllZeros(N) &&
+ !ISD::isBuildVectorAllOnes(N);
+}
+
/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
- ShuffleVectorSDNode *Op) {
- EVT VT = Op->getValueType(0);
+ ArrayRef<int> Mask, EVT VT) {
 if (VT.getSizeInBits() != 128)
 return false;
@@ -4233,7 +4075,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
 return false;
 // If V2 is a vector load, don't do this transformation. We will try to use
 // load folding shufps op.
- if (ISD::isNON_EXTLoad(V2))
+ if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
 return false;
 unsigned NumElems = VT.getVectorNumElements();
 if (NumElems != 2 && NumElems != 4)
 return false;
 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i))
+ if (!isUndefOrEqual(Mask[i], i))
 return false;
 for (unsigned i = NumElems/2; i != NumElems; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
+ if (!isUndefOrEqual(Mask[i], i+NumElems))
 return false;
 return true;
}
@@ -4292,15 +4134,15 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
- DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, DebugLoc dl) {
 assert(VT.isVector() && "Expected a vector type");
 // Always build SSE zero vectors as <4 x i32> bitcasted
 // to their dest type. This ensures they get CSE'd.
SDValue Vec;
 if (VT.getSizeInBits() == 128) { // SSE
- if (HasXMMInt) { // SSE2
+ if (Subtarget->hasSSE2()) { // SSE2
 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
 } else { // SSE1
@@ -4308,34 +4150,46 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
 }
 } else if (VT.getSizeInBits() == 256) { // AVX
- // 256-bit logic and arithmetic instructions in AVX are
- // all floating-point, no support for integer ops. Default
- // to emitting fp zeroed vectors then.
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ if (Subtarget->hasAVX2()) { // AVX2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ } else {
+ // 256-bit logic and arithmetic instructions in AVX are all
+ // floating-point, no support for integer ops. Emit fp zeroed vectors.
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ }
 }
 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
/// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
-/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
-/// original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
+ DebugLoc dl) {
 assert(VT.isVector() && "Expected a vector type");
 assert((VT.is128BitVector() || VT.is256BitVector())
 && "Expected a 128-bit or 256-bit vector type");
 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
- SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
- Cst, Cst, Cst, Cst);
-
- if (VT.is256BitVector()) {
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
- Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
- Vec = Insert128BitVector(InsV, Vec,
- DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ SDValue Vec;
+ if (VT.getSizeInBits() == 256) {
+ if (HasAVX2) { // AVX2
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ } else { // AVX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+ Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Vec = Insert128BitVector(InsV, Vec,
+ DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ }
+ } else {
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
 }
 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
@@ -4343,24 +4197,12 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
-static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- bool Changed = false;
- SmallVector<int, 8> MaskVec;
- SVOp->getMask(MaskVec);
-
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
 for (unsigned i = 0; i != NumElems; ++i) {
- if (MaskVec[i] > (int)NumElems) {
- MaskVec[i] = NumElems;
- Changed = true;
+ if (Mask[i] > (int)NumElems) {
+ Mask[i] = NumElems;
 }
 }
- if (Changed)
- return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
- SVOp->getOperand(1), &MaskVec[0]);
- return SDValue(SVOp, 0);
}
/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
@@ -4464,7 +4306,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
 // Extract the 128-bit part containing the splat element and update
 // the splat element index when it refers to the higher register.
 if (Size == 256) {
- unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+ unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
 if (Idx > 0)
 EltNo -= NumElems/2;
@@ -4496,11 +4338,12 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,7 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
- bool isZero, bool HasXMMInt,
+ bool IsZero,
+ const X86Subtarget *Subtarget,
 SelectionDAG &DAG) {
 EVT VT = V2.getValueType();
- SDValue V1 = isZero
- ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ SDValue V1 = IsZero
+ ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
 unsigned NumElems = VT.getVectorNumElements();
 SmallVector<int, 16> MaskVec;
 for (unsigned i = 0; i != NumElems; ++i)
@@ -4509,9 +4352,81 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}
+/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
+/// target specific opcode. Returns true if the Mask could be calculated.
+/// Sets IsUnary to true if it only uses one source.
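The mask built by getShuffleVectorZeroOrUndef above is easy to picture in isolation; a small sketch (plain C++, with the vector sizes hard-coded for the demo):

#include <cstdio>
#include <vector>

// Element Idx selects V2's element Idx (offset by NumElems); every other
// position picks the corresponding element of the zero/undef vector V1.
static std::vector<int> zeroOrUndefMask(unsigned NumElems, unsigned Idx) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElems; ++i)
    Mask.push_back(i == Idx ? (int)(i + NumElems) : (int)i);
  return Mask;
}

int main() {
  for (int M : zeroOrUndefMask(4, 0)) printf("%d ", M); // 4 1 2 3
  printf("\n");
  for (int M : zeroOrUndefMask(4, 3)) printf("%d ", M); // 0 1 2 7
  printf("\n");
}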
+static bool getTargetShuffleMask(SDNode *N, EVT VT, + SmallVectorImpl<int> &Mask, bool &IsUnary) { + unsigned NumElems = VT.getVectorNumElements(); + SDValue ImmN; + + IsUnary = false; + switch(N->getOpcode()) { + case X86ISD::SHUFP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, Mask); + break; + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, Mask); + break; + case X86ISD::MOVHLPS: + DecodeMOVHLPSMask(NumElems, Mask); + break; + case X86ISD::MOVLHPS: + DecodeMOVLHPSMask(NumElems, Mask); + break; + case X86ISD::PSHUFD: + case X86ISD::VPERMILP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFHW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFLW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::MOVSS: + case X86ISD::MOVSD: { + // The index 0 always comes from the first element of the second source, + // this is why MOVSS and MOVSD are used in the first place. The other + // elements come from the other positions of the first source vector + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) { + Mask.push_back(i); + } + break; + } + case X86ISD::VPERM2X128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::MOVDDUP: + case X86ISD::MOVLHPD: + case X86ISD::MOVLPD: + case X86ISD::MOVLPS: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::PALIGN: + // Not yet implemented + return false; + default: llvm_unreachable("unknown target shuffle node"); + } + + return true; +} + /// getShuffleScalarElt - Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. -static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, +static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. @@ -4522,129 +4437,34 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { - Index = SV->getMaskElt(Index); + int Elt = SV->getMaskElt(Index); - if (Index < 0) + if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - int NumElems = VT.getVectorNumElements(); - SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); - return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); + unsigned NumElems = VT.getVectorNumElements(); + SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) + : SV->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. 
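To see what a DecodePSHUFMask-style helper hands back to getTargetShuffleMask, here is a hypothetical standalone decoder, the inverse of the immediate encoding shown earlier (illustration only, not the Utils/X86ShuffleDecode implementation):

#include <cstdio>
#include <vector>

// Expands a PSHUFD-style immediate (2 bits per element) into a mask,
// replicated per 128-bit lane with lane-relative indices.
static std::vector<int> decodePSHUF(unsigned NumElts, unsigned NumLanes,
                                    unsigned Imm) {
  std::vector<int> Mask;
  unsigned NumLaneElts = NumElts / NumLanes;
  for (unsigned l = 0; l != NumElts; l += NumLaneElts)
    for (unsigned i = 0; i != NumLaneElts; ++i)
      Mask.push_back(l + ((Imm >> (2 * i)) & 3)); // lane-relative index
  return Mask;
}

int main() {
  // 0x87 was built from <3,1,0,2> (see getShuffleSHUFImmediate earlier).
  for (int M : decodePSHUF(4, 1, 0x87)) printf("%d ", M); // 3 1 0 2
  printf("\n");
  // The same immediate on a 2-lane v8i32 repeats per lane.
  for (int M : decodePSHUF(8, 2, 0x87)) printf("%d ", M); // 3 1 0 2 7 5 4 6
  printf("\n");
}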
if (isTargetShuffle(Opcode)) { - int NumElems = VT.getVectorNumElements(); - SmallVector<unsigned, 16> ShuffleMask; + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 16> ShuffleMask; SDValue ImmN; + bool IsUnary; - switch(Opcode) { - case X86ISD::SHUFPS: - case X86ISD::SHUFPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeSHUFPSMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - DecodePUNPCKHMask(NumElems, ShuffleMask); - break; - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - DecodeUNPCKHPMask(NumElems, ShuffleMask); - break; - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(VT, ShuffleMask); - break; - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: - DecodeUNPCKLPMask(VT, ShuffleMask); - break; - case X86ISD::MOVHLPS: - DecodeMOVHLPSMask(NumElems, ShuffleMask); - break; - case X86ISD::MOVLHPS: - DecodeMOVLHPSMask(NumElems, ShuffleMask); - break; - case X86ISD::PSHUFD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PSHUFHW: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PSHUFLW: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::MOVSS: - case X86ISD::MOVSD: { - // The index 0 always comes from the first element of the second source, - // this is why MOVSS and MOVSD are used in the first place. The other - // elements come from the other positions of the first source vector. - unsigned OpNum = (Index == 0) ? 1 : 0; - return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, - Depth+1); - } - case X86ISD::VPERMILPS: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPSY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPDY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERM2F128: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::MOVDDUP: - case X86ISD::MOVLHPD: - case X86ISD::MOVLPD: - case X86ISD::MOVLPS: - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: - case X86ISD::PALIGN: - return SDValue(); // Not yet implemented. - default: - assert(0 && "unknown target shuffle node"); + if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary)) return SDValue(); - } - Index = ShuffleMask[Index]; - if (Index < 0) + int Elt = ShuffleMask[Index]; + if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - SDValue NewV = (Index < NumElems) ? 
N->getOperand(0) : N->getOperand(1);
- return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
+ SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
+ : N->getOperand(1);
+ return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
 Depth+1);
 }
@@ -4660,7 +4480,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
 return (Index == 0) ? V.getOperand(0)
- : DAG.getUNDEF(VT.getVectorElementType());
+ : DAG.getUNDEF(VT.getVectorElementType());
 if (V.getOpcode() == ISD::BUILD_VECTOR)
 return V.getOperand(Index);
@@ -4672,38 +4492,37 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
/// shuffle operation which come consecutively from a zero. The
/// search can start in two different directions, from left or right.
static
-unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
 bool ZerosFromLeft, SelectionDAG &DAG) {
- int i = 0;
-
- while (i < NumElems) {
+ unsigned i;
+ for (i = 0; i != NumElems; ++i) {
 unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
- SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
+ SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
 if (!(Elt.getNode() &&
 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
 break;
- ++i;
 }
 return i;
}
-/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to
-/// MaskE correspond consecutively to elements from one of the vector operands,
+/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
+/// correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also tell OpNum which source vector operand.
static
-bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
- int OpIdx, int NumElems, unsigned &OpNum) {
+bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
+ unsigned MaskI, unsigned MaskE, unsigned OpIdx,
+ unsigned NumElems, unsigned &OpNum) {
 bool SeenV1 = false;
 bool SeenV2 = false;
- for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
+ for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
 int Idx = SVOp->getMaskElt(i);
 // Ignore undef indices
 if (Idx < 0)
 continue;
- if (Idx < NumElems)
+ if (Idx < (int)NumElems)
 SeenV1 = true;
 else
 SeenV2 = true;
@@ -4738,7 +4557,7 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
 //
 if (!isShuffleMaskConsecutive(SVOp,
 0, // Mask Start Index
- NumElems-NumZeros-1, // Mask End Index
+ NumElems-NumZeros, // Mask End Index (exclusive)
 NumZeros, // Where to start looking in the src vector
 NumElems, // Number of elements in vector
 OpSrc)) // Which source operand ?
@@ -4771,7 +4590,7 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
 //
 if (!isShuffleMaskConsecutive(SVOp,
 NumZeros, // Mask Start Index
- NumElems-1, // Mask End Index
+ NumElems, // Mask End Index (exclusive)
 0, // Where to start looking in the src vector
 NumElems, // Number of elements in vector
 OpSrc)) // Which source operand ?
@@ -4804,6 +4623,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); @@ -4815,7 +4635,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG, dl); + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; @@ -4851,6 +4671,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); @@ -4863,7 +4684,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (isNonZero) { if (First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG, dl); + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; @@ -4884,7 +4705,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, const TargetLowering &TLI, DebugLoc dl) { assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; - unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; + unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, @@ -4952,21 +4773,16 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, int EltNo = (Offset - StartOffset) >> 2; int NumElems = VT.getVectorNumElements(); - EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), - false, false, 0); + false, false, false, 0); - // Canonicalize it to a v4i32 or v8i32 shuffle. 
SmallVector<int, 8> Mask;
 for (int i = 0; i < NumElems; ++i)
 Mask.push_back(EltNo);
- V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
- return DAG.getNode(ISD::BITCAST, dl, NVT,
- DAG.getVectorShuffle(CanonVT, dl, V1,
- DAG.getUNDEF(CanonVT),&Mask[0]));
+ return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
 }
 return SDValue();
@@ -5021,11 +4837,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
 LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), 0);
 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
 LDBase->getPointerInfo(),
 LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->getAlignment());
+ LDBase->isInvariant(), LDBase->getAlignment());
 } else if (NumElems == 4 && LastLoadedElt == 1 &&
 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
@@ -5041,6 +4858,137 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 return SDValue();
}
+/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
+/// to generate a splat value for the following cases:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+/// The VBROADCAST node is returned when a pattern is found,
+/// or SDValue() otherwise.
+SDValue
+X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue Ld;
+ bool ConstSplatVal;
+
+ switch (Op.getOpcode()) {
+ default:
+ // Unknown pattern found.
+ return SDValue();
+
+ case ISD::BUILD_VECTOR: {
+ // The BUILD_VECTOR node must be a splat.
+ if (!isSplatVector(Op.getNode()))
+ return SDValue();
+
+ Ld = Op.getOperand(0);
+ ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+ Ld.getOpcode() == ISD::ConstantFP);
+
+ // The suspected load node has several users. Make sure that all
+ // of its users are from the BUILD_VECTOR node.
+ // Constants may have multiple users.
+ if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+ return SDValue();
+ break;
+ }
+
+ case ISD::VECTOR_SHUFFLE: {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+ // Shuffles must have a splat mask where the first element is
+ // broadcasted.
+ if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+ return SDValue();
+
+ SDValue Sc = Op.getOperand(0);
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return SDValue();
+
+ Ld = Sc.getOperand(0);
+ ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+ Ld.getOpcode() == ISD::ConstantFP);
+
+ // The scalar_to_vector node and the suspected
+ // load node must have exactly one user.
+ // Constants may have multiple users.
+ if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+ return SDValue();
+ break;
+ }
+ }
+
+ bool Is256 = VT.getSizeInBits() == 256;
+ bool Is128 = VT.getSizeInBits() == 128;
+
+ // Handle broadcasting a single constant scalar from the constant pool
+ // into a vector. On Sandybridge it is still better to load a constant vector
+ // from the constant pool and not to broadcast it from a scalar.
+ if (ConstSplatVal && Subtarget->hasAVX2()) { + EVT CVT = Ld.getValueType(); + assert(!CVT.isVector() && "Must not broadcast a vector type"); + unsigned ScalarSize = CVT.getSizeInBits(); + + if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || + (Is128 && (ScalarSize == 32))) { + + const Constant *C = 0; + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) + C = CI->getConstantIntValue(); + else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) + C = CF->getConstantFPValue(); + + assert(C && "Invalid constant type"); + + SDValue CP = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + } + + // The scalar source must be a normal load. + if (!ISD::isNormalLoad(Ld.getNode())) + return SDValue(); + + // Reject loads that have uses of the chain result + if (Ld->hasAnyUseOfValue(1)) + return SDValue(); + + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + + // VBroadcast to YMM + if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 32)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // The integer check is needed for the 64-bit into 128-bit so it doesn't match + // double since there is vbroadcastsd xmm + if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { + // VBroadcast to YMM + if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + + // Unsupported broadcast. + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -5053,22 +5001,26 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (Op.getValueType() == MVT::v4i32 || - Op.getValueType() == MVT::v8i32) + if (VT == MVT::v4i32 || VT == MVT::v8i32) return Op; - return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl); + return getZeroVector(VT, Subtarget, DAG, dl); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width - // vectors or broken into v4i32 operations on 256-bit vectors. + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use + // vpcmpeqd on 256-bit vectors. if (ISD::isBuildVectorAllOnes(Op.getNode())) { - if (Op.getValueType() == MVT::v4i32) + if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2())) return Op; - return getOnesVector(Op.getValueType(), DAG, dl); + return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); } + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; + unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -5118,8 +5070,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). 
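The eligibility rules spread through LowerVectorBroadcast above can be distilled into a single predicate. A hypothetical summary, assuming the scalar already passed the load and use-count checks (illustration only):

#include <cstdio>

static bool canUseVBroadcast(bool HasAVX, bool HasAVX2, bool Is256,
                             unsigned ScalarSize, bool IsInteger) {
  if (!HasAVX)
    return false;
  // vbroadcastss/vbroadcastsd cover 32/64-bit into ymm and 32-bit into xmm.
  if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
    return true;
  if (!Is256 && ScalarSize == 32)
    return true;
  // AVX2 extends this for integer scalars (vpbroadcastb/w/q territory).
  if (HasAVX2 && IsInteger) {
    if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
      return true;
    if (!Is256 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
      return true;
  }
  return false;
}

int main() {
  // i64 splat into a 128-bit vector: accepted only with AVX2.
  printf("%d\n", canUseVBroadcast(true, false, false, 64, true)); // 0
  printf("%d\n", canUseVBroadcast(true, true, false, 64, true));  // 1
}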
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasXMMInt(), DAG); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of // a vector. If Idx != 0, swizzle it into place. @@ -5132,7 +5083,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(Item.getValueType()), &Mask[0]); } - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); + return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5141,21 +5092,33 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { - if (NumZero == 0) { + if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || + + if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + if (VT.getSizeInBits() == 256) { + SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, + Item, DAG.getIntPtrConstant(0)); + } + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. - return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(), - DAG); - } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + + if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); - EVT MiddleVT = MVT::v4i32; - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasXMMInt(), DAG); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + if (VT.getSizeInBits() == 256) { + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32), + DAG, dl); + } else { + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5183,8 +5146,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a shuffle of zero and zero-extended scalar to vector. - Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, - Subtarget->hasXMMInt(), DAG); + Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; for (unsigned i = 0; i < NumElems; i++) MaskVec.push_back(i == Idx ? 0 : 1); @@ -5214,9 +5176,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. 
- if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { + if (VT.getSizeInBits() == 256) { SmallVector<SDValue, 32> V; - for (unsigned i = 0; i < NumElems; ++i) + for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); @@ -5240,8 +5202,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Idx = CountTrailingZeros_32(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); - return getShuffleVectorZeroOrUndef(V2, Idx, true, - Subtarget->hasXMMInt(), DAG); + return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } @@ -5249,24 +5210,23 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) { SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + Subtarget, *this); if (V.getNode()) return V; } if (EVTBits == 16 && NumElems == 8) { SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + Subtarget, *this); if (V.getNode()) return V; } // If element VT is == 32 bits, turn it into a number of shuffles. - SmallVector<SDValue, 8> V; - V.resize(NumElems); + SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1 << i)); if (isZero) - V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + V[i] = getZeroVector(VT, Subtarget, DAG, dl); else V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } @@ -5289,13 +5249,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - SmallVector<int, 8> MaskVec; - bool Reverse = (NonZeros & 0x3) == 2; - for (unsigned i = 0; i < 2; ++i) - MaskVec.push_back(Reverse ? 1-i : i); - Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; - for (unsigned i = 0; i < 2; ++i) - MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); + bool Reverse1 = (NonZeros & 0x3) == 2; + bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; + int MaskVec[] = { + Reverse1 ? 1 : 0, + Reverse1 ? 0 : 1, + static_cast<int>(Reverse2 ? NumElems+1 : NumElems), + static_cast<int>(Reverse2 ? NumElems : NumElems+1) + }; return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } @@ -5310,7 +5271,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return LD; // For SSE 4.1, use insertps to put the high elements into the low element. - if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) { + if (getSubtarget()->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -5422,6 +5383,85 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return LowerAVXCONCAT_VECTORS(Op, DAG); } +// Try to lower a shuffle node into a simple blend instruction. 
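The blend lowering that follows recognizes masks where every lane keeps its position and merely chooses between the two inputs, which is exactly what the SSE4.1 blend family encodes in an immediate. Reference semantics of the underlying instruction, where a set immediate bit selects the second source (shown via the intrinsic, not the X86ISD node convention):

```cpp
#include <smmintrin.h>

__m128 blend_example(__m128 a, __m128 b) {
  // imm = 0b0101: lanes 0 and 2 come from b, lanes 1 and 3 from a.
  return _mm_blend_ps(a, b, 0x5);       // { b0, a1, b2, a3 }
}
```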
+static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + EVT VT = Op.getValueType(); + EVT InVT = V1.getValueType(); + int MaskSize = VT.getVectorNumElements(); + int InSize = InVT.getVectorNumElements(); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (MaskSize != InSize) + return SDValue(); + + int ISDNo = 0; + MVT OpTy; + + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v8i16: + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v8i16; + break; + case MVT::v4i32: + case MVT::v4f32: + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v4f32; + break; + case MVT::v2i64: + case MVT::v2f64: + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v2f64; + break; + case MVT::v8i32: + case MVT::v8f32: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v8f32; + break; + case MVT::v4i64: + case MVT::v4f64: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v4f64; + break; + case MVT::v16i16: + if (!Subtarget->hasAVX2()) + return SDValue(); + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v16i16; + break; + } + assert(ISDNo && "Invalid Op Number"); + + unsigned MaskVals = 0; + + for (int i = 0; i < MaskSize; ++i) { + int EltIdx = SVOp->getMaskElt(i); + if (EltIdx == i || EltIdx == -1) + MaskVals |= (1<<i); + else if (EltIdx == (i + MaskSize)) + continue; // Bit is set to zero; + else return SDValue(); + } + + V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2); + SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2, + DAG.getConstant(MaskVals, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + // v8i16 shuffles - Prefer shuffles in the following order: // 1. [all] pshuflw, pshufhw, optional move // 2. [ssse3] 1 x pshufb @@ -5439,11 +5479,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // Determine if more than 1 of the words in each of the low and high quadwords // of the result come from the same quadword of one of the two inputs. Undef // mask values count as coming from any quadword, for better codegen. - SmallVector<unsigned, 4> LoQuad(4); - SmallVector<unsigned, 4> HiQuad(4); - BitVector InputQuads(4); + unsigned LoQuad[] = { 0, 0, 0, 0 }; + unsigned HiQuad[] = { 0, 0, 0, 0 }; + std::bitset<4> InputQuads; for (unsigned i = 0; i < 8; ++i) { - SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; + unsigned *Quad = i < 4 ? LoQuad : HiQuad; int EltIdx = SVOp->getMaskElt(i); MaskVals.push_back(EltIdx); if (EltIdx < 0) { @@ -5481,10 +5521,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3()) { if (InputQuads.count() == 2 && V1Used && V2Used) { - BestLoQuad = InputQuads.find_first(); - BestHiQuad = InputQuads.find_next(BestLoQuad); + BestLoQuad = InputQuads[0] ? 0 : 1; + BestHiQuad = InputQuads[2] ? 2 : 3; } if (InputQuads.count() > 2) { BestLoQuad = -1; @@ -5497,9 +5537,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // words from all 4 input quadwords. SDValue NewV; if (BestLoQuad >= 0 || BestHiQuad >= 0) { - SmallVector<int, 8> MaskV; - MaskV.push_back(BestLoQuad < 0 ? 
0 : BestLoQuad); - MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); + int MaskV[] = { + BestLoQuad < 0 ? 0 : BestLoQuad, + BestHiQuad < 0 ? 1 : BestHiQuad + }; NewV = DAG.getVectorShuffle(MVT::v2i64, dl, DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); @@ -5544,8 +5585,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, unsigned TargetMask = 0; NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); - TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): - X86::getShufflePSHUFLWImmediate(NewV.getNode()); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); + TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): + getShufflePSHUFLWImmediate(SVOp); V1 = NewV.getOperand(0); return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); } @@ -5554,7 +5596,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -5602,61 +5644,51 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, // and update MaskVals with new element order. - BitVector InOrder(8); + std::bitset<8> InOrder; if (BestLoQuad >= 0) { - SmallVector<int, 8> MaskV; + int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; for (int i = 0; i != 4; ++i) { int idx = MaskVals[i]; if (idx < 0) { - MaskV.push_back(-1); InOrder.set(i); } else if ((idx / 4) == BestLoQuad) { - MaskV.push_back(idx & 3); + MaskV[i] = idx & 3; InOrder.set(i); - } else { - MaskV.push_back(-1); } } - for (unsigned i = 4; i != 8; ++i) - MaskV.push_back(i); NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, - NewV.getOperand(0), - X86::getShufflePSHUFLWImmediate(NewV.getNode()), - DAG); + NewV.getOperand(0), + getShufflePSHUFLWImmediate(SVOp), DAG); + } } // If BestHi >= 0, generate a pshufhw to put the high elements in order, // and update MaskVals with the new element order. 
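The getShufflePSHUF{H,L}WImmediate calls above pack four 2-bit word selectors into a single byte; the other quadword passes through untouched. A sketch of the packing (not the actual helpers; undef lanes are treated as "keep position" here, which is one legal choice):

```cpp
// Pack a 4-element word mask into a PSHUFLW-style immediate; PSHUFHW is the
// mirror image for words 4-7.
static unsigned packPSHUFImm(const int M[4]) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i)
    Imm |= (M[i] < 0 ? i : (M[i] & 3)) << (2 * i);
  return Imm;                           // e.g. {1,0,3,2} -> 0xB1
}
```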
if (BestHiQuad >= 0) { - SmallVector<int, 8> MaskV; - for (unsigned i = 0; i != 4; ++i) - MaskV.push_back(i); + int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; for (unsigned i = 4; i != 8; ++i) { int idx = MaskVals[i]; if (idx < 0) { - MaskV.push_back(-1); InOrder.set(i); } else if ((idx / 4) == BestHiQuad) { - MaskV.push_back((idx & 3) + 4); + MaskV[i] = (idx & 3) + 4; InOrder.set(i); - } else { - MaskV.push_back(-1); } } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, - NewV.getOperand(0), - X86::getShufflePSHUFHWImmediate(NewV.getNode()), - DAG); + NewV.getOperand(0), + getShufflePSHUFHWImmediate(SVOp), DAG); + } } // In case BestHi & BestLo were both -1, which means each quadword has a word @@ -5698,8 +5730,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - SmallVector<int, 16> MaskVals; - SVOp->getMask(MaskVals); + ArrayRef<int> MaskVals = SVOp->getMask(); // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is @@ -5718,7 +5749,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. - if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) { + if (TLI.getSubtarget()->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If all result elements are from one input vector, then only translate @@ -5849,7 +5880,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, unsigned NewWidth = (NumElems == 4) ? 2 : 4; EVT NewVT; switch (VT.getSimpleVT().SimpleTy) { - default: assert(false && "Unexpected!"); + default: llvm_unreachable("Unexpected!"); case MVT::v4f32: NewVT = MVT::v2f64; break; case MVT::v4i32: NewVT = MVT::v2i64; break; case MVT::v8i16: NewVT = MVT::v4i32; break; @@ -5915,96 +5946,89 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, OpVT, SrcOp))); } -/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector -/// shuffle node referes to only one lane in the sources. -static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - int HalfSize = NumElems/2; - SmallVector<int, 16> M; - SVOp->getMask(M); - bool MatchA = false, MatchB = false; - - for (int l = 0; l < NumElems*2; l += HalfSize) { - if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { - MatchA = true; - break; - } - } - - for (int l = 0; l < NumElems*2; l += HalfSize) { - if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { - MatchB = true; - break; - } - } - - return MatchA && MatchB; -} - /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles /// which could not be matched by any known target speficic shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - if (areShuffleHalvesWithinDisjointLanes(SVOp)) { - // If each half of a vector shuffle node referes to only one lane in the - // source vectors, extract each used 128-bit lane and shuffle them using - // 128-bit shuffles. 
Then, concatenate the results. Otherwise leave - // the work to the legalizer. - DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - int HalfSize = NumElems/2; - - // Extract the reference for each half - int FstVecExtractIdx = 0, SndVecExtractIdx = 0; - int FstVecOpNum = 0, SndVecOpNum = 0; - for (int i = 0; i < HalfSize; ++i) { - int Elt = SVOp->getMaskElt(i); - if (SVOp->getMaskElt(i) < 0) - continue; - FstVecOpNum = Elt/NumElems; - FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; - break; - } - for (int i = HalfSize; i < NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (SVOp->getMaskElt(i) < 0) + EVT VT = SVOp->getValueType(0); + + unsigned NumElems = VT.getVectorNumElements(); + unsigned NumLaneElems = NumElems / 2; + + DebugLoc dl = SVOp->getDebugLoc(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); + SDValue Shufs[2]; + + SmallVector<int, 16> Mask; + for (unsigned l = 0; l < 2; ++l) { + // Build a shuffle mask for the output, discovering on the fly which + // input vectors to use as shuffle operands (recorded in InputUsed). + // If building a suitable shuffle vector proves too hard, then bail + // out with useBuildVector set. + int InputUsed[2] = { -1, -1 }; // Not yet discovered. + unsigned LaneStart = l * NumLaneElems; + for (unsigned i = 0; i != NumLaneElems; ++i) { + // The mask element. This indexes into the input. + int Idx = SVOp->getMaskElt(i+LaneStart); + if (Idx < 0) { + // the mask element does not index into any input vector. + Mask.push_back(-1); continue; - SndVecOpNum = Elt/NumElems; - SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; - break; - } + } - // Extract the subvectors - SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), - DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); - SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), - DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); + // The input vector this mask element indexes into. + int Input = Idx / NumLaneElems; - // Generate 128-bit shuffles - SmallVector<int, 16> MaskV1, MaskV2; - for (int i = 0; i < HalfSize; ++i) { - int Elt = SVOp->getMaskElt(i); - MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize); - } - for (int i = HalfSize; i < NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); + // Turn the index into an offset from the start of the input vector. + Idx -= Input * NumLaneElems; + + // Find or create a shuffle vector operand to hold this input. + unsigned OpNo; + for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { + if (InputUsed[OpNo] == Input) + // This input vector is already an operand. + break; + if (InputUsed[OpNo] < 0) { + // Create a new operand for this input vector. + InputUsed[OpNo] = Input; + break; + } + } + + if (OpNo >= array_lengthof(InputUsed)) { + // More than two input vectors used! Give up. + return SDValue(); + } + + // Add the mask index for the new shuffle vector. + Mask.push_back(Idx + OpNo * NumLaneElems); } - EVT NVT = V1.getValueType(); - V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); - V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); + if (InputUsed[0] < 0) { + // No input vectors were used! The result is undefined. 
+ Shufs[l] = DAG.getUNDEF(NVT); + } else { + SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), + DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // If only one input was used, use an undefined vector for the other. + SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : + Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), + DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // At least one input vector was used. Create a new shuffle vector. + Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); + } - // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + Mask.clear(); } - return SDValue(); + // Concatenate the result back + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0], + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32), + DAG, dl); } /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with @@ -6018,11 +6042,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); - SmallVector<std::pair<int, int>, 8> Locs; - Locs.resize(4); - SmallVector<int, 8> Mask1(4U, -1); - SmallVector<int, 8> PermMask; - SVOp->getMask(PermMask); + std::pair<int, int> Locs[4]; + int Mask1[] = { -1, -1, -1, -1 }; + SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); unsigned NumHi = 0; unsigned NumLo = 0; @@ -6052,17 +6074,14 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // vector operands, put the elements into the right order. V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - SmallVector<int, 8> Mask2(4U, -1); + int Mask2[] = { -1, -1, -1, -1 }; - for (unsigned i = 0; i != 4; ++i) { - if (Locs[i].first == -1) - continue; - else { + for (unsigned i = 0; i != 4; ++i) + if (Locs[i].first != -1) { unsigned Idx = (i < 2) ? 0 : 4; Idx += Locs[i].first * 2 + Locs[i].second; Mask2[i] = Idx; } - } return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); } else if (NumLo == 3 || NumHi == 3) { @@ -6075,7 +6094,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // from X. if (NumHi == 3) { // Normalize it so the 3 elements come from V1. - CommuteVectorShuffleMask(PermMask, VT); + CommuteVectorShuffleMask(PermMask, 4); std::swap(V1, V2); } @@ -6115,18 +6134,16 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } // Break it into (shuffle shuffle_hi, shuffle_lo). 
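The Insert128BitVector pair above is the DAG spelling of rebuilding a YMM value from two XMM halves, the first insert landing in an UNDEF vector. In AVX intrinsics (float lanes assumed) the same concatenation looks like:

```cpp
#include <immintrin.h>

__m256 concat128(__m128 lo, __m128 hi) {
  __m256 v = _mm256_castps128_ps256(lo);  // low lane defined, high lane undef
  return _mm256_insertf128_ps(v, hi, 1);  // vinsertf128 $1 fills the high lane
}
```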
- Locs.clear(); - Locs.resize(4); - SmallVector<int,8> LoMask(4U, -1); - SmallVector<int,8> HiMask(4U, -1); + int LoMask[] = { -1, -1, -1, -1 }; + int HiMask[] = { -1, -1, -1, -1 }; - SmallVector<int,8> *MaskPtr = &LoMask; + int *MaskPtr = LoMask; unsigned MaskIdx = 0; unsigned LoIdx = 0; unsigned HiIdx = 2; for (unsigned i = 0; i != 4; ++i) { if (i == 2) { - MaskPtr = &HiMask; + MaskPtr = HiMask; MaskIdx = 1; LoIdx = 0; HiIdx = 2; @@ -6136,26 +6153,21 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { Locs[i] = std::make_pair(-1, -1); } else if (Idx < 4) { Locs[i] = std::make_pair(MaskIdx, LoIdx); - (*MaskPtr)[LoIdx] = Idx; + MaskPtr[LoIdx] = Idx; LoIdx++; } else { Locs[i] = std::make_pair(MaskIdx, HiIdx); - (*MaskPtr)[HiIdx] = Idx; + MaskPtr[HiIdx] = Idx; HiIdx++; } } SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); - SmallVector<int, 8> MaskOps; - for (unsigned i = 0; i != 4; ++i) { - if (Locs[i].first == -1) { - MaskOps.push_back(-1); - } else { - unsigned Idx = Locs[i].first * 4 + Locs[i].second; - MaskOps.push_back(Idx); - } - } + int MaskOps[] = { -1, -1, -1, -1 }; + for (unsigned i = 0; i != 4; ++i) + if (Locs[i].first != -1) + MaskOps[i] = Locs[i].first * 4 + Locs[i].second; return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); } @@ -6164,6 +6176,10 @@ static bool MayFoldVectorLoad(SDValue V) { V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && + V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) + // BUILD_VECTOR (load), undef + V = V.getOperand(0); if (MayFoldLoad(V)) return true; return false; @@ -6186,82 +6202,6 @@ static bool RelaxedMayFoldVectorLoad(SDValue V) { return false; } -/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by -/// a vector extract, and if both can be later optimized into a single load. -/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked -/// here because otherwise a target specific shuffle node is going to be -/// emitted for this shuffle, and the optimization not done. -/// FIXME: This is probably not the best approach, but fix the problem -/// until the right path is decided. -static -bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, - const TargetLowering &TLI) { - EVT VT = V.getValueType(); - ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); - - // Be sure that the vector shuffle is present in a pattern like this: - // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) - if (!V.hasOneUse()) - return false; - - SDNode *N = *V.getNode()->use_begin(); - if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return false; - - SDValue EltNo = N->getOperand(1); - if (!isa<ConstantSDNode>(EltNo)) - return false; - - // If the bit convert changed the number of elements, it is unsafe - // to examine the mask. - bool HasShuffleIntoBitcast = false; - if (V.getOpcode() == ISD::BITCAST) { - EVT SrcVT = V.getOperand(0).getValueType(); - if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) - return false; - V = V.getOperand(0); - HasShuffleIntoBitcast = true; - } - - // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = VT.getVectorNumElements(); - unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > NumElems) ? 
-1 : SVOp->getMaskElt(Elt); - V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); - - // Skip one more bit_convert if necessary - if (V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (ISD::isNormalLoad(V.getNode())) { - // Is the original load suitable? - LoadSDNode *LN0 = cast<LoadSDNode>(V); - - // FIXME: avoid the multi-use bug that is preventing lots of - // of foldings to be detected, this is still wrong of course, but - // give the temporary desired behavior, and if it happens that - // the load has real more uses, during isel it will not fold, and - // will generate poor code. - if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() - return false; - - if (!HasShuffleIntoBitcast) - return true; - - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. - unsigned Align = LN0->getAlignment(); - unsigned NewAlign = - TLI.getTargetData()->getABITypeAlignment( - VT.getTypeForEVT(*DAG.getContext())); - - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) - return false; - } - - return true; -} - static SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { EVT VT = Op.getValueType(); @@ -6275,14 +6215,14 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { static SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, - bool HasXMMInt) { + bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); assert(VT != MVT::v2i64 && "unsupported shuffle type"); - if (HasXMMInt && VT == MVT::v2f64) + if (HasSSE2 && VT == MVT::v2f64) return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1) @@ -6308,24 +6248,8 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); } -static inline unsigned getSHUFPOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: - case MVT::v4i32: // Use fp unit for int unpack. - case MVT::v4f32: return X86ISD::SHUFPS; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: - case MVT::v2i64: // Use fp unit for int unpack. - case MVT::v2f64: return X86ISD::SHUFPD; - default: - llvm_unreachable("Unknown type for shufp*"); - } - return 0; -} - static -SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { +SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); @@ -6346,32 +6270,30 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { // turns into: // (MOVLPSmr addr:$src1, VR128:$src2) // So, recognize this potential and also use MOVLPS or MOVLPD - if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) + else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) CanFoldLoad = true; - // Both of them can't be memory operations though. - if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) - CanFoldLoad = false; - + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); if (CanFoldLoad) { - if (HasXMMInt && NumElems == 2) + if (HasSSE2 && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) - return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); + // If we don't care about the second element, proceed to use movss.
+ if (SVOp->getMaskElt(1) != -1) + return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); } - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); // movl and movlp will both match v2i64, but v2i64 is never matched by // movl earlier because we make it strict to avoid messing with the movlp load // folding logic (see the code above getMOVLP call). Match it here then, // this is horrible, but will stay like this until we move all shuffle // matching to x86 specific nodes. Note that for the 1st condition all // types are matched with movsd. - if (HasXMMInt) { + if (HasSSE2) { // FIXME: isMOVLMask should be checked and matched before getMOVLP, // as to remove this logic from here, as much as possible - if (NumElems == 2 || !X86::isMOVLMask(SVOp)) + if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); } @@ -6379,112 +6301,12 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { assert(VT != MVT::v4i32 && "unsupported shuffle type"); // Invert the operand order and use SHUFPS to match it. - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1, - X86::getShuffleSHUFImmediate(SVOp), DAG); -} - -static inline unsigned getUNPCKLOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKLDQ; - case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v2f64: return X86ISD::UNPCKLPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKLPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKLPDY; - case MVT::v16i8: return X86ISD::PUNPCKLBW; - case MVT::v8i16: return X86ISD::PUNPCKLWD; - default: - llvm_unreachable("Unknown type for unpckl"); - } - return 0; -} - -static inline unsigned getUNPCKHOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKHDQ; - case MVT::v2i64: return X86ISD::PUNPCKHQDQ; - case MVT::v4f32: return X86ISD::UNPCKHPS; - case MVT::v2f64: return X86ISD::UNPCKHPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKHPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKHPDY; - case MVT::v16i8: return X86ISD::PUNPCKHBW; - case MVT::v8i16: return X86ISD::PUNPCKHWD; - default: - llvm_unreachable("Unknown type for unpckh"); - } - return 0; -} - -static inline unsigned getVPERMILOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: - case MVT::v4f32: return X86ISD::VPERMILPS; - case MVT::v2i64: - case MVT::v2f64: return X86ISD::VPERMILPD; - case MVT::v8i32: - case MVT::v8f32: return X86ISD::VPERMILPSY; - case MVT::v4i64: - case MVT::v4f64: return X86ISD::VPERMILPDY; - default: - llvm_unreachable("Unknown type for vpermil"); - } - return 0; -} - -/// isVectorBroadcast - Check if the node chain is suitable to be xformed to -/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming -/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. 
-static bool isVectorBroadcast(SDValue &Op) { - EVT VT = Op.getValueType(); - bool Is256 = VT.getSizeInBits() == 256; - - assert((VT.getSizeInBits() == 128 || Is256) && - "Unsupported type for vbroadcast node"); - - SDValue V = Op; - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (Is256 && !(V.hasOneUse() && - V.getOpcode() == ISD::INSERT_SUBVECTOR && - V.getOperand(0).getOpcode() == ISD::UNDEF)) - return false; - - if (Is256) - V = V.getOperand(1); - - if (!V.hasOneUse()) - return false; - - // Check the source scalar_to_vector type. 256-bit broadcasts are - // supported for 32/64-bit sizes, while 128-bit ones are only supported - // for 32-bit scalars. - if (V.getOpcode() != ISD::SCALAR_TO_VECTOR) - return false; - - unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); - if (ScalarSize != 32 && ScalarSize != 64) - return false; - if (!Is256 && ScalarSize == 64) - return false; - - V = V.getOperand(0); - if (!MayFoldLoad(V)) - return false; - - // Return the load node - Op = V; - return true; + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, + getShuffleSHUFImmediate(SVOp), DAG); } -static -SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, - const TargetLowering &TLI, - const X86Subtarget *Subtarget) { +SDValue +X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); @@ -6492,22 +6314,17 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, SDValue V2 = Op.getOperand(1); if (isZeroShuffle(SVOp)) - return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + return getZeroVector(VT, Subtarget, DAG, dl); // Handle splat operations if (SVOp->isSplat()) { unsigned NumElem = VT.getVectorNumElements(); int Size = VT.getSizeInBits(); - // Special case, this is the only place now where it's allowed to return - // a vector_shuffle operation without using a target specific node, because - // *hopefully* it will be optimized away by the dag combiner. FIXME: should - // this be moved to DAGCombine instead? - if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) - return Op; // Use vbroadcast whenever the splat comes from a foldable load - if (Subtarget->hasAVX() && isVectorBroadcast(V1)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1); + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || @@ -6525,21 +6342,26 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || - (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) { + (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
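"Use movq to zero out the top part" refers to the register-to-register form of MOVQ, which copies the low 64 bits and clears the high 64; that is the vzext-movl pattern the narrowed shuffle is matched against. In intrinsics:

```cpp
#include <emmintrin.h>

__m128i vzext_movl(__m128i v) {
  return _mm_move_epi64(v);             // movq: { v.lo64, 0 }
}
```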
if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) { - if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) - return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), + EVT NewVT = NewOp.getValueType(); + if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), + NewVT, true, false)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); - if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) - return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), - DAG, Subtarget, dl); + if (NewOp.getNode()) { + EVT NewVT = NewOp.getValueType(); + if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(1), + DAG, Subtarget, dl); + } } } return SDValue(); @@ -6553,18 +6375,22 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); - bool isMMX = VT.getSizeInBits() == 64; bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; - bool HasXMMInt = Subtarget->hasXMMInt(); + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasAVX = Subtarget->hasAVX(); + bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); - // Shuffle operations on MMX not supported. - if (isMMX) - return Op; + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); // Vector shuffle lowering takes 3 steps: // @@ -6576,50 +6402,54 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // so the shuffle can be broken into other shuffles and the legalizer can // try the lowering again. // - // The general ideia is that no vector_shuffle operation should be left to + // The general idea is that no vector_shuffle operation should be left to // be matched during isel, all of them must be converted to a target specific // node here. // Normalize the input vectors. Here splats, zeroed vectors, profitable // narrowing and commutation of operands should be handled. The actual code // doesn't include all of those, work in progress... - SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); + SDValue NewOp = NormalizeVectorShuffle(Op, DAG); if (NewOp.getNode()) return NewOp; + SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); + // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. 
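The OptForSize preference in the note exists because the unpack encodings carry no immediate byte, unlike pshufd. Unpacking a register with itself duplicates one half's elements, which is why the unpck*_v_undef masks also serve as splat-like shuffles (sketch):

```cpp
#include <xmmintrin.h>

__m128 unpckl_self(__m128 v) {
  return _mm_unpacklo_ps(v, v);         // { v0, v0, v1, v1 }
}
```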
- if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); - if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isMOVDDUPMask(SVOp) && - (Subtarget->hasSSE3() || Subtarget->hasAVX()) && + if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); - if (X86::isMOVHLPS_v_undef_Mask(SVOp)) + if (isMOVHLPS_v_undef_Mask(M, VT)) return getMOVHighToLow(Op, dl, DAG); // Use to match splats - if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef && + if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isPSHUFDMask(SVOp)) { + if (isPSHUFDMask(M, VT)) { // The actual implementation will match the mask in the if above and then // during isel it can match several different instructions, not only pshufd // as its name says, sad but true, emulate the behavior for now... - if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) - return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) + return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + + unsigned TargetMask = getShuffleSHUFImmediate(SVOp); - unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG); - if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32)) + if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1, + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, TargetMask, DAG); } @@ -6627,8 +6457,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool isLeft = false; unsigned ShAmt = 0; SDValue ShVal; - bool isShift = getSubtarget()->hasXMMInt() && - isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. 
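A shuffle that slides elements toward one end and feeds zeros in behind them is what isVectorShift recognizes; it becomes a whole-register byte shift. PSLLDQ/PSRLDQ count in bytes, which is why the element shift amount gets scaled by the element width before emitting the node. Illustratively:

```cpp
#include <emmintrin.h>

__m128i slide_one_i32(__m128i v) {
  return _mm_slli_si128(v, 4);          // pslldq $4: { 0, v0, v1, v2 }
}
```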
@@ -6637,13 +6466,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } - if (X86::isMOVLMask(SVOp)) { - if (V1IsUndef) - return V2; + if (isMOVLMask(M, VT)) { if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!X86::isMOVLPMask(SVOp)) { - if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64)) + if (!isMOVLPMask(M, VT)) { + if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); if (VT == MVT::v4i32 || VT == MVT::v4f32) @@ -6652,27 +6479,27 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // FIXME: fold these into legal mask. - if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) - return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); + if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2)) + return getMOVLowToHigh(Op, dl, DAG, HasSSE2); - if (X86::isMOVHLPSMask(SVOp)) + if (isMOVHLPSMask(M, VT)) return getMOVHighToLow(Op, dl, DAG); - if (X86::isMOVSHDUPMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); - if (X86::isMOVSLDUPMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); - if (X86::isMOVLPMask(SVOp)) - return getMOVLP(Op, dl, DAG, HasXMMInt); + if (isMOVLPMask(M, VT)) + return getMOVLP(Op, dl, DAG, HasSSE2); - if (ShouldXformToMOVHLPS(SVOp) || - ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) + if (ShouldXformToMOVHLPS(M, VT) || + ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) return CommuteVectorShuffle(SVOp, DAG); if (isShift) { - // No better options. Use a vshl / vsrl. + // No better options. Use a vshldq / vsrldq. EVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); @@ -6685,17 +6512,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { V2IsSplat = isSplatVector(V2.getNode()); // Canonicalize the splat or undef, if present, to be on the RHS. - if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { - Op = CommuteVectorShuffle(SVOp, DAG); - SVOp = cast<ShuffleVectorSDNode>(Op); - V1 = SVOp->getOperand(0); - V2 = SVOp->getOperand(1); + if (!V2IsUndef && V1IsSplat && !V2IsSplat) { + CommuteVectorShuffleMask(M, NumElems); + std::swap(V1, V2); std::swap(V1IsSplat, V2IsSplat); - std::swap(V1IsUndef, V2IsUndef); Commuted = true; } - if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { + if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { // Shuffling low element of v1 into undef, just return v1. if (V2IsUndef) return V1; @@ -6705,81 +6529,77 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKLMask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first // element then try to match unpck{h|l} again. 
If match, return a - // new vector_shuffle with the corrected mask. - SDValue NewMask = NormalizeMask(SVOp, DAG); - ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); - if (NSVOp != SVOp) { - if (X86::isUNPCKLMask(NSVOp, true)) { - return NewMask; - } else if (X86::isUNPCKHMask(NSVOp, true)) { - return NewMask; - } + // new vector_shuffle with the corrected mask. + SmallVector<int, 8> NewMask(M.begin(), M.end()); + NormalizeMask(NewMask, NumElems); + if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) { + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); + } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) { + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } } if (Commuted) { // Commute it back and try unpck* again. // FIXME: this seems wrong. - SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); - ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); + CommuteVectorShuffleMask(M, NumElems); + std::swap(V1, V2); + std::swap(V1IsSplat, V2IsSplat); + Commuted = false; - if (X86::isUNPCKLMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } // Normalize the node to match x86 shuffle ops if needed - if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) + if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore.
- SmallVector<int, 16> M; - SVOp->getMask(M); - if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (isPALIGNRMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, - X86::getShufflePALIGNRImmediate(SVOp), + getShufflePALIGNRImmediate(SVOp), DAG); if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); - if (VT == MVT::v2i64) - return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } if (isPSHUFHWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, - X86::getShufflePSHUFHWImmediate(SVOp), + getShufflePSHUFHWImmediate(SVOp), DAG); if (isPSHUFLWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, - X86::getShufflePSHUFLWImmediate(SVOp), + getShufflePSHUFLWImmediate(SVOp), DAG); - if (isSHUFPMask(M, VT)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - X86::getShuffleSHUFImmediate(SVOp), DAG); + if (isSHUFPMask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, + getShuffleSHUFImmediate(SVOp), DAG); - if (X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); - if (X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only @@ -6787,33 +6607,26 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // // Handle VMOVDDUPY permutations - if (isMOVDDUPYMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); - // Handle VPERMILPS* permutations - if (isVPERMILPSMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPSImmediate(SVOp), DAG); - - // Handle VPERMILPD* permutations - if (isVPERMILPDMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPDImmediate(SVOp), DAG); - - // Handle VPERM2F128 permutations - if (isVPERM2F128Mask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, - getShuffleVPERM2F128Immediate(SVOp), DAG); + // Handle VPERMILPS/D* permutations + if (isVPERMILPMask(M, VT, HasAVX)) { + if (HasAVX2 && VT == MVT::v8i32) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); + } - // Handle VSHUFPSY permutations - if (isVSHUFPSYMask(M, VT, Subtarget)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPSYImmediate(SVOp), DAG); + // Handle VPERM2F128/VPERM2I128 permutations + if (isVPERM2X128Mask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, + V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - // Handle VSHUFPDY permutations - if (isVSHUFPDYMask(M, VT, Subtarget)) - return 
getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPDYImmediate(SVOp), DAG); + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -6896,8 +6709,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); - } else if (VT == MVT::i32) { - // ExtractPS works with constant index. + } else if (VT == MVT::i32 || VT == MVT::i64) { + // ExtractPS/pextrq works with constant index. if (isa<ConstantSDNode>(Op.getOperand(1))) return Op; } @@ -6933,7 +6746,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + if (Subtarget->hasSSE41()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); if (Res.getNode()) return Res; @@ -7036,7 +6849,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { + } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && + isa<ConstantSDNode>(N2)) { // PINSR* works with constant index. return Op; } @@ -7074,7 +6888,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); } - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) + if (Subtarget->hasSSE41()) return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); if (EltVT == MVT::i8) @@ -7276,7 +7090,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); return Result; } @@ -7344,7 +7158,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -7423,7 +7237,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), - MachinePointerInfo(Ptr), false, false, 0); + MachinePointerInfo(Ptr), + false, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is @@ -7449,7 +7264,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, if (model == TLSModel::InitialExec) Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. 
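The Windows branch added in the next hunk follows the documented implicit-TLS walk: thread environment block, then the per-module slot array, then a section-relative offset. A scalar model of that address computation (parameter names here are illustrative, not LLVM API):

```cpp
#include <stdint.h>

// tls_array models TEB->ThreadLocalStoragePointer (gs:0x58 on Win64,
// fs:__tls_array on Win32), tls_index the module's _tls_index, and
// secrel_off the variable's offset within the .tls section.
uintptr_t win_tls_address(uintptr_t *tls_array, uint32_t tls_index,
                          uintptr_t secrel_off) {
  uintptr_t module_block = tls_array[tls_index];
  return module_block + secrel_off;
}
```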
@@ -7471,8 +7286,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) GV = GA->resolveAliasedGlobal(false); - TLSModel::Model model - = getTLSModel(GV, getTargetMachine().getRelocationModel()); + TLSModel::Model model = getTargetMachine().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -7529,19 +7343,77 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), Chain.getValue(1)); - } + } else if (Subtarget->isTargetWindows()) { + // Just use the implicit TLS architecture + // Need to generate something similar to: + // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage + // ; from TEB + // mov ecx, dword [rel _tls_index]: Load index (from C runtime) + // mov rcx, qword [rdx+rcx*8] + // mov eax, .tls$:tlsvar + // [rax+rcx] contains the address + // Windows 64bit: gs:0x58 + // Windows 32bit: fs:__tls_array - assert(false && - "TLS not implemented for this target."); + // If GV is an alias then use the aliasee for determining + // thread-localness. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + GV = GA->resolveAliasedGlobal(false); + DebugLoc dl = GA->getDebugLoc(); + SDValue Chain = DAG.getEntryNode(); - llvm_unreachable("Unreachable"); - return SDValue(); + // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or + // %gs:0x58 (64-bit). + Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() + ? Type::getInt8PtrTy(*DAG.getContext(), + 256) + : Type::getInt32PtrTy(*DAG.getContext(), + 257)); + + SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, + Subtarget->is64Bit() + ? DAG.getIntPtrConstant(0x58) + : DAG.getExternalSymbol("_tls_array", + getPointerTy()), + MachinePointerInfo(Ptr), + false, false, false, 0); + + // Load the _tls_index variable + SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); + if (Subtarget->is64Bit()) + IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, + IDX, MachinePointerInfo(), MVT::i32, + false, false, 0); + else + IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), + false, false, false, 0); + + SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), + getPointerTy()); + IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); + + SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); + res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), + false, false, false, 0); + + // Get the offset of start of .tls section + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), X86II::MO_SECREL); + SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); + } + + llvm_unreachable("TLS not implemented for this target."); } -/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and -/// take a 2 x i32 value to shift plus a shift amount. -SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values +/// and take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -7673,7 +7545,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + false, false, false, 0); } return Result; @@ -7682,85 +7554,65 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const { - // This algorithm is not obvious. Here it is in C code, more or less: + // This algorithm is not obvious. Here it is what we're trying to output: /* - double uint64_to_double( uint32_t hi, uint32_t lo ) { - static const __m128i exp = { 0x4330000045300000ULL, 0 }; - static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; - - // Copy ints to xmm registers. - __m128i xh = _mm_cvtsi32_si128( hi ); - __m128i xl = _mm_cvtsi32_si128( lo ); - - // Combine into low half of a single xmm register. - __m128i x = _mm_unpacklo_epi32( xh, xl ); - __m128d d; - double sd; - - // Merge in appropriate exponents to give the integer bits the right - // magnitude. - x = _mm_unpacklo_epi32( x, exp ); - - // Subtract away the biases to deal with the IEEE-754 double precision - // implicit 1. - d = _mm_sub_pd( (__m128d) x, bias ); - - // All conversions up to here are exact. The correctly rounded result is - // calculated using the current rounding mode using the following - // horizontal add. - d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); - _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this - // store doesn't really need to be here (except - // maybe to zero the other double) - return sd; - } + movq %rax, %xmm0 + punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } + subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } + #ifdef __SSE3__ + haddpd %xmm0, %xmm0 + #else + pshufd $0x4e, %xmm0, %xmm1 + addpd %xmm1, %xmm0 + #endif */ DebugLoc dl = Op.getDebugLoc(); LLVMContext *Context = DAG.getContext(); // Build some magic constants. 
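The assembly sequence sketched in the comment above is the classic exact uint64-to-double conversion. A compilable intrinsics model of the same steps (x86-64 and SSE3 for haddpd assumed):

```cpp
#include <pmmintrin.h>  // SSE3: _mm_hadd_pd
#include <stdint.h>

double u64_to_double(uint64_t u) {
  __m128i x    = _mm_cvtsi64_si128((long long)u);            // movq
  __m128i c0   = _mm_setr_epi32(0x43300000, 0x45300000, 0, 0);
  __m128d bias = _mm_setr_pd(0x1.0p52, 0x1.0p84);
  // punpckldq builds { 2^52 + lo32, 2^84 + hi32 * 2^32 } viewed as doubles.
  __m128d d = _mm_castsi128_pd(_mm_unpacklo_epi32(x, c0));
  d = _mm_sub_pd(d, bias);               // exact: { lo32, hi32 * 2^32 }
  d = _mm_hadd_pd(d, d);                 // the only rounding step
  return _mm_cvtsd_f64(d);
}
```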
- std::vector<Constant*> CV0; - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); - Constant *C0 = ConstantVector::get(CV0); + const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + Constant *C0 = ConstantDataVector::get(*Context, CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); - std::vector<Constant*> CV1; + SmallVector<Constant*,2> CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(1))); - SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(0))); - SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); + // Load the 64-bit value into an XMM register. + SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(), - false, false, 16); - SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); - SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); + false, false, false, 16); + SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), + CLod0); + SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); + SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + SDValue Result; + + if (Subtarget->hasSSE3()) { + // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. + Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); + } else { + SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); + SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, + S2F, 0x4E, DAG); + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), + Sub); + } - // Add the halves; easiest way is to swap them into another reg first. - int ShufMask[2] = { 1, -1 }; - SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, - DAG.getUNDEF(MVT::v2f64), ShufMask); - SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0)); } @@ -7777,8 +7629,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, Op.getOperand(0)); // Zero out the upper parts of the register. 
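The i32 path that begins above builds, in vector form, the classic bias trick: glue the unsigned value into the mantissa of 2^52, then subtract the bias back out, with every step exact. A scalar model of the idea (not the DAG code):

```cpp
#include <stdint.h>
#include <string.h>

double u32_to_double(uint32_t u) {
  uint64_t bits = 0x4330000000000000ULL | u;  // double with value 2^52 + u
  double d;
  memcpy(&d, &bits, sizeof d);                // the BITCAST, in DAG terms
  return d - 0x1.0p52;                        // exactly (double)u
}
```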
- Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(), - DAG); + Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), @@ -7830,6 +7681,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return LowerUINT_TO_FP_i64(Op, DAG); else if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); + else if (Subtarget->is64Bit() && + SrcVT == MVT::i64 && DstVT == MVT::f32) + return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); @@ -7849,7 +7703,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo(), + StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in @@ -7897,19 +7751,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } std::pair<SDValue,SDValue> X86TargetLowering:: -FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { +FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { DebugLoc DL = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); - if (!IsSigned) { + if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && - "Unknown FP_TO_SINT to lower!"); + "Unknown FP_TO_INT to lower!"); // These are really Legal. if (DstTy == MVT::i32 && @@ -7920,26 +7774,29 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary - // stack slot. + // We lower FP->int64 either into FISTP64 followed by a load from a temporary + // stack slot, or into the FTOL runtime function. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - - unsigned Opc; - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + unsigned Opc; + if (!IsSigned && isIntegerTypeFTOL(DstTy)) + Opc = X86ISD::WIN_FTOL; + else + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); EVT TheVT = Op.getOperand(0).getValueType(); + // FIXME This causes a redundant load/store if the SSE-class value is already + // in memory, such as if it is on the callstack.
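
The i32 path above relies on the scalar form of the same exponent trick: OR the 32-bit value into the mantissa of 2^52 and subtract the bias, with every step exact. A minimal sketch (hypothetical helper, not from the patch):

#include <stdint.h>
#include <string.h>

static double uint32_to_double(uint32_t u) {
  uint64_t bits = 0x4330000000000000ULL | u;  /* bit pattern of 2^52 + u   */
  double d;
  memcpy(&d, &bits, sizeof d);                /* the BITCAST in the DAG    */
  return d - 0x1.0p52;                        /* exact, yields (double)u   */
}
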
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, @@ -7964,12 +7821,26 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOStore, MemSize, MemSize); - // Build the FP_TO_INT*_IN_MEM - SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, 3, DstTy, MMO); - - return std::make_pair(FIST, StackSlot); + if (Opc != X86ISD::WIN_FTOL) { + // Build the FP_TO_INT*_IN_MEM + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + Ops, 3, DstTy, MMO); + return std::make_pair(FIST, StackSlot); + } else { + SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, + DAG.getVTList(MVT::Other, MVT::Glue), + Chain, Value); + SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, + MVT::i32, ftol.getValue(1)); + SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, + MVT::i32, eax.getValue(2)); + SDValue Ops[] = { eax, edx }; + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) + : DAG.getMergeValues(Ops, 2, DL); + return std::make_pair(pair, SDValue()); + } } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, @@ -7977,25 +7848,37 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, if (Op.getValueType().isVector()) return SDValue(); - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (FIST.getNode() == 0) return Op; - // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), false, false, 0); + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + else + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; assert(FIST.getNode() && "Unexpected failure"); - // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), false, false, 0); + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + else + // The node is the result. 
+ return FIST; } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -8006,23 +7889,18 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, EVT EltVT = VT; if (VT.isVector()) EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + Constant *C; if (EltVT == MVT::f64) { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); - CV.push_back(C); - CV.push_back(C); + C = ConstantVector::getSplat(2, + ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); } else { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + C = ConstantVector::getSplat(4, + ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); } - Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -8031,31 +7909,28 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; - if (EltVT == MVT::f64) { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); - CV.push_back(C); - CV.push_back(C); - } else { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + NumElts = VT.getVectorNumElements(); } - Constant *C = ConstantVector::get(CV); + Constant *C; + if (EltVT == MVT::f64) + C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); + else + C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); + C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); if (VT.isVector()) { + MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, MVT::v2i64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getNode(ISD::XOR, dl, XORVT, + DAG.getNode(ISD::BITCAST, dl, XORVT, Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } else { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } @@ -8084,7 +7959,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // type, and that won't be f80 since that is not custom lowered. // First get the sign bit of second operand. 
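
LowerFABS, LowerFNEG and LowerFCOPYSIGN all reduce to the constant-pool sign-bit masks built here; in scalar bit operations the three lowerings amount to the following (illustrative helpers, not part of the patch):

#include <stdint.h>
#include <string.h>

static double fabs_bits(double x) {      /* FAND with ~(1ULL << 63) */
  uint64_t b; memcpy(&b, &x, 8);
  b &= 0x7fffffffffffffffULL;
  memcpy(&x, &b, 8); return x;
}
static double fneg_bits(double x) {      /* FXOR with (1ULL << 63) */
  uint64_t b; memcpy(&b, &x, 8);
  b ^= 0x8000000000000000ULL;
  memcpy(&x, &b, 8); return x;
}
static double copysign_bits(double mag, double sgn) { /* FAND, FAND, FOR */
  uint64_t m, s; memcpy(&m, &mag, 8); memcpy(&s, &sgn, 8);
  m = (m & 0x7fffffffffffffffULL) | (s & 0x8000000000000000ULL);
  memcpy(&mag, &m, 8); return mag;
}
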
- std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (SrcVT == MVT::f64) { CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); @@ -8098,7 +7973,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); // Shift sign bit right or left if the two operands have different types. @@ -8127,7 +8002,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); // Or the value with the sign bit. @@ -8191,8 +8066,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, // climbing the DAG back to the root, and it doesn't seem to be worth the // effort. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && + UI->getOpcode() != ISD::SETCC && + UI->getOpcode() != ISD::STORE) goto default_case; if (ConstantSDNode *C = @@ -8325,8 +8202,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { - APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; - DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); + APInt Zeros, Ones; + DAG.ComputeMaskedBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } @@ -8335,11 +8212,19 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; - if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { + + if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); } + + // Use BT if the immediate can't be encoded in a TEST instruction. + if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { + LHS = AndLHS; + RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); + } } if (LHS.getNode()) { @@ -8466,9 +8351,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (isFP) { unsigned SSECC = 8; EVT EltVT = Op0.getValueType().getVectorElementType(); - assert(EltVT == MVT::f32 || EltVT == MVT::f64); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT; - unsigned Opc = EltVT == MVT::f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; bool Swap = false; // SSE Condition code mapping: @@ -8508,61 +8392,57 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (SSECC == 8) { if (SetCCOpcode == ISD::SETUEQ) { SDValue UNORD, EQ; - UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); - EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); + UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(3, MVT::i8)); + EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(0, MVT::i8)); return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); - } - else if (SetCCOpcode == ISD::SETONE) { + } else if (SetCCOpcode == ISD::SETONE) { SDValue ORD, NEQ; - ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); - NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); + ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(7, MVT::i8)); + NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(4, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); } llvm_unreachable("Illegal FP comparison"); } // Handle all other FP comparisons here. - return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); + return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, MVT::i8)); } // Break 256-bit integer vector compare into smaller ones. - if (!isFP && VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntVSETCC(Op, DAG); // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. - unsigned Opc = 0, EQOpc = 0, GTOpc = 0; + unsigned Opc = 0; bool Swap = false, Invert = false, FlipSigns = false; - switch (VT.getSimpleVT().SimpleTy) { - default: break; - case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; - case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; - case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; - case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; - } - switch (SetCCOpcode) { default: break; case ISD::SETNE: Invert = true; - case ISD::SETEQ: Opc = EQOpc; break; + case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; - case ISD::SETGT: Opc = GTOpc; break; + case ISD::SETGT: Opc = X86ISD::PCMPGT; break; case ISD::SETGE: Swap = true; - case ISD::SETLE: Opc = GTOpc; Invert = true; break; + case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; case ISD::SETULT: Swap = true; - case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; + case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; case ISD::SETUGE: Swap = true; - case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; + case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). 
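
Two of the mappings above, written out with intrinsics: SETUEQ has no single CMPP immediate and is composed as unord|eq, and unsigned integer compares fall back to the signed pcmpgt after the FlipSigns sign-bit XOR. A sketch, SSE2 only:

#include <emmintrin.h>

static __m128d cmp_ueq_pd(__m128d a, __m128d b) {
  return _mm_or_pd(_mm_cmpunord_pd(a, b),   /* CMPP imm 3 */
                   _mm_cmpeq_pd(a, b));     /* CMPP imm 0 */
}
static __m128i cmpgt_epu32(__m128i a, __m128i b) {
  const __m128i sign = _mm_set1_epi32((int)0x80000000);
  return _mm_cmpgt_epi32(_mm_xor_si128(a, sign),   /* flip signs, then */
                         _mm_xor_si128(b, sign));  /* signed compare   */
}
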
- if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42()) return SDValue(); - if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41()) return SDValue(); // Since SSE has no unsigned integer comparisons, we need to flip the sign @@ -8679,8 +8559,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. - if (Cond.getOpcode() == X86ISD::SETCC || - Cond.getOpcode() == X86ISD::SETCC_CARRY) { + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); @@ -8697,6 +8578,39 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = Cmp; addTest = false; } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; } if (addTest) { @@ -8778,11 +8692,27 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); SDValue CC; + bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { - SDValue NewCond = LowerSETCC(Cond, DAG); - if (NewCond.getNode()) - Cond = NewCond; + // Check for setcc([su]{add,sub,mul}o == 0). + if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + isa<ConstantSDNode>(Cond.getOperand(1)) && + cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + Cond.getOperand(0).getResNo() == 1 && + (Cond.getOperand(0).getOpcode() == ISD::SADDO || + Cond.getOperand(0).getOpcode() == ISD::UADDO || + Cond.getOperand(0).getOpcode() == ISD::SSUBO || + Cond.getOperand(0).getOpcode() == ISD::USUBO || + Cond.getOperand(0).getOpcode() == ISD::SMULO || + Cond.getOperand(0).getOpcode() == ISD::UMULO)) { + Inverted = true; + Cond = Cond.getOperand(0); + } else { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } } #if 0 // FIXME: LowerXALUO doesn't handle these!! 
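
The overflow switch above is the usual flag mapping: unsigned overflow comes out in CF (COND_B) and signed overflow in OF (COND_O). For UADDO the scalar equivalent is (illustrative helper):

#include <stdint.h>

static int uaddo32(uint32_t a, uint32_t b, uint32_t *sum) {
  *sum = a + b;
  return *sum < a;   /* carry out, i.e. the jb/setb (COND_B) condition */
}
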
@@ -8803,8 +8733,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. - if (Cond.getOpcode() == X86ISD::SETCC || - Cond.getOpcode() == X86ISD::SETCC_CARRY) { + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); @@ -8825,6 +8756,43 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { break; } } + } + CondOpcode = Cond.getOpcode(); + if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (Inverted) + X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { @@ -8888,6 +8856,66 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { CC = DAG.getConstant(CCode, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + // For FCMP_OEQ, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. 
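
Concretely, the two-branch form being built here is roughly the following (AT&T syntax; a sketch of the intent, not verbatim output):

/*
 *   ; if (a == b) ... (FCMP_OEQ), after the successors are reversed:
 *   ucomisd %xmm1, %xmm0
 *   jne     falseBB          ; COND_NE: ordered and not equal
 *   jp      falseBB          ; COND_P:  unordered, i.e. a NaN operand
 *   ...fall through into the true block...
 *
 *   ; FCMP_UNE is the same shape with the second branch inverted:
 *   ucomisd %xmm1, %xmm0
 *   jne     trueBB           ; COND_NE
 *   jnp     falseBB          ; COND_NP: ordered and equal
 */
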
+ if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_P, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + // For FCMP_UNE, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_UNE. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_NP, MVT::i8); + Cond = Cmp; + addTest = false; + Dest = FalseBB; + } + } } } @@ -8926,7 +8954,7 @@ SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || - EnableSegmentedStacks) && + getTargetMachine().Options.EnableSegmentedStacks) && "This should be used only on Windows targets or when segmented stacks " "are being used"); assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); @@ -8940,7 +8968,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (EnableSegmentedStacks) { + if (getTargetMachine().Options.EnableSegmentedStacks) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -9076,10 +9104,10 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. - assert(!UseSoftFloat && + assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && - Subtarget->hasXMM()); + Subtarget->hasSSE1()); } // Insert VAARG_64 node into the DAG @@ -9106,7 +9134,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain, VAARG, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); } SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { @@ -9125,6 +9153,43 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } +// getTargetVShiftNOde - Handle vector element shifts where the shift amount +// may or may not be a constant. Takes immediate version of shift as input. 
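
The helper defined next widens the 32-bit amount because the SSE shift instructions consume a 64-bit count from the low quadword of an XMM register; that is what the {ShAmt, 0, undef, undef} BUILD_VECTOR provides. Intrinsic equivalent (sketch):

#include <emmintrin.h>

static __m128i sll_epi32_var(__m128i v, int amt) {
  /* movd zero-extends, so bits 32..127 of the count register are 0 */
  return _mm_sll_epi32(v, _mm_cvtsi32_si128(amt));
}
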
+static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue SrcOp, SDValue ShAmt, + SelectionDAG &DAG) { + assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); + + if (isa<ConstantSDNode>(ShAmt)) { + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); + } + } + + // Change opcode to non-immediate version + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; + case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; + case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; + } + + // Need to build a vector containing shift amount + // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 + SDValue ShOps[4]; + ShOps[0] = ShAmt; + ShOps[1] = DAG.getConstant(0, MVT::i32); + ShOps[2] = DAG.getUNDEF(MVT::i32); + ShOps[3] = DAG.getUNDEF(MVT::i32); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); +} + SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -9159,7 +9224,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const unsigned Opc = 0; ISD::CondCode CC = ISD::SETCC_INVALID; switch (IntNo) { - default: break; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_comieq_ss: case Intrinsic::x86_sse2_comieq_sd: Opc = X86ISD::COMI; @@ -9231,7 +9296,201 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + // XOP comparison intrinsics + case Intrinsic::x86_xop_vpcomltb: + case Intrinsic::x86_xop_vpcomltw: + case Intrinsic::x86_xop_vpcomltd: + case Intrinsic::x86_xop_vpcomltq: + case Intrinsic::x86_xop_vpcomltub: + case Intrinsic::x86_xop_vpcomltuw: + case Intrinsic::x86_xop_vpcomltud: + case Intrinsic::x86_xop_vpcomltuq: + case Intrinsic::x86_xop_vpcomleb: + case Intrinsic::x86_xop_vpcomlew: + case Intrinsic::x86_xop_vpcomled: + case Intrinsic::x86_xop_vpcomleq: + case Intrinsic::x86_xop_vpcomleub: + case Intrinsic::x86_xop_vpcomleuw: + case Intrinsic::x86_xop_vpcomleud: + case Intrinsic::x86_xop_vpcomleuq: + case Intrinsic::x86_xop_vpcomgtb: + case Intrinsic::x86_xop_vpcomgtw: + case Intrinsic::x86_xop_vpcomgtd: + case Intrinsic::x86_xop_vpcomgtq: + case Intrinsic::x86_xop_vpcomgtub: + case Intrinsic::x86_xop_vpcomgtuw: + case Intrinsic::x86_xop_vpcomgtud: + case Intrinsic::x86_xop_vpcomgtuq: + case Intrinsic::x86_xop_vpcomgeb: + case Intrinsic::x86_xop_vpcomgew: + case Intrinsic::x86_xop_vpcomged: + case Intrinsic::x86_xop_vpcomgeq: + case Intrinsic::x86_xop_vpcomgeub: + case Intrinsic::x86_xop_vpcomgeuw: + case Intrinsic::x86_xop_vpcomgeud: + case Intrinsic::x86_xop_vpcomgeuq: + case Intrinsic::x86_xop_vpcomeqb: + case Intrinsic::x86_xop_vpcomeqw: + case Intrinsic::x86_xop_vpcomeqd: + case Intrinsic::x86_xop_vpcomeqq: + case Intrinsic::x86_xop_vpcomequb: + case Intrinsic::x86_xop_vpcomequw: + case Intrinsic::x86_xop_vpcomequd: + case Intrinsic::x86_xop_vpcomequq: + case Intrinsic::x86_xop_vpcomneb: + case Intrinsic::x86_xop_vpcomnew: + case Intrinsic::x86_xop_vpcomned: + case Intrinsic::x86_xop_vpcomneq: + case Intrinsic::x86_xop_vpcomneub: + case 
Intrinsic::x86_xop_vpcomneuw: + case Intrinsic::x86_xop_vpcomneud: + case Intrinsic::x86_xop_vpcomneuq: + case Intrinsic::x86_xop_vpcomfalseb: + case Intrinsic::x86_xop_vpcomfalsew: + case Intrinsic::x86_xop_vpcomfalsed: + case Intrinsic::x86_xop_vpcomfalseq: + case Intrinsic::x86_xop_vpcomfalseub: + case Intrinsic::x86_xop_vpcomfalseuw: + case Intrinsic::x86_xop_vpcomfalseud: + case Intrinsic::x86_xop_vpcomfalseuq: + case Intrinsic::x86_xop_vpcomtrueb: + case Intrinsic::x86_xop_vpcomtruew: + case Intrinsic::x86_xop_vpcomtrued: + case Intrinsic::x86_xop_vpcomtrueq: + case Intrinsic::x86_xop_vpcomtrueub: + case Intrinsic::x86_xop_vpcomtrueuw: + case Intrinsic::x86_xop_vpcomtrueud: + case Intrinsic::x86_xop_vpcomtrueuq: { + unsigned CC = 0; + unsigned Opc = 0; + + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_xop_vpcomltb: + case Intrinsic::x86_xop_vpcomltw: + case Intrinsic::x86_xop_vpcomltd: + case Intrinsic::x86_xop_vpcomltq: + CC = 0; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomltub: + case Intrinsic::x86_xop_vpcomltuw: + case Intrinsic::x86_xop_vpcomltud: + case Intrinsic::x86_xop_vpcomltuq: + CC = 0; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomleb: + case Intrinsic::x86_xop_vpcomlew: + case Intrinsic::x86_xop_vpcomled: + case Intrinsic::x86_xop_vpcomleq: + CC = 1; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomleub: + case Intrinsic::x86_xop_vpcomleuw: + case Intrinsic::x86_xop_vpcomleud: + case Intrinsic::x86_xop_vpcomleuq: + CC = 1; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomgtb: + case Intrinsic::x86_xop_vpcomgtw: + case Intrinsic::x86_xop_vpcomgtd: + case Intrinsic::x86_xop_vpcomgtq: + CC = 2; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomgtub: + case Intrinsic::x86_xop_vpcomgtuw: + case Intrinsic::x86_xop_vpcomgtud: + case Intrinsic::x86_xop_vpcomgtuq: + CC = 2; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomgeb: + case Intrinsic::x86_xop_vpcomgew: + case Intrinsic::x86_xop_vpcomged: + case Intrinsic::x86_xop_vpcomgeq: + CC = 3; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomgeub: + case Intrinsic::x86_xop_vpcomgeuw: + case Intrinsic::x86_xop_vpcomgeud: + case Intrinsic::x86_xop_vpcomgeuq: + CC = 3; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomeqb: + case Intrinsic::x86_xop_vpcomeqw: + case Intrinsic::x86_xop_vpcomeqd: + case Intrinsic::x86_xop_vpcomeqq: + CC = 4; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomequb: + case Intrinsic::x86_xop_vpcomequw: + case Intrinsic::x86_xop_vpcomequd: + case Intrinsic::x86_xop_vpcomequq: + CC = 4; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomneb: + case Intrinsic::x86_xop_vpcomnew: + case Intrinsic::x86_xop_vpcomned: + case Intrinsic::x86_xop_vpcomneq: + CC = 5; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomneub: + case Intrinsic::x86_xop_vpcomneuw: + case Intrinsic::x86_xop_vpcomneud: + case Intrinsic::x86_xop_vpcomneuq: + CC = 5; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomfalseb: + case Intrinsic::x86_xop_vpcomfalsew: + case Intrinsic::x86_xop_vpcomfalsed: + case Intrinsic::x86_xop_vpcomfalseq: + CC = 6; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomfalseub: + case Intrinsic::x86_xop_vpcomfalseuw: + case Intrinsic::x86_xop_vpcomfalseud: + case Intrinsic::x86_xop_vpcomfalseuq: + CC = 6; + Opc = X86ISD::VPCOMU; + break; + case 
Intrinsic::x86_xop_vpcomtrueb: + case Intrinsic::x86_xop_vpcomtruew: + case Intrinsic::x86_xop_vpcomtrued: + case Intrinsic::x86_xop_vpcomtrueq: + CC = 7; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomtrueub: + case Intrinsic::x86_xop_vpcomtrueuw: + case Intrinsic::x86_xop_vpcomtrueud: + case Intrinsic::x86_xop_vpcomtrueuq: + CC = 7; + Opc = X86ISD::VPCOMU; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS, + DAG.getConstant(CC, MVT::i8)); + } + // Arithmetic intrinsics. + case Intrinsic::x86_sse2_pmulu_dq: + case Intrinsic::x86_avx2_pmulu_dq: + return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: case Intrinsic::x86_avx_hadd_ps_256: @@ -9244,6 +9503,62 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx_hsub_pd_256: return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_avx2_phadd_w: + case Intrinsic::x86_avx2_phadd_d: + return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_avx2_phsub_w: + case Intrinsic::x86_avx2_phsub_d: + return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + return DAG.getNode(ISD::SHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + return DAG.getNode(ISD::SRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: + case Intrinsic::x86_ssse3_psign_w_128: + case Intrinsic::x86_ssse3_psign_d_128: + case Intrinsic::x86_avx2_psign_b: + case Intrinsic::x86_avx2_psign_w: + case Intrinsic::x86_avx2_psign_d: + return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_insertps: + return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vperm2f128_ps_256: + case Intrinsic::x86_avx_vperm2f128_pd_256: + case Intrinsic::x86_avx_vperm2f128_si_256: + case Intrinsic::x86_avx2_vperm2i128: + return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vpermil_ps: + case Intrinsic::x86_avx_vpermil_pd: + case Intrinsic::x86_avx_vpermil_ps_256: + case Intrinsic::x86_avx_vpermil_pd_256: + return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // ptest and testp intrinsics. 
The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. @@ -9310,16 +9625,53 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // Fix vector shift instructions where the last operand is a non-immediate - // i32 value. + // SSE/AVX shift intrinsics + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: + return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + // Fix vector shift instructions where the last operand is a non-immediate + // i32 value. 
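
For reference, the long XOP block above collapses every vpcom* intrinsic into one VPCOM or VPCOMU node plus a 3-bit predicate immediate; the encoding used by the switch is (enum name illustrative):

/* XOP VPCOM/VPCOMU condition-code immediates; signed and unsigned
   forms share the encoding. */
enum XopVpcomImm {
  VPCOM_LT    = 0,  VPCOM_LE   = 1,
  VPCOM_GT    = 2,  VPCOM_GE   = 3,
  VPCOM_EQ    = 4,  VPCOM_NE   = 5,
  VPCOM_FALSE = 6,  VPCOM_TRUE = 7
};
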
case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: @@ -9333,79 +9685,40 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return SDValue(); unsigned NewIntNo = 0; - EVT ShAmtVT = MVT::v4i32; switch (IntNo) { - case Intrinsic::x86_sse2_pslli_w: - NewIntNo = Intrinsic::x86_sse2_psll_w; - break; - case Intrinsic::x86_sse2_pslli_d: - NewIntNo = Intrinsic::x86_sse2_psll_d; + case Intrinsic::x86_mmx_pslli_w: + NewIntNo = Intrinsic::x86_mmx_psll_w; break; - case Intrinsic::x86_sse2_pslli_q: - NewIntNo = Intrinsic::x86_sse2_psll_q; + case Intrinsic::x86_mmx_pslli_d: + NewIntNo = Intrinsic::x86_mmx_psll_d; break; - case Intrinsic::x86_sse2_psrli_w: - NewIntNo = Intrinsic::x86_sse2_psrl_w; + case Intrinsic::x86_mmx_pslli_q: + NewIntNo = Intrinsic::x86_mmx_psll_q; break; - case Intrinsic::x86_sse2_psrli_d: - NewIntNo = Intrinsic::x86_sse2_psrl_d; + case Intrinsic::x86_mmx_psrli_w: + NewIntNo = Intrinsic::x86_mmx_psrl_w; break; - case Intrinsic::x86_sse2_psrli_q: - NewIntNo = Intrinsic::x86_sse2_psrl_q; + case Intrinsic::x86_mmx_psrli_d: + NewIntNo = Intrinsic::x86_mmx_psrl_d; break; - case Intrinsic::x86_sse2_psrai_w: - NewIntNo = Intrinsic::x86_sse2_psra_w; + case Intrinsic::x86_mmx_psrli_q: + NewIntNo = Intrinsic::x86_mmx_psrl_q; break; - case Intrinsic::x86_sse2_psrai_d: - NewIntNo = Intrinsic::x86_sse2_psra_d; + case Intrinsic::x86_mmx_psrai_w: + NewIntNo = Intrinsic::x86_mmx_psra_w; break; - default: { - ShAmtVT = MVT::v2i32; - switch (IntNo) { - case Intrinsic::x86_mmx_pslli_w: - NewIntNo = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntNo = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntNo = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntNo = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntNo = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntNo = Intrinsic::x86_mmx_psrl_q; - break; - case Intrinsic::x86_mmx_psrai_w: - NewIntNo = Intrinsic::x86_mmx_psra_w; - break; - case Intrinsic::x86_mmx_psrai_d: - NewIntNo = Intrinsic::x86_mmx_psra_d; - break; - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - } + case Intrinsic::x86_mmx_psrai_d: + NewIntNo = Intrinsic::x86_mmx_psra_d; break; - } + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. } // The vector shift intrinsics with scalars uses 32b shift amounts but // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits // to be zero. - SDValue ShOps[4]; - ShOps[0] = ShAmt; - ShOps[1] = DAG.getConstant(0, MVT::i32); - if (ShAmtVT == MVT::v4i32) { - ShOps[2] = DAG.getUNDEF(MVT::i32); - ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); - } else { - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, + DAG.getConstant(0, MVT::i32)); // FIXME this must be lowered to get rid of the invalid type. - } EVT VT = Op.getValueType(); ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); @@ -9432,13 +9745,13 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); } // Just load the return address. 
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, 0); + RetAddrFI, MachinePointerInfo(), false, false, false, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { @@ -9453,7 +9766,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); return FrameAddr; } @@ -9685,7 +9998,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); // Transform as necessary SDValue CWD1 = @@ -9745,7 +10058,8 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { return Op; } -SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, + SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -9753,26 +10067,41 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { Op = Op.getOperand(0); if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } - // Issue a bsf (scan bits forward) which also sets EFLAGS. + // Issue a bsr (scan bits in reverse). SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // And xor with NumBits-1. + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + Op = Op.getOperand(0); + + // Issue a bsf (scan bits forward) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { Op, - DAG.getConstant(NumBits, OpVT), + DAG.getConstant(NumBits, VT), DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); - - if (VT == MVT::i8) - Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); - return Op; + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit @@ -9824,49 +10153,49 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. 
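
LowerCTLZ_ZERO_UNDEF above can skip the zero-input CMOV that LowerCTTZ still needs: bsr yields the index of the highest set bit, and index ^ (NumBits-1) equals (NumBits-1) - index, the leading-zero count. Scalar sketch with GCC-style inline assembly (toolchain assumed):

#include <stdint.h>

static unsigned ctlz32_zero_undef(uint32_t x) {
  unsigned idx;
  __asm__("bsrl %1, %0" : "=r"(idx) : "rm"(x));  /* undefined for x == 0 */
  return idx ^ 31;  /* == 31 - idx, since idx <= 31 */
}
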
- if (VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntArith(Op, DAG); - assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && + "Only know how to lower V2I64/V4I64 multiply"); + DebugLoc dl = Op.getDebugLoc(); - // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); - // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); - // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); - // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); - // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); + // Ahi = psrlqi(a, 32); + // Bhi = psrlqi(b, 32); // - // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); - // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); + // AloBlo = pmuludq(a, b); + // AloBhi = pmuludq(a, Bhi); + // AhiBlo = pmuludq(Ahi, b); + + // AloBhi = psllqi(AloBhi, 32); + // AhiBlo = psllqi(AhiBlo, 32); // return AloBlo + AloBhi + AhiBlo; SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - A, DAG.getConstant(32, MVT::i32)); - SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - B, DAG.getConstant(32, MVT::i32)); - SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, B); - SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, Bhi); - SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - Ahi, B); - AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - AloBhi, DAG.getConstant(32, MVT::i32)); - AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - AhiBlo, DAG.getConstant(32, MVT::i32)); + SDValue ShAmt = DAG.getConstant(32, MVT::i32); + + SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); + SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); + + // Bit cast to 32-bit vectors for MULUDQ + EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32; + A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); + B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); + Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); + Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); + + SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); + SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + + AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); + AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); - Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); - return Res; + return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { @@ -9877,12 +10206,183 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { SDValue Amt = Op.getOperand(1); LLVMContext *Context = DAG.getContext(); - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) return SDValue(); + // Optimize shl/srl/sra with constant shift amount. 
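
The rewritten LowerMUL above is the schoolbook decomposition from its comment: with a = ahi*2^32 + alo and likewise b, the low 64 bits of a*b are alo*blo + ((alo*bhi + ahi*blo) << 32); the ahi*bhi term overflows out entirely. SSE2 sketch of the v2i64 case (helper name illustrative):

#include <emmintrin.h>

static __m128i mul_epi64_lo(__m128i a, __m128i b) {
  __m128i ahi     = _mm_srli_epi64(a, 32);            /* psrlqi(a, 32)       */
  __m128i bhi     = _mm_srli_epi64(b, 32);            /* psrlqi(b, 32)       */
  __m128i alo_blo = _mm_mul_epu32(a, b);              /* pmuludq: low dwords */
  __m128i alo_bhi = _mm_mul_epu32(a, bhi);
  __m128i ahi_blo = _mm_mul_epu32(ahi, b);
  __m128i cross   = _mm_add_epi64(alo_bhi, ahi_blo);
  return _mm_add_epi64(alo_blo, _mm_slli_epi64(cross, 32)); /* psllqi */
}
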
+ if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + uint64_t ShiftAmt = C->getZExtValue(); + + if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + (Subtarget->hasAVX2() && + (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { + if (Op.getOpcode() == ISD::SHL) + return DAG.getNode(X86ISD::VSHLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VSRLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) + return DAG.getNode(X86ISD::VSRAI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + } + + if (VT == MVT::v16i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + } + + if (Subtarget->hasAVX2() && VT == MVT::v32i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. + SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. 
+ SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + } + } + } + + // Lower SHL with variable shift amount. + if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { + Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), + DAG.getConstant(23, MVT::i32)); + + const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; + Constant *C = ConstantDataVector::get(*Context, CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, false, 16); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { + assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); + + // a = a << 5; + Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), + DAG.getConstant(5, MVT::i32)); + Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); + + // Turn 'a' into a mask suitable for VSELECT + SDValue VSelM = DAG.getConstant(0x80, VT); + SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + SDValue CM1 = DAG.getConstant(0x0f, VT); + SDValue CM2 = DAG.getConstant(0x3f, VT); + + // r = VSELECT(r, psllw(r & (char16)15, 4), a); + SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(4, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // r = VSELECT(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, CM2); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(2, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // return VSELECT(r, r+r, a); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, + DAG.getNode(ISD::ADD, dl, VT, R, R), R); + return R; + } + // Decompose 256-bit shifts into smaller 128-bit shifts. 
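
Two of the identities above deserve unpacking. "R s>> a === ((R u>> a) ^ m) - m" is sign extension from bit 7-a, where m is the sign bit in its shifted position; and the variable v4i32 SHL manufactures 2^amt by sliding the amount into a float exponent field. Sketches (the second assumes SSE4.1 for pmulld; helper names illustrative):

#include <emmintrin.h>
#include <smmintrin.h>   /* SSE4.1, for _mm_mullo_epi32 below */

/* v16i8 arithmetic shift right by constant n (no psrab exists). */
static __m128i sra_epi8(__m128i x, const int n) {
  __m128i u = _mm_srli_epi16(x, n);                        /* wide shift...  */
  u = _mm_and_si128(u, _mm_set1_epi8((char)(0xFFu >> n))); /* ...per byte    */
  __m128i m = _mm_set1_epi8((char)(0x80u >> n));           /* moved sign bit */
  return _mm_sub_epi8(_mm_xor_si128(u, m), m);             /* sign-extend    */
}

/* v4i32 shift left by a per-element variable amount in [0, 31]. */
static __m128i sll_var_epi32(__m128i v, __m128i amt) {
  __m128i exp = _mm_add_epi32(_mm_slli_epi32(amt, 23),
                              _mm_set1_epi32(0x3f800000)); /* bits of 1.0f */
  __m128i pw2 = _mm_cvttps_epi32(_mm_castsi128_ps(exp));   /* 2^amt        */
  return _mm_mullo_epi32(v, pw2);                          /* v * 2^amt    */
}
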
if (VT.getSizeInBits() == 256) { - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -9897,9 +10397,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // Constant shift amount SmallVector<SDValue, 4> Amt1Csts; SmallVector<SDValue, 4> Amt2Csts; - for (int i = 0; i < NumElems/2; ++i) + for (unsigned i = 0; i != NumElems/2; ++i) Amt1Csts.push_back(Amt->getOperand(i)); - for (int i = NumElems/2; i < NumElems; ++i) + for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, @@ -9921,120 +10421,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); } - // Optimize shl/srl/sra with constant shift amount. - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { - uint64_t ShiftAmt = C->getZExtValue(); - - if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - } - } - - // Lower SHL with variable shift amount. 
- if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - Op.getOperand(1), DAG.getConstant(23, MVT::i32)); - - ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); - - std::vector<Constant*> CV(4, CI); - Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); - Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); - return DAG.getNode(ISD::MUL, dl, VT, Op, R); - } - if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { - // a = a << 5; - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - Op.getOperand(1), DAG.getConstant(5, MVT::i32)); - - ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); - ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); - - std::vector<Constant*> CVM1(16, CM1); - std::vector<Constant*> CVM2(16, CM2); - Constant *C = ConstantVector::get(CVM1); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - // r = pblendv(r, psllw(r & (char16)15, 4), a); - M = DAG.getNode(ISD::AND, dl, VT, R, M); - M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, - DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - - C = ConstantVector::get(CVM2); - CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - // r = pblendv(r, psllw(r & (char16)63, 2), a); - M = DAG.getNode(ISD::AND, dl, VT, R, M); - M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, - DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - - // return pblendv(r, r+r, a); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, - R, DAG.getNode(ISD::ADD, dl, VT, R, R)); - return R; - } return SDValue(); } @@ -10113,46 +10499,58 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ +SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - SDNode* Node = Op.getNode(); - EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); - EVT VT = Node->getValueType(0); - if (Subtarget->hasXMMInt() && VT.isVector()) { - unsigned BitsDiff = VT.getScalarType().getSizeInBits() - - ExtraVT.getScalarType().getSizeInBits(); - SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); - - unsigned SHLIntrinsicsID = 0; - unsigned SRAIntrinsicsID = 0; - switch (VT.getSimpleVT().SimpleTy) { - default: - return SDValue(); - case MVT::v4i32: { - SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; - SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; - break; - } - case MVT::v8i16: { - SHLIntrinsicsID = 
Intrinsic::x86_sse2_pslli_w;
-      SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
-      break;
-    }
-  }
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  EVT VT = Op.getValueType();
-  SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                             DAG.getConstant(SHLIntrinsicsID, MVT::i32),
-                             Node->getOperand(0), ShAmt);
+  if (!Subtarget->hasSSE2() || !VT.isVector())
+    return SDValue();
-  // In case of 1 bit sext, no need to shr
-  if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
+  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+                      ExtraVT.getScalarType().getSizeInBits();
+  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(SRAIntrinsicsID, MVT::i32),
-                     Tmp1, ShAmt);
+  switch (VT.getSimpleVT().SimpleTy) {
+    default: return SDValue();
+    case MVT::v8i32:
+    case MVT::v16i16:
+      if (!Subtarget->hasAVX())
+        return SDValue();
+      if (!Subtarget->hasAVX2()) {
+        // needs to be split
+        int NumElems = VT.getVectorNumElements();
+        SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+        SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+        // Extract the LHS vectors
+        SDValue LHS = Op.getOperand(0);
+        SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+        SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+        MVT EltVT = VT.getVectorElementType().getSimpleVT();
+        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+        EVT ExtraEltVT = ExtraVT.getVectorElementType();
+        int ExtraNumElems = ExtraVT.getVectorNumElements();
+        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+                                   ExtraNumElems/2);
+        SDValue Extra = DAG.getValueType(ExtraVT);
+
+        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
+      }
+      // fall through
+    case MVT::v4i32:
+    case MVT::v8i16: {
+      SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
+                                         Op.getOperand(0), ShAmt, DAG);
+      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+    }
  }
-
-  return SDValue();
 }
@@ -10161,7 +10559,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
  // There isn't any reason to disable it if the target processor supports it.
-  if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) {
+  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, MVT::i32);
    SDValue Ops[] = {
@@ -10215,7 +10613,7 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
  // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
  // no-sse2). There isn't any reason to disable it if the target processor
  // supports it.
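  // Without SSE2 on a 32-bit target, the fallback below is a locked memory
  // operation (conceptually the "lock orl $0, (%esp)" idiom), which also
  // orders memory accesses and so serves as a full barrier.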
- if (Subtarget->hasXMMInt() || Subtarget->is64Bit()) + if (Subtarget->hasSSE2() || Subtarget->is64Bit()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -10246,8 +10644,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = 0; unsigned size = 0; switch(T.getSimpleVT().SimpleTy) { - default: - assert(false && "Invalid value type!"); + default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; @@ -10295,7 +10692,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); - assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() && + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && @@ -10365,7 +10762,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { - default: assert(0 && "Invalid code"); + default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = X86ISD::ADD; break; case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; case ISD::SUBC: Opc = X86ISD::SUB; break; @@ -10432,6 +10829,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SRA: @@ -10506,8 +10904,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DebugLoc dl = N->getDebugLoc(); switch (N->getOpcode()) { default: - assert(false && "Do not know how to custom type legalize this operation!"); - return; + llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::SIGN_EXTEND_INREG: case ISD::ADDC: case ISD::ADDE: @@ -10515,15 +10912,25 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SUBE: // We don't want to expand or promote these. return; - case ISD::FP_TO_SINT: { + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: { + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + + if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) + return; + std::pair<SDValue,SDValue> Vals = - FP_TO_INTHelper(SDValue(N, 0), DAG, true); + FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode() != 0) { EVT VT = N->getValueType(0); // Return a load from the stack slot. 
- Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, - MachinePointerInfo(), false, false, 0)); + if (StackSlot.getNode() != 0) + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0)); + else + Results.push_back(FIST); } return; } @@ -10657,15 +11064,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; - case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; - case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; - case X86ISD::PSIGND: return "X86ISD::PSIGND"; + case X86ISD::PSIGN: return "X86ISD::PSIGN"; + case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; + case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; + case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; + case X86ISD::FHADD: return "X86ISD::FHADD"; + case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; - case X86ISD::FHADD: return "X86ISD::FHADD"; - case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; @@ -10681,18 +11092,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; + case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; - case X86ISD::CMPPD: return "X86ISD::CMPPD"; - case X86ISD::CMPPS: return "X86ISD::CMPPS"; - case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; - case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; - case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; - case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; - case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; - case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; - case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; - case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; + case X86ISD::VSRA: return "X86ISD::VSRA"; + case X86ISD::VSHLI: return "X86ISD::VSHLI"; + case X86ISD::VSRLI: return "X86ISD::VSRLI"; + case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::CMPP: return "X86ISD::CMPP"; + case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; + case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; @@ -10705,54 +11115,39 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::ANDN: return "X86ISD::ANDN"; + case X86ISD::BLSI: return "X86ISD::BLSI"; + case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; + case X86ISD::BLSR: return "X86ISD::BLSR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::PALIGN: return "X86ISD::PALIGN"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; - case X86ISD::PSHUFHW_LD: return 
"X86ISD::PSHUFHW_LD"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; - case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; - case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; - case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; + case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; - case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; - case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; - case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; - case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; - case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; - case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; - case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; - case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; - case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; - case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; - case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; - case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; - case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; - case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; - case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; - case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; - case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; - case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; - case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; - case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; + case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; } } @@ -10855,21 +11250,21 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const { // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()); + return false; // FIXME: pshufb, blends, shifts. 
return (VT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, VT) || - isSHUFPMask(M, VT) || + isSHUFPMask(M, VT, Subtarget->hasAVX()) || isPSHUFDMask(M, VT) || isPSHUFHWMask(M, VT) || isPSHUFLWMask(M, VT) || - isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) || - isUNPCKLMask(M, VT) || - isUNPCKHMask(M, VT) || - isUNPCKL_v_undef_Mask(M, VT) || - isUNPCKH_v_undef_Mask(M, VT)); + isPALIGNRMask(M, VT, Subtarget) || + isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || + isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || + isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) || + isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2())); } bool @@ -10882,8 +11277,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, if (NumElts == 4 && VT.getSizeInBits() == 128) { return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || - isSHUFPMask(Mask, VT) || - isCommutedSHUFPMask(Mask, VT)); + isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || + isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true)); } return false; } @@ -10902,7 +11297,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, unsigned CXchgOpc, unsigned notOpc, unsigned EAXreg, - TargetRegisterClass *RC, + const TargetRegisterClass *RC, bool invSrc) const { // For the atomic bitwise operator, we generate // thisMBB: @@ -11274,7 +11669,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && + assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); DebugLoc dl = MI->getDebugLoc(); @@ -11679,6 +12074,42 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( return EndMBB; } +// The EFLAGS operand of SelectItr might be missing a kill marker +// because there were multiple uses of EFLAGS, and ISel didn't know +// which to mark. Figure out whether SelectItr should have had a +// kill marker, and set it if it should. Returns the correct kill +// marker value. +static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, + MachineBasicBlock* BB, + const TargetRegisterInfo* TRI) { + // Scan forward through BB for a use/def of EFLAGS. + MachineBasicBlock::iterator miI(llvm::next(SelectItr)); + for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(X86::EFLAGS)) + return false; + if (mi.definesRegister(X86::EFLAGS)) + break; // Should have kill-flag - update below. + } + + // If we hit the end of the block, check whether EFLAGS is live into a + // successor. + if (miI == BB->end()) { + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(X86::EFLAGS)) + return false; + } + } + + // We found a def, or hit the end of the basic block and EFLAGS wasn't live + // out. SelectMI should have a kill flag on EFLAGS. + SelectItr->addRegisterKilled(X86::EFLAGS, TRI); + return true; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -11708,7 +12139,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. 
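  // (A sketch of the situation handled here: a block that lowers several
  // selects in a row has several readers of EFLAGS, so ISel may have set no
  // kill flag at all; checkAndUpdateEFLAGSKill above recovers it by scanning
  // forward for the last reader.)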
- if (!MI->killsRegister(X86::EFLAGS)) { + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + if (!MI->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); } @@ -11753,7 +12186,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(EnableSegmentedStacks); + assert(getTargetMachine().Options.EnableSegmentedStacks); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -11785,6 +12218,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), physSPReg = Is64Bit ? X86::RSP : X86::ESP; @@ -11802,33 +12236,39 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg) + BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) - .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg) - .addReg(tmpSPVReg); + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) + .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. + const uint32_t *RegMask = + getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI); + .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI) + .addRegMask(RegMask) + .addReg(X86::RAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space"); + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) @@ -11926,6 +12366,11 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); + // Get a register mask for the lowered call. + // FIXME: The 32-bit calls have non-standard calling conventions. 
Use a + // proper register mask. + const uint32_t *RegMask = + getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -11936,6 +12381,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); + MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) @@ -11946,6 +12392,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) @@ -11956,6 +12403,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI->eraseFromParent(); // The pseudo instruction is gone now. @@ -11966,30 +12414,14 @@ MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { - default: assert(0 && "Unexpected instr type to insert"); + default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: - assert(0 && "TAILJMP64 would not be touched here."); + llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: - // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. - // On AMD64, additional defs should be added before register allocation. - if (!Subtarget->isTargetWin64()) { - MI->addRegisterDefined(X86::RSI); - MI->addRegisterDefined(X86::RDI); - MI->addRegisterDefined(X86::XMM6); - MI->addRegisterDefined(X86::XMM7); - MI->addRegisterDefined(X86::XMM8); - MI->addRegisterDefined(X86::XMM9); - MI->addRegisterDefined(X86::XMM10); - MI->addRegisterDefined(X86::XMM11); - MI->addRegisterDefined(X86::XMM12); - MI->addRegisterDefined(X86::XMM13); - MI->addRegisterDefined(X86::XMM14); - MI->addRegisterDefined(X86::XMM15); - } return BB; case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); @@ -12294,11 +12726,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, //===----------------------------------------------------------------------===// void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { + unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || @@ -12307,7 +12739,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 
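  // For example, once the switch below reaches X86ISD::SETCC, every bit
  // above bit 0 is reported as known zero (for an assumed i8 result,
  // KnownZero would become 0xfe).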
switch (Opc) { default: break; case X86ISD::ADD: @@ -12326,8 +12758,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, break; // Fallthrough case X86ISD::SETCC: - KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - 1); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -12339,18 +12770,20 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, case Intrinsic::x86_sse2_movmsk_pd: case Intrinsic::x86_avx_movmsk_pd_256: case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse2_pmovmskb_128: { + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx2_pmovmskb: { // High bits of movmskp{s|d}, pmovmskb are known zero. switch (IntId) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; + case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; } - KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - NumLoBits); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } @@ -12418,7 +12851,8 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget* Subtarget) { DebugLoc dl = N->getDebugLoc(); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); SDValue V1 = SVOp->getOperand(0); @@ -12454,9 +12888,23 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) return SDValue(); + // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { + SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); + SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, + Ld->getMemoryVT(), + Ld->getPointerInfo(), + Ld->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); + } + // Emit a zeroed vector and insert the desired subvector on its // first half. - SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), DAG.getConstant(0, MVT::i32), DAG, dl); return DCI.CombineTo(N, InsV); @@ -12501,7 +12949,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, // Combine 256-bit vector shuffles. This is only profitable when in AVX mode if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE) - return PerformShuffleCombine256(N, DAG, DCI); + return PerformShuffleCombine256(N, DAG, DCI, Subtarget); // Only handle 128 wide vector from here on. 
  if (VT.getSizeInBits() != 128)
@@ -12517,11 +12965,185 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
 }
+
+/// PerformTruncateCombine - Converts a truncate operation into a
+/// sequence of vector shuffle operations.
+/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
+
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX()) return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                               DAG.getIntPtrConstant(2));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // PSHUFD
+    int ShufMask1[] = {0, 2, 0, 0};
+
+    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
+                                ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
+                                ShufMask1);
+
+    // MOVLHPS
+    int ShufMask2[] = {0, 1, 4, 5};
+
+    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
+  }
+  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                               DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
+
+    // PSHUFB
+    int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+                       -1, -1, -1, -1, -1, -1, -1, -1};
+
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // MOVLHPS
+    int ShufMask2[] = {0, 1, 4, 5};
+
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
+  }
+
+  return SDValue();
+}
+
+/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
+/// specific shuffle of a load can be folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// these shuffles have been custom lowered, so we need to handle them here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+                                                TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue InVec = N->getOperand(0);
+  SDValue EltNo = N->getOperand(1);
+
+  if (!isa<ConstantSDNode>(EltNo))
+    return SDValue();
+
+  EVT VT = InVec.getValueType();
+
+  bool HasShuffleIntoBitcast = false;
+  if (InVec.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!InVec.hasOneUse())
+      return SDValue();
+    EVT BCVT = InVec.getOperand(0).getValueType();
+    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+      return SDValue();
+    InVec = InVec.getOperand(0);
+    HasShuffleIntoBitcast = true;
+  }
+
+  if (!isTargetShuffle(InVec.getOpcode()))
+    return SDValue();
+
+  // Don't duplicate a load with other uses.
+  if (!InVec.hasOneUse())
+    return SDValue();
+
+  SmallVector<int, 16> ShuffleMask;
+  bool UnaryShuffle;
+  if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle))
+    return SDValue();
+
+  // Select the input vector, guarding against out of range extract vector.
+  unsigned NumElems = VT.getVectorNumElements();
+  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
                                          : InVec.getOperand(1);
+
+  // If inputs to shuffle are the same for both ops, then allow 2 uses
+  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+
+  if (LdNode.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+      return SDValue();
+
+    AllowedUses = 1; // only allow 1 load use if we have a bitcast
+    LdNode = LdNode.getOperand(0);
+  }
+
+  if (!ISD::isNormalLoad(LdNode.getNode()))
+    return SDValue();
+
+  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+    return SDValue();
+
+  if (HasShuffleIntoBitcast) {
+    // If there's a bitcast before the shuffle, check if the load type and
+    // alignment is valid.
+    unsigned Align = LN0->getAlignment();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NewAlign = TLI.getTargetData()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
+    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+      return SDValue();
+  }
+
+  // All checks match so transform back to vector_shuffle so that DAG combiner
+  // can finish the job
+  DebugLoc dl = N->getDebugLoc();
+
+  // Create shuffle node taking into account the case that its a unary shuffle
+  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
+  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+                                 InVec.getOperand(0), Shuffle,
+                                 &ShuffleMask[0]);
+  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+                     EltNo);
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// to a simple store and scalar loads to extract the elements.
 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
-                                                const TargetLowering &TLI) {
+                                                TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
+  if (NewOp.getNode())
+    return NewOp;
+
   SDValue InputVector = N->getOperand(0);
   // Only operate on vectors of 4 elements, where the alternative shuffling
@@ -12582,6 +13204,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     unsigned EltSize =
       InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
 
     SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
@@ -12590,7 +13213,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     // Load the scalar.
     SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                      ScalarAddr, MachinePointerInfo(),
-                                     false, false, 0);
+                                     false, false, false, 0);
 
     // Replace the extract with the load.
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); @@ -12603,7 +13226,10 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + + DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. @@ -12617,7 +13243,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && - (Subtarget->hasXMMInt() || + (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -12632,7 +13258,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -12642,7 +13268,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; @@ -12660,7 +13286,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; @@ -12670,7 +13296,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -12696,7 +13322,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -12706,7 +13332,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. 
- if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; @@ -12731,7 +13357,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -12848,6 +13474,57 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // Canonicalize max and min: + // (x > y) ? x : y -> (x >= y) ? x : y + // (x < y) ? x : y -> (x <= y) ? x : y + // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates + // the need for an extra compare + // against zero. e.g. + // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 + // subl %esi, %edi + // testl %edi, %edi + // movl $0, %eax + // cmovgl %edi, %eax + // => + // xorl %eax, %eax + // subl %esi, $edi + // cmovsl %eax, %edi + if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && + DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + switch (CC) { + default: break; + case ISD::SETLT: + case ISD::SETGT: { + ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; + Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(), + Cond.getOperand(0), Cond.getOperand(1), NewCC); + return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); + } + } + } + + // If we know that this node is legal then we know that it is going to be + // matched by one of the SSE/AVX BLEND instructions. These instructions only + // depend on the highest bit in each word. Try to use SimplifyDemandedBits + // to simplify previous instructions. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && + !DCI.isBeforeLegalize() && + TLI.isOperationLegal(ISD::VSELECT, VT)) { + unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); + } + return SDValue(); } @@ -13042,7 +13719,8 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. - if (N1C && N0.getOpcode() == ISD::AND && + if (VT.isInteger() && !VT.isVector() && + N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY || @@ -13058,26 +13736,46 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { } } + + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl. 
+ // (shl V, 1) -> add V,V + if (isSplatVector(N1.getNode())) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1C && (1 == N1C->getZExtValue())) { + return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); + } + } + return SDValue(); } /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts /// when possible. static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (!VT.isVector() && VT.isInteger() && - N->getOpcode() == ISD::SHL) - return PerformSHLCombine(N, DAG); + if (N->getOpcode() == ISD::SHL) { + SDValue V = PerformSHLCombine(N, DAG); + if (V.getNode()) return V; + } // On X86 with SSE2 support, we can transform this to a vector shift if // all elements are shifted by the same amount. We can't do this in legalize // because the a constant vector is typically transformed to a constant pool // so we have no knowledge of the shift amount. - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) return SDValue(); - if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + (!Subtarget->hasAVX2() || + (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); SDValue ShAmtOp = N->getOperand(1); @@ -13093,6 +13791,11 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, BaseShAmt = Arg; break; } + // Handle the case where the build_vector is all undef + // FIXME: Should DAG allow this? + if (i == NumElts) + return SDValue(); + for (; i != NumElts; ++i) { SDValue Arg = ShAmtOp.getOperand(i); if (Arg.getOpcode() == ISD::UNDEF) continue; @@ -13119,9 +13822,16 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, BaseShAmt = InVec.getOperand(1); } } - if (BaseShAmt.getNode() == 0) + if (BaseShAmt.getNode() == 0) { + // Don't create instructions with illegal types after legalize + // types has run. 
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) && + !DCI.isBeforeLegalize()) + return SDValue(); + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, DAG.getIntPtrConstant(0)); + } } else return SDValue(); @@ -13136,47 +13846,38 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift opcode!"); - break; case ISD::SHL: - if (VT == MVT::v2i64) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG); + } case ISD::SRA: - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v4i32: + case MVT::v8i16: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG); + } case ISD::SRL: - if (VT == MVT::v2i64) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG); + } } - return SDValue(); } @@ -13190,7 +13891,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. 
- if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); @@ -13300,7 +14001,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); - // Create ANDN instructions + // Create ANDN, BLSI, and BLSR instructions + // BLSI is X & (-X) + // BLSR is X & (X-1) if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -13313,6 +14016,26 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); + // Check LHS for neg + if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && + isZero(N0.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N1); + + // Check RHS for neg + if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && + isZero(N1.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N0); + + // Check LHS for X-1 + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N1); + + // Check RHS for X-1 + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N0); + return SDValue(); } @@ -13353,98 +14076,87 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return R; EVT VT = N->getValueType(0); - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) - return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // look for psign/blend - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { - if (VT == MVT::v2i64) { - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) - std::swap(N0, N1); - // or (and (m, x), (pandn m, y)) - if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { - SDValue Mask = N1.getOperand(0); - SDValue X = N1.getOperand(1); - SDValue Y; - if (N0.getOperand(0) == Mask) - Y = N0.getOperand(1); - if (N0.getOperand(1) == Mask) - Y = N0.getOperand(0); - - // Check to see if the mask appeared in both the AND and ANDNP and - if (!Y.getNode()) - return SDValue(); - - // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. - if (Mask.getOpcode() != ISD::BITCAST || - X.getOpcode() != ISD::BITCAST || - Y.getOpcode() != ISD::BITCAST) - return SDValue(); - - // Look through mask bitcast. - Mask = Mask.getOperand(0); - EVT MaskVT = Mask.getValueType(); - - // Validate that the Mask operand is a vector sra node. The sra node - // will be an intrinsic. - if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) - return SDValue(); - - // FIXME: what to do for bytes, since there is a psignb/pblendvb, but - // there is no psrai.b - switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - break; - default: return SDValue(); - } - - // Check that the SRA is all signbits. 
- SDValue SraC = Mask.getOperand(2); - unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); - unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); - if ((SraAmt + 1) != EltBits) - return SDValue(); + if (VT == MVT::v2i64 || VT == MVT::v4i64) { + if (!Subtarget->hasSSSE3() || + (VT == MVT::v4i64 && !Subtarget->hasAVX2())) + return SDValue(); - DebugLoc DL = N->getDebugLoc(); + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + // or (and (m, y), (pandn m, x)) + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); + + // Check to see if the mask appeared in both the AND and ANDNP and + if (!Y.getNode()) + return SDValue(); - // Now we know we at least have a plendvb with the mask val. See if - // we can form a psignb/w/d. - // psign = x.type == y.type == mask.type && y = sub(0, x); + // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. + // Look through mask bitcast. + if (Mask.getOpcode() == ISD::BITCAST) + Mask = Mask.getOperand(0); + if (X.getOpcode() == ISD::BITCAST) X = X.getOperand(0); + if (Y.getOpcode() == ISD::BITCAST) Y = Y.getOperand(0); - if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && - ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && - X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ - unsigned Opc = 0; - switch (EltBits) { - case 8: Opc = X86ISD::PSIGNB; break; - case 16: Opc = X86ISD::PSIGNW; break; - case 32: Opc = X86ISD::PSIGND; break; - default: break; - } - if (Opc) { - SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); - } - } - // PBLENDVB only available on SSE 4.1 - if (!(Subtarget->hasSSE41() || Subtarget->hasAVX())) - return SDValue(); - - X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); - Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); - Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); + + EVT MaskVT = Mask.getValueType(); + + // Validate that the Mask operand is a vector sra node. + // FIXME: what to do for bytes, since there is a psignb/pblendvb, but + // there is no psrai.b + if (Mask.getOpcode() != X86ISD::VSRAI) + return SDValue(); + + // Check that the SRA is all signbits. + SDValue SraC = Mask.getOperand(1); + unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + if ((SraAmt + 1) != EltBits) + return SDValue(); + + DebugLoc DL = N->getDebugLoc(); + + // Now we know we at least have a plendvb with the mask val. See if + // we can form a psignb/w/d. + // psign = x.type == y.type == mask.type && y = sub(0, x); + if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && + ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && + X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Unsupported VT for PSIGN"); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); + return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } + // PBLENDVB only available on SSE 4.1 + if (!Subtarget->hasSSE41()) + return SDValue(); + + EVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; + + X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); + Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); + Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } } + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); @@ -13500,6 +14212,36 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + EVT VT = N->getValueType(0); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); + + // Create BLSMSK instructions by finding X ^ (X-1) + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); + + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); + + return SDValue(); +} + /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -13515,7 +14257,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // shuffle. We need SSE4 for the shuffles. // TODO: It is possible to support ZExt by zeroing the undef values // during the shuffle phase or after the shuffle. - if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { + if (RegVT.isVector() && RegVT.isInteger() && + Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); @@ -13553,7 +14296,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->getAlignment()); + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); // Insert the word loaded into a vector. SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, @@ -13561,7 +14305,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector); + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, + ScalarInVector); unsigned SizeRatio = RegSz/MemSz; // Redistribute the loaded elements into the different locations. @@ -13593,7 +14338,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatination of two XMM registers, perform two stores. + // If we are saving a concatenation of two XMM registers, perform two stores. 
// This is better in Sandy Bridge cause one 256-bit mem op is done via two // 128-bit ones. If in the future the cost becomes only one memory access the // first version would be better. @@ -13703,8 +14448,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps - && Subtarget->hasXMMInt(); + bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps + && Subtarget->hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa<LoadSDNode>(St->getValue()) && @@ -13722,7 +14467,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Ld = cast<LoadSDNode>(St->getChain()); else if (St->getValue().hasOneUse() && ChainVal->getOpcode() == ISD::TokenFactor) { - for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { + for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { if (ChainVal->getOperand(i).getNode() == LdVal) { TokenFactorIndex = i; Ld = cast<LoadSDNode>(St->getValue()); @@ -13749,7 +14494,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->getAlignment()); + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(NewChain); @@ -13770,10 +14516,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), - Ld->getAlignment()); + Ld->isInvariant(), Ld->getAlignment()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), MinAlign(Ld->getAlignment(), 4)); SDValue NewChain = LoLd.getValue(1); @@ -13817,7 +14564,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. -static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > @@ -13833,7 +14580,18 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { return false; EVT VT = LHS.getValueType(); - unsigned N = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. 
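// [Editor's note -- illustration only, not part of the vendor source.]
// The per-lane behaviour assumed here, taking a 256-bit v8f32 HADD as the
// example, is:
//   dst[0]=LHS[0]+LHS[1]  dst[1]=LHS[2]+LHS[3]  dst[2]=RHS[0]+RHS[1]  dst[3]=RHS[2]+RHS[3]
//   dst[4]=LHS[4]+LHS[5]  dst[5]=LHS[6]+LHS[7]  dst[6]=RHS[4]+RHS[5]  dst[7]=RHS[6]+RHS[7]
// i.e. the upper 128-bit lane repeats the 128-bit pattern on elements 4..7,
// which is why the mask check that follows is done lane by lane.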
+ unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask @@ -13842,34 +14600,36 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. SDValue A, B; - SmallVector<int, 8> LMask(N); + SmallVector<int, 16> LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) A = LHS.getOperand(0); if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) B = LHS.getOperand(1); - cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), LMask.begin()); } else { if (LHS.getOpcode() != ISD::UNDEF) A = LHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector<int, 8> RMask(N); + SmallVector<int, 16> RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) C = RHS.getOperand(0); if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) D = RHS.getOperand(1); - cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), RMask.begin()); } else { if (RHS.getOpcode() != ISD::UNDEF) C = RHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } @@ -13884,30 +14644,28 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - for (unsigned i = 0; i != N; ++i) { - unsigned Idx = RMask[i]; - if (Idx < N) - RMask[i] += N; - else if (Idx < 2*N) - RMask[i] -= N; - } + CommuteVectorShuffleMask(RMask, NumElts); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != N; ++i) { - unsigned LIdx = LMask[i], RIdx = RMask[i]; + for (unsigned i = 0; i != NumElts; ++i) { + int LIdx = LMask[i], RIdx = RMask[i]; // Ignore any UNDEF components. - if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N)) - || (!B.getNode() && (LIdx >= N || RIdx >= N))) + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. 
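// [Editor's worked example -- illustration only, not in the vendor source.]
// Assuming VT = v8i32 (NumElts = 8, NumLaneElts = 4, HalfLaneElts = 2), the
// index check below expects for i = 2: Src = (2/2)%2 = 1,
// LaneStart = (2/4)*4 = 0, Index = 2*(2%2) + 8*1 + 0 = 8, so LMask[2] must be
// 8 and RMask[2] must be 9 (or swapped, when the op is commutative) --
// elements 0 and 1 of B feed result element 2, matching the per-lane HADD
// pattern sketched above.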
- if (!(LIdx == 2*i && RIdx == 2*i + 1) && - !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i)) + unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs + unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; + int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } @@ -13924,8 +14682,8 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && - (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -13939,8 +14697,8 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal subs from subs of shuffles. - if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && - (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -14006,7 +14764,58 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (!Subtarget->hasAVX()) + return SDValue(); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + EVT OpVT = Op.getValueType(); + DebugLoc dl = N->getDebugLoc(); + + if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || + (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { + + unsigned NumElems = OpVT.getVectorNumElements(); + SmallVector<int,8> ShufMask1(NumElems, -1); + for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i; + + SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), + ShufMask1.data()); + + SmallVector<int,8> ShufMask2(NumElems, -1); + for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2; + + SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), + ShufMask2.data()); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); + OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); + } + return SDValue(); +} + +static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. 
This transformation is necessary because
@@ -14014,6 +14823,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
DebugLoc dl = N->getDebugLoc();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
if (N0.getOpcode() == ISD::AND &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
@@ -14028,6 +14839,37 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, VT));
}
+ // Optimize vectors in AVX mode:
+ //
+ // v8i16 -> v8i32
+ // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
+ // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
+ // Concat upper and lower parts.
+ //
+ // v4i32 -> v4i64
+ // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
+ // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
+ // Concat upper and lower parts.
+ //
+ if (Subtarget->hasAVX()) {
+
+ if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
+ ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
+
+ SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
+ SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG);
+ SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG);
+
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+ }
+ }
+
return SDValue();
}
@@ -14136,7 +14978,24 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}
-static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
+/// PerformAddCombine - Do target-specific dag combines on integer adds.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ isHorizontalBinOp(Op0, Op1, true))
+ return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+
+ return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -14158,6 +15017,13 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
}
}
+ // Try to synthesize horizontal subs from subs of shuffles.
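// [Editor's illustration -- not part of the vendor change.] For v4i32, the
// shuffles matched here are equivalent to
//   Op0 = { a0, a2, b0, b2 } and Op1 = { a1, a3, b1, b3 },
// so Op0 - Op1 = { a0-a1, a2-a3, b0-b1, b2-b3 }, which is exactly what
// PHSUBD computes; the combine below rewrites the sub as X86ISD::HSUB.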
+ EVT VT = N->getValueType(0); + if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); + return OptimizeConditionalInDecrement(N, DAG); } @@ -14167,19 +15033,20 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: - case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); + case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); - case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); - case ISD::SUB: return PerformSubCombine(N, DAG); + case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); + case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: case ISD::SRA: - case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); + case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); + case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); @@ -14190,27 +15057,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); - case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); + case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget); + case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); + case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); - case X86ISD::SHUFPS: // Handle all target specific shuffles - case X86ISD::SHUFPD: + case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: @@ -14218,11 +15072,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); } @@ -14330,11 +15181,38 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support 
//===----------------------------------------------------------------------===// +namespace { + // Helper to match a string separated by whitespace. + bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { + s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. + + for (unsigned i = 0, e = args.size(); i != e; ++i) { + StringRef piece(*args[i]); + if (!s.startswith(piece)) // Check if the piece matches. + return false; + + s = s.substr(piece.size()); + StringRef::size_type pos = s.find_first_not_of(" \t"); + if (pos == 0) // We matched a prefix. + return false; + + s = s.substr(pos); + } + + return s.empty(); + } + const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; +} + bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector<StringRef, 4> AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); @@ -14342,35 +15220,27 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { switch (AsmPieces.size()) { default: return false; case 1: - AsmStr = AsmPieces[0]; - AsmPieces.clear(); - SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. - // FIXME: this should verify that we are targeting a 486 or better. If not, - // we will turn this bswap into something that will be lowered to logical ops - // instead of emitting the bswap asm. For now, we don't support 486 or lower - // so don't worry about this. + // we will turn this bswap into something that will be lowered to logical + // ops instead of emitting the bswap asm. For now, we don't support 486 or + // lower so don't worry about this. // bswap $0 - if (AsmPieces.size() == 2 && - (AsmPieces[0] == "bswap" || - AsmPieces[0] == "bswapq" || - AsmPieces[0] == "bswapl") && - (AsmPieces[1] == "$0" || - AsmPieces[1] == "${0:q}")) { + if (matchAsm(AsmPieces[0], "bswap", "$0") || + matchAsm(AsmPieces[0], "bswapl", "$0") || + matchAsm(AsmPieces[0], "bswapq", "$0") || + matchAsm(AsmPieces[0], "bswap", "${0:q}") || + matchAsm(AsmPieces[0], "bswapl", "${0:q}") || + matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. 
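// [Editor's illustration -- not part of the vendor change.] At the C level
// the accepted pattern is, e.g.:
//   unsigned f(unsigned x) { __asm__("bswap %0" : "=r"(x) : "0"(x)); return x; }
// which reaches this code as the IR asm string "bswap $0" with constraint
// string "=r,0" (plus clobbers); matchAsm allows any whitespace between the
// mnemonic and the operand, so "bswap   $0" also matches.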
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; return IntrinsicLowering::LowerToByteSwap(CI); } + // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && - AsmPieces.size() == 3 && - (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && - AsmPieces[1] == "$$8," && - AsmPieces[2] == "${0:w}" && - IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || + matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -14379,46 +15249,26 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && AsmPieces[2] == "~{flags}" && - AsmPieces[3] == "~{fpsr}") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } + AsmPieces[3] == "~{fpsr}") + return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && - IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { - SmallVector<StringRef, 4> Words; - SplitString(AsmPieces[0], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && - Words[2] == "${0:w}") { - Words.clear(); - SplitString(AsmPieces[1], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && - Words[2] == "$0") { - Words.clear(); - SplitString(AsmPieces[2], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && - Words[2] == "${0:w}") { - AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); - SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); - if (AsmPieces.size() == 4 && - AsmPieces[0] == "~{cc}" && - AsmPieces[1] == "~{dirflag}" && - AsmPieces[2] == "~{flags}" && - AsmPieces[3] == "~{fpsr}") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } - } - } - } + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && + matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && + matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { + AsmPieces.clear(); + const std::string &ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + std::sort(AsmPieces.begin(), AsmPieces.end()); + if (AsmPieces.size() == 4 && + AsmPieces[0] == "~{cc}" && + AsmPieces[1] == "~{dirflag}" && + AsmPieces[2] == "~{flags}" && + AsmPieces[3] == "~{fpsr}") + return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { @@ -14427,23 +15277,10 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 - SmallVector<StringRef, 4> Words; - SplitString(AsmPieces[0], Words, " \t"); - if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { - Words.clear(); - SplitString(AsmPieces[1], Words, " \t"); - if (Words.size() == 2 && 
Words[0] == "bswap" && Words[1] == "%edx") { - Words.clear(); - SplitString(AsmPieces[2], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && - Words[2] == "%edx") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } - } - } + if (matchAsm(AsmPieces[0], "bswap", "%eax") && + matchAsm(AsmPieces[1], "bswap", "%edx") && + matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) + return IntrinsicLowering::LowerToByteSwap(CI); } } break; @@ -14538,7 +15375,8 @@ TargetLowering::ConstraintWeight break; case 'x': case 'Y': - if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) + if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || + ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX())) weight = CW_Register; break; case 'I': @@ -14608,9 +15446,9 @@ LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { - if (Subtarget->hasXMMInt()) + if (Subtarget->hasSSE2()) return "Y"; - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) return "x"; } @@ -14816,10 +15654,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (!Subtarget->hasMMX()) break; return std::make_pair(0U, X86::VR64RegisterClass); case 'Y': // SSE_REGS if SSE2 allowed - if (!Subtarget->hasXMMInt()) break; + if (!Subtarget->hasSSE2()) break; // FALL THROUGH. - case 'x': // SSE_REGS if SSE1 allowed - if (!Subtarget->hasXMM()) break; + case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed + if (!Subtarget->hasSSE1()) break; switch (VT.getSimpleVT().SimpleTy) { default: break; @@ -14838,6 +15676,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v4f32: case MVT::v2f64: return std::make_pair(0U, X86::VR128RegisterClass); + // AVX types. + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + case MVT::v4i64: + case MVT::v8f32: + case MVT::v4f64: + return std::make_pair(0U, X86::VR256RegisterClass); + } break; } |