path: root/lib/Target/X86/X86ISelLowering.cpp
author     dim <dim@FreeBSD.org>  2011-10-20 21:10:27 +0000
committer  dim <dim@FreeBSD.org>  2011-10-20 21:10:27 +0000
commit     7b3392326c40c3c20697816acae597ba7b3144eb (patch)
tree       2cbcf22585e99f8a87d12d5ff94f392c0d266819 /lib/Target/X86/X86ISelLowering.cpp
parent     1176aa52646fe641a4243a246aa7f960c708a274 (diff)
download   FreeBSD-src-7b3392326c40c3c20697816acae597ba7b3144eb.zip
           FreeBSD-src-7b3392326c40c3c20697816acae597ba7b3144eb.tar.gz
Vendor import of llvm release_30 branch r142614:
http://llvm.org/svn/llvm-project/llvm/branches/release_30@142614
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--   lib/Target/X86/X86ISelLowering.cpp   2940
1 file changed, 2374 insertions, 566 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5096d9a..7c8ce17 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -51,6 +51,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
using namespace llvm;
using namespace dwarf;
@@ -71,9 +72,6 @@ static SDValue Extract128BitVector(SDValue Vec,
SelectionDAG &DAG,
DebugLoc dl);
-static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG);
-
-
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
@@ -85,14 +83,10 @@ static SDValue Extract128BitVector(SDValue Vec,
DebugLoc dl) {
EVT VT = Vec.getValueType();
assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
-
EVT ElVT = VT.getVectorElementType();
-
- int Factor = VT.getSizeInBits() / 128;
-
- EVT ResultVT = EVT::getVectorVT(*DAG.getContext(),
- ElVT,
- VT.getVectorNumElements() / Factor);
+ int Factor = VT.getSizeInBits()/128;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
// Extract from UNDEF is UNDEF.
if (Vec.getOpcode() == ISD::UNDEF)
@@ -111,7 +105,6 @@ static SDValue Extract128BitVector(SDValue Vec,
* ElemsPerChunk);
SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
VecIdx);
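Both Extract128BitVector here and Insert128BitVector in the next hunk normalize the element index down to the first element of the 128-bit chunk it falls in. A minimal standalone sketch of that arithmetic, using plain integers instead of LLVM's EVT/SDValue types (the helper name is illustrative only):

    // Round an element index down to the first element of its 128-bit chunk.
    // eltBits is the element width in bits (e.g. 32 for v8i32).
    static unsigned normalizeTo128BitChunk(unsigned idxVal, unsigned eltBits) {
      unsigned elemsPerChunk = 128 / eltBits;             // elements per 128-bit lane
      return ((idxVal * eltBits) / 128) * elemsPerChunk;  // start of that lane
    }
    // Example: idxVal = 5 in a v8i32 gives elemsPerChunk = 4 and result 4,
    // i.e. the extract or insert operates on the upper 128-bit half.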
@@ -136,21 +129,18 @@ static SDValue Insert128BitVector(SDValue Result,
assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
-
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
EVT ResultVT = Result.getValueType();
// Insert the relevant 128 bits.
- unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+ unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
// This is the index of the first element of the 128-bit chunk
// we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
* ElemsPerChunk);
SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
VecIdx);
return Result;
@@ -159,34 +149,6 @@ static SDValue Insert128BitVector(SDValue Result,
return SDValue();
}
-/// Given two vectors, concat them.
-static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG) {
- DebugLoc dl = Lower.getDebugLoc();
-
- assert(Lower.getValueType() == Upper.getValueType() && "Mismatched vectors!");
-
- EVT VT = EVT::getVectorVT(*DAG.getContext(),
- Lower.getValueType().getVectorElementType(),
- Lower.getValueType().getVectorNumElements() * 2);
-
- // TODO: Generalize to arbitrary vector length (this assumes 256-bit vectors).
- assert(VT.getSizeInBits() == 256 && "Unsupported vector concat!");
-
- // Insert the upper subvector.
- SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Upper,
- DAG.getConstant(
- // This is half the length of the result
- // vector. Start inserting the upper 128
- // bits here.
- Lower.getValueType().getVectorNumElements(),
- MVT::i32),
- DAG, dl);
-
- // Insert the lower subvector.
- Vec = Insert128BitVector(Vec, Lower, DAG.getConstant(0, MVT::i32), DAG, dl);
- return Vec;
-}
-
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
bool is64Bit = Subtarget->is64Bit();
@@ -197,11 +159,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
return new TargetLoweringObjectFileMachO();
}
- if (Subtarget->isTargetELF()) {
- if (is64Bit)
- return new X8664_ELFTargetObjectFile(TM);
- return new X8632_ELFTargetObjectFile(TM);
- }
+ if (Subtarget->isTargetELF())
+ return new TargetLoweringObjectFileELF();
if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
return new TargetLoweringObjectFileCOFF();
llvm_unreachable("unknown subtarget type");
@@ -222,6 +181,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// X86 is weird, it always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
+ // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit since we have so many registers use the ILP scheduler, for
// 32-bit code use the register pressure specific scheduling.
@@ -354,7 +315,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
} else if (!UseSoftFloat) {
- if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
+ // Since AVX is a superset of SSE3, only check for SSE here.
+ if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
@@ -417,15 +379,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ if (Subtarget->hasBMI()) {
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasLZCNT()) {
+ setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
}
if (Subtarget->hasPOPCNT()) {
@@ -491,8 +462,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
if (Subtarget->hasXMM())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
- // We may not have a libcall for MEMBARRIER so we should lower this.
setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
@@ -506,9 +477,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
MVT VT = IntVTs[i];
setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (!Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
@@ -518,6 +491,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
}
+ if (Subtarget->hasCmpxchg16b()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ }
+
// FIXME - use subtarget debug flags
if (!Subtarget->isTargetDarwin() &&
!Subtarget->isTargetELF() &&
@@ -539,7 +516,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
- setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -556,11 +534,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC,
- (Subtarget->is64Bit() ? MVT::i64 : MVT::i32),
- (Subtarget->isTargetCOFF()
- && !Subtarget->isTargetEnvMacho()
- ? Custom : Expand));
+
+ if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
+ else if (EnableSegmentedStacks)
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Expand);
if (!UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
@@ -739,7 +722,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
@@ -754,6 +737,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand);
for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction((MVT::SimpleValueType)VT,
@@ -816,7 +800,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
}
if (!UseSoftFloat && Subtarget->hasXMMInt()) {
@@ -846,10 +830,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
- setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
- setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
- setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
- setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
@@ -925,7 +909,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
}
- if (Subtarget->hasSSE41()) {
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
@@ -944,6 +928,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SHL, MVT::v16i8, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+
// i8 and i16 vectors are custom, because the source register and source
// memory operand types are not the same width. f32 vectors are
// custom since the immediate controlling the insert encodes additional
@@ -964,10 +954,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
}
- if (Subtarget->hasSSE2()) {
+ if (Subtarget->hasXMMInt()) {
setOperationAction(ISD::SRL, MVT::v2i64, Custom);
setOperationAction(ISD::SRL, MVT::v4i32, Custom);
setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i16, Custom);
setOperationAction(ISD::SHL, MVT::v2i64, Custom);
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
@@ -977,15 +968,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SRA, MVT::v8i16, Custom);
}
- if (Subtarget->hasSSE42())
- setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
+ if (Subtarget->hasSSE42() || Subtarget->hasAVX())
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
if (!UseSoftFloat && Subtarget->hasAVX()) {
- addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
@@ -1005,6 +997,59 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+
+ setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+ setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+ // Don't lower v32i8 because there is no 128-bit byte mul
+
// Custom lower several nodes for 256-bit types.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -1093,6 +1138,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
@@ -1100,7 +1146,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -1124,25 +1173,26 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
-MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
- return MVT::i8;
+EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+ if (!VT.isVector()) return MVT::i8;
+ return VT.changeVectorElementTypeToInteger();
}
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
-static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (MaxAlign == 16)
return;
- if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getBitWidth() == 128)
MaxAlign = 16;
- } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
unsigned EltAlign = 0;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
- } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
unsigned EltAlign = 0;
getMaxByValAlign(STy->getElementType(i), EltAlign);
@@ -1159,7 +1209,7 @@ static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
if (Subtarget->is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = TD->getABITypeAlignment(Ty);
@@ -1203,9 +1253,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16))) &&
Subtarget->getStackAlignment() >= 16) {
- if (Subtarget->hasSSE2())
+ if (Subtarget->hasAVX() &&
+ Subtarget->getStackAlignment() >= 32)
+ return MVT::v8f32;
+ if (Subtarget->hasXMMInt())
return MVT::v4i32;
- if (Subtarget->hasSSE1())
+ if (Subtarget->hasXMM())
return MVT::v4f32;
} else if (!MemcpyStrSrc && Size >= 8 &&
!Subtarget->is64Bit() &&
@@ -1408,7 +1461,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
- if (!Subtarget->hasSSE2())
+ if (!Subtarget->hasXMMInt())
ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
}
}
@@ -1700,6 +1753,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// places.
assert(VA.getValNo() != LastVal &&
"Don't support value assigned to multiple locs yet");
+ (void)LastVal;
LastVal = VA.getValNo();
if (VA.isRegLoc()) {
@@ -1917,6 +1971,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
+ FuncInfo->setArgumentStackSize(StackSize);
+
return Chain;
}
@@ -2744,8 +2800,6 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::MOVSD:
case X86ISD::UNPCKLPS:
case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPS:
- case X86ISD::VUNPCKLPD:
case X86ISD::VUNPCKLPSY:
case X86ISD::VUNPCKLPDY:
case X86ISD::PUNPCKLWD:
@@ -2754,10 +2808,17 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PUNPCKLQDQ:
case X86ISD::UNPCKHPS:
case X86ISD::UNPCKHPD:
+ case X86ISD::VUNPCKHPSY:
+ case X86ISD::VUNPCKHPDY:
case X86ISD::PUNPCKHWD:
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
case X86ISD::PUNPCKHQDQ:
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
+ case X86ISD::VPERM2F128:
return true;
}
return false;
@@ -2783,6 +2844,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
@@ -2796,6 +2861,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PALIGN:
case X86ISD::SHUFPD:
case X86ISD::SHUFPS:
+ case X86ISD::VPERM2F128:
return DAG.getNode(Opc, dl, VT, V1, V2,
DAG.getConstant(TargetMask, MVT::i8));
}
@@ -2815,8 +2881,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::MOVSD:
case X86ISD::UNPCKLPS:
case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPS:
- case X86ISD::VUNPCKLPD:
case X86ISD::VUNPCKLPSY:
case X86ISD::VUNPCKLPDY:
case X86ISD::PUNPCKLWD:
@@ -2825,6 +2889,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PUNPCKLQDQ:
case X86ISD::UNPCKHPS:
case X86ISD::UNPCKHPD:
+ case X86ISD::VUNPCKHPSY:
+ case X86ISD::VUNPCKHPDY:
case X86ISD::PUNPCKHWD:
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
@@ -3026,6 +3092,17 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val < 0) || (Val >= Low && Val < Hi);
}
+/// isUndefOrInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// range [Low, Hi) or is undef.
+static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
+ int Pos, int Size, int Low, int Hi) {
+ for (int i = Pos, e = Pos+Size; i != e; ++i)
+ if (!isUndefOrInRange(Mask[i], Low, Hi))
+ return false;
+ return true;
+}
+
/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
@@ -3034,6 +3111,17 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
return false;
}
+/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range [Low, Low+Size) or is undef.
+static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+ int Pos, int Size, int Low) {
+ for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ if (!isUndefOrEqual(Mask[i], Low))
+ return false;
+ return true;
+}
+
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
@@ -3104,11 +3192,13 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool hasSSSE3) {
+ bool hasSSSE3OrAVX) {
int i, e = VT.getVectorNumElements();
+ if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
+ return false;
// Do not handle v2i64 / v2f64 shuffles with palignr.
- if (e < 4 || !hasSSSE3)
+ if (e < 4 || !hasSSSE3OrAVX)
return false;
for (i = 0; i != e; ++i)
@@ -3119,42 +3209,176 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
if (i == e)
return false;
- // Determine if it's ok to perform a palignr with only the LHS, since we
- // don't have access to the actual shuffle elements to see if RHS is undef.
- bool Unary = Mask[i] < (int)e;
- bool NeedsUnary = false;
+ // Make sure we're shifting in the right direction.
+ if (Mask[i] <= i)
+ return false;
int s = Mask[i] - i;
// Check the rest of the elements to see if they are consecutive.
for (++i; i != e; ++i) {
int m = Mask[i];
- if (m < 0)
- continue;
+ if (m >= 0 && m != s+i)
+ return false;
+ }
+ return true;
+}
+
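A standalone sketch of the consecutive-shift test the rewritten isPALIGNRMask performs, on a plain int mask; the helper name and the std::vector interface are illustrative, and the byte-level scaling done later in getShufflePALIGNRImmediate is left out:

    #include <vector>
    // Find the element shift s implied by the first defined mask entry and
    // require every later defined entry to continue the sequence s + i.
    static bool isConsecutiveShiftMask(const std::vector<int> &mask, int &shift) {
      int e = (int)mask.size(), i = 0;
      while (i != e && mask[i] < 0) ++i;   // skip leading undefs
      if (i == e) return false;            // all-undef mask: reject
      if (mask[i] <= i) return false;      // must shift toward element 0
      shift = mask[i] - i;
      for (++i; i != e; ++i)
        if (mask[i] >= 0 && mask[i] != shift + i)
          return false;
      return true;
    }
    // Example: mask <1, 2, 3, 4> on a 4-element type yields shift = 1.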
+/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPSY.
+static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ if (NumElems != 8)
+ return false;
- Unary = Unary && (m < (int)e);
- NeedsUnary = NeedsUnary || (m < s);
+ // VSHUFPSY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
+ // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+ //
+ // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
+ // Y3..Y0, Y3..Y0, X3..X0, X3..X0
+ //
+ int QuarterSize = NumElems/4;
+ int HalfSize = QuarterSize*2;
+ for (int i = 0; i < QuarterSize; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+ return false;
+ for (int i = QuarterSize; i < QuarterSize*2; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+ return false;
- if (NeedsUnary && !Unary)
+ // The mask of the second half must be the same as the first but with
+ // the appropriate offsets. This works in the same way as VPERMILPS
+ // works with masks.
+ for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
+ if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+ return false;
+ int FstHalfIdx = i-HalfSize;
+ if (Mask[FstHalfIdx] < 0)
+ continue;
+ if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
return false;
- if (Unary && m != ((s+i) & (e-1)))
+ }
+ for (int i = QuarterSize*3; i < NumElems; ++i) {
+ if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
return false;
- if (!Unary && m != (s+i))
+ int FstHalfIdx = i-HalfSize;
+ if (Mask[FstHalfIdx] < 0)
+ continue;
+ if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
return false;
+
}
+
return true;
}
-bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPALIGNRMask(M, N->getValueType(0), true);
+/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VSHUFPSY instruction.
+static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ assert(NumElems == 8 && VT.getSizeInBits() == 256 &&
+ "Only supports v8i32 and v8f32 types");
+
+ int HalfSize = NumElems/2;
+ unsigned Mask = 0;
+ for (int i = 0; i != NumElems ; ++i) {
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ // The mask of the first half must be equal to the second one.
+ unsigned Shamt = (i%HalfSize)*2;
+ unsigned Elt = SVOp->getMaskElt(i) % HalfSize;
+ Mask |= Elt << Shamt;
+ }
+
+ return Mask;
+}
+
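The immediate built by getShuffleVSHUFPSYImmediate above packs two bits per result element, with both 128-bit halves folded onto the same 8-bit value. A standalone sketch of that encoding on a plain 8-entry mask (the function name here is illustrative):

    static unsigned encodeVShufPSYImm(const int mask[8]) {
      const int halfSize = 4;
      unsigned imm = 0;
      for (int i = 0; i != 8; ++i) {
        if (mask[i] < 0) continue;              // undef elements contribute nothing
        unsigned shamt = (i % halfSize) * 2;    // 2 bits per element, halves overlap
        unsigned elt = mask[i] % halfSize;      // index within the selected source half
        imm |= elt << shamt;
      }
      return imm;
    }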
+/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS
+/// version and the mask of the second half isn't tied to the first
+/// one.
+static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ if (NumElems != 4)
+ return false;
+
+ // VSHUFPDY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X3 X2 X1 X0
+ // SRC2 => Y3 Y2 Y1 Y0
+ //
+ // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0
+ //
+ int QuarterSize = NumElems/4;
+ int HalfSize = QuarterSize*2;
+ for (int i = 0; i < QuarterSize; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+ return false;
+ for (int i = QuarterSize; i < QuarterSize*2; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+ return false;
+ for (int i = QuarterSize*2; i < QuarterSize*3; ++i)
+ if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+ return false;
+ for (int i = QuarterSize*3; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
+ return false;
+
+ return true;
+}
+
+/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VSHUFPDY instruction.
+static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ assert(NumElems == 4 && VT.getSizeInBits() == 256 &&
+ "Only supports v4i64 and v4f64 types");
+
+ int HalfSize = NumElems/2;
+ unsigned Mask = 0;
+ for (int i = 0; i != NumElems ; ++i) {
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ int Elt = SVOp->getMaskElt(i) % HalfSize;
+ Mask |= Elt << i;
+ }
+
+ return Mask;
}
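getShuffleVSHUFPDYImmediate above uses one selector bit per result element, reduced modulo the half size. A standalone sketch for the 4 x 64-bit case (illustrative name, plain ints):

    static unsigned encodeVShufPDYImm(const int mask[4]) {
      const int halfSize = 2;
      unsigned imm = 0;
      for (int i = 0; i != 4; ++i) {
        if (mask[i] < 0) continue;                   // skip undef
        imm |= (unsigned)(mask[i] % halfSize) << i;  // one bit per element
      }
      return imm;
    }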
/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// SHUFPS and SHUFPD.
static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
int NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+
if (NumElems != 2 && NumElems != 4)
return false;
@@ -3204,7 +3428,13 @@ static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
- if (N->getValueType(0).getVectorNumElements() != 4)
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+
+ if (NumElems != 4)
return false;
// Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
@@ -3218,15 +3448,19 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
- unsigned NumElems = N->getValueType(0).getVectorNumElements();
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
if (NumElems != 4)
return false;
return isUndefOrEqual(N->getMaskElt(0), 2) &&
- isUndefOrEqual(N->getMaskElt(1), 3) &&
- isUndefOrEqual(N->getMaskElt(2), 2) &&
- isUndefOrEqual(N->getMaskElt(3), 3);
+ isUndefOrEqual(N->getMaskElt(1), 3) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
}
/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -3273,20 +3507,22 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
bool V2IsSplat = false) {
int NumElts = VT.getVectorNumElements();
- if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
return false;
- // Handle vector lengths > 128 bits. Define a "section" as a set of
- // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
- // sections.
- unsigned NumSections = VT.getSizeInBits() / 128;
- if (NumSections == 0 ) NumSections = 1; // Handle MMX
- unsigned NumSectionElts = NumElts / NumSections;
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
unsigned Start = 0;
- unsigned End = NumSectionElts;
- for (unsigned s = 0; s < NumSections; ++s) {
- for (unsigned i = Start, j = s * NumSectionElts;
+ unsigned End = NumLaneElts;
+ for (unsigned s = 0; s < NumLanes; ++s) {
+ for (unsigned i = Start, j = s * NumLaneElts;
i != End;
i += 2, ++j) {
int BitI = Mask[i];
@@ -3302,8 +3538,8 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
}
}
// Process the next 128 bits.
- Start += NumSectionElts;
- End += NumSectionElts;
+ Start += NumLaneElts;
+ End += NumLaneElts;
}
return true;
@@ -3320,21 +3556,38 @@ bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
bool V2IsSplat = false) {
int NumElts = VT.getVectorNumElements();
- if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
return false;
- for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
- if (!isUndefOrEqual(BitI, j + NumElts/2))
- return false;
- if (V2IsSplat) {
- if (isUndefOrEqual(BitI1, NumElts))
- return false;
- } else {
- if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ unsigned Start = 0;
+ unsigned End = NumLaneElts;
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != End; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
return false;
+ if (V2IsSplat) {
+ if (isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j+NumElts))
+ return false;
+ }
}
+ // Process the next 128 bits.
+ Start += NumLaneElts;
+ End += NumLaneElts;
}
return true;
}
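The reworked isUNPCKHMask above checks each 128-bit lane independently: within a lane, element pairs must interleave the lane's high half of V1 with the matching positions of V2. A standalone sketch of that pattern on plain ints, ignoring the V2IsSplat special case (names and the std::vector interface are illustrative; laneElts is assumed even):

    #include <vector>
    static bool isLaneWiseUnpckhMask(const std::vector<int> &mask,
                                     int numElts, int numLanes) {
      int laneElts = numElts / numLanes;
      for (int l = 0; l != numLanes; ++l)
        for (int i = 0, j = l * laneElts + laneElts / 2; i != laneElts;
             i += 2, ++j) {
          int idx = l * laneElts + i;
          if (mask[idx] >= 0 && mask[idx] != j)                   // element from V1
            return false;
          if (mask[idx + 1] >= 0 && mask[idx + 1] != j + numElts) // element from V2
            return false;
        }
      return true;
    }
    // Example: for v8i32 (numElts = 8, numLanes = 2) the canonical unpckh
    // mask is <2, 10, 3, 11, 6, 14, 7, 15>.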
@@ -3353,16 +3606,21 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
return false;
- // Handle vector lengths > 128 bits. Define a "section" as a set of
- // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
- // sections.
- unsigned NumSections = VT.getSizeInBits() / 128;
- if (NumSections == 0 ) NumSections = 1; // Handle MMX
- unsigned NumSectionElts = NumElems / NumSections;
+ // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
+ // FIXME: Need a better way to get rid of this, there's no latency difference
+ // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
+ // the former later. We should also remove the "_undef" special mask.
+ if (NumElems == 4 && VT.getSizeInBits() == 256)
+ return false;
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElems / NumLanes;
- for (unsigned s = 0; s < NumSections; ++s) {
- for (unsigned i = s * NumSectionElts, j = s * NumSectionElts;
- i != NumSectionElts * (s + 1);
+ for (unsigned s = 0; s < NumLanes; ++s) {
+ for (unsigned i = s * NumLaneElts, j = s * NumLaneElts;
+ i != NumLaneElts * (s + 1);
i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
@@ -3433,6 +3691,189 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
return ::isMOVLMask(M, N->getValueType(0));
}
+/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle below:
+/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from
+/// the second half of V2.
+static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ // The shuffle result is divided into half A and half B. In total the two
+ // sources have 4 halves, namely: C, D, E, F. The final values of A and
+ // B must come from C, D, E or F.
+ int HalfSize = VT.getVectorNumElements()/2;
+ bool MatchA = false, MatchB = false;
+
+ // Check if A comes from one of C, D, E, F.
+ for (int Half = 0; Half < 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+ MatchA = true;
+ break;
+ }
+ }
+
+ // Check if B comes from one of C, D, E, F.
+ for (int Half = 0; Half < 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+ MatchB = true;
+ break;
+ }
+ }
+
+ return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
+static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int HalfSize = VT.getVectorNumElements()/2;
+
+ int FstHalf = 0, SndHalf = 0;
+ for (int i = 0; i < HalfSize; ++i) {
+ if (SVOp->getMaskElt(i) > 0) {
+ FstHalf = SVOp->getMaskElt(i)/HalfSize;
+ break;
+ }
+ }
+ for (int i = HalfSize; i < HalfSize*2; ++i) {
+ if (SVOp->getMaskElt(i) > 0) {
+ SndHalf = SVOp->getMaskElt(i)/HalfSize;
+ break;
+ }
+ }
+
+ return (FstHalf | (SndHalf << 4));
+}
+
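getShuffleVPERM2F128Immediate above derives one source-half selector per result half from the first defined (strictly positive, as in the hunk) mask element, and packs the two selectors into bits [1:0] and [5:4]. A standalone sketch with plain ints (illustrative name):

    static unsigned encodeVPerm2F128Imm(const int mask[], int numElts) {
      int halfSize = numElts / 2;
      int fstHalf = 0, sndHalf = 0;
      for (int i = 0; i < halfSize; ++i)
        if (mask[i] > 0) { fstHalf = mask[i] / halfSize; break; }
      for (int i = halfSize; i < numElts; ++i)
        if (mask[i] > 0) { sndHalf = mask[i] / halfSize; break; }
      return (unsigned)(fstHalf | (sndHalf << 4));
    }
    // Example from the isVPERM2F128Mask comment: <4, 5, 6, 7, 12, 13, 14, 15>
    // on an 8-element type gives fstHalf = 1, sndHalf = 3, immediate 0x31.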
+/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching is different depending on whether the
+/// underlying type is 32 or 64 bits. In VPERMILPS the high half of the mask
+/// should point to the same elements as the low half, but into the higher
+/// half of the source. In VPERMILPD the two lanes can be shuffled
+/// independently of each other, with the same restriction that lanes can't
+/// be crossed.
+static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+
+ if (!Subtarget->hasAVX())
+ return false;
+
+ // Only match 256-bit with 64-bit types
+ if (VT.getSizeInBits() != 256 || NumElts != 4)
+ return false;
+
+ // The mask on the high lane is independent of the low. Both can match
+ // any element inside its own lane, but can't cross.
+ int LaneSize = NumElts/NumLanes;
+ for (int l = 0; l < NumLanes; ++l)
+ for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+ int LaneStart = l*LaneSize;
+ if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
+ return false;
+ }
+
+ return true;
+}
+
+/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
+/// Note that VPERMIL mask matching is different depending on whether the
+/// underlying type is 32 or 64 bits. In VPERMILPS the high half of the mask
+/// should point to the same elements as the low half, but into the higher
+/// half of the source. In VPERMILPD the two lanes can be shuffled
+/// independently of each other, with the same restriction that lanes can't
+/// be crossed.
+static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+
+ if (!Subtarget->hasAVX())
+ return false;
+
+ // Only match 256-bit with 32-bit types
+ if (VT.getSizeInBits() != 256 || NumElts != 8)
+ return false;
+
+ // The mask on the high lane should be the same as the low. Actually,
+ // they can differ if a corresponding index in one lane is undef
+ // and the other stays in range.
+ int LaneSize = NumElts/NumLanes;
+ for (int i = 0; i < LaneSize; ++i) {
+ int HighElt = i+LaneSize;
+ bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
+ bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
+
+ if (!HighValid || !LowValid)
+ return false;
+ if (Mask[i] < 0 || Mask[HighElt] < 0)
+ continue;
+ if (Mask[HighElt]-Mask[i] != LaneSize)
+ return false;
+ }
+
+ return true;
+}
+
+/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPS* instructions.
+static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+ int LaneSize = NumElts/NumLanes;
+
+ // Although the mask is equal for both lanes, do it twice to catch the cases
+ // where a mask will match because the same mask element is undef in the
+ // first half but defined in the second. This handles cases such as:
+ // shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
+ unsigned Mask = 0;
+ for (int l = 0; l < NumLanes; ++l) {
+ for (int i = 0; i < LaneSize; ++i) {
+ int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
+ if (MaskElt < 0)
+ continue;
+ if (MaskElt >= LaneSize)
+ MaskElt -= LaneSize;
+ Mask |= MaskElt << (i*2);
+ }
+ }
+
+ return Mask;
+}
+
+/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
+static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+
+ unsigned Mask = 0;
+ int LaneSize = NumElts/NumLanes;
+ for (int l = 0; l < NumLanes; ++l)
+ for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+ int MaskElt = SVOp->getMaskElt(i);
+ if (MaskElt < 0)
+ continue;
+ Mask |= (MaskElt-l*LaneSize) << i;
+ }
+
+ return Mask;
+}
+
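getShuffleVPERMILPSImmediate above encodes two bits per in-lane position and walks both lanes, so an element left undef in one lane can still be supplied by the other. A standalone sketch for the v8f32 case (illustrative name, plain ints):

    static unsigned encodeVPermilPSImm(const int mask[8]) {
      const int numLanes = 2, laneSize = 4;
      unsigned imm = 0;
      for (int l = 0; l != numLanes; ++l)
        for (int i = 0; i != laneSize; ++i) {
          int m = mask[i + l * laneSize];
          if (m < 0) continue;                // undef contributes nothing
          if (m >= laneSize) m -= laneSize;   // reduce high-lane index in-lane
          imm |= (unsigned)m << (i * 2);      // 2 bits per in-lane position
        }
      return imm;
    }
    // Example: <u, 0, 1, 2, 4, 4, 5, 6> and <0, 0, 1, 2, 4, 4, 5, 6> encode
    // the same immediate, because the second lane supplies element 0.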
/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want. X86 movs requires the lowest element to be lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
@@ -3463,58 +3904,92 @@ static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
-bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
- if (N->getValueType(0).getVectorNumElements() != 4)
+/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
+bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
return false;
- // Expect 1, 1, 3, 3
- for (unsigned i = 0; i < 2; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt >= 0 && Elt != 1)
- return false;
- }
+ // The second vector must be undef
+ if (N->getOperand(1).getOpcode() != ISD::UNDEF)
+ return false;
- bool HasHi = false;
- for (unsigned i = 2; i < 4; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt >= 0 && Elt != 3)
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
+ (VT.getSizeInBits() == 256 && NumElems != 8))
+ return false;
+
+ // "i+1" is the value the indexed mask element must have
+ for (unsigned i = 0; i < NumElems; i += 2)
+ if (!isUndefOrEqual(N->getMaskElt(i), i+1) ||
+ !isUndefOrEqual(N->getMaskElt(i+1), i+1))
return false;
- if (Elt == 3)
- HasHi = true;
- }
- // Don't use movshdup if it can be done with a shufps.
- // FIXME: verify that matching u, u, 3, 3 is what we want.
- return HasHi;
+
+ return true;
}
/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
-bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
- if (N->getValueType(0).getVectorNumElements() != 4)
+/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
+bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
+ return false;
+
+ // The second vector must be undef
+ if (N->getOperand(1).getOpcode() != ISD::UNDEF)
+ return false;
+
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
+ (VT.getSizeInBits() == 256 && NumElems != 8))
return false;
- // Expect 0, 0, 2, 2
- for (unsigned i = 0; i < 2; ++i)
- if (N->getMaskElt(i) > 0)
+ // "i" is the value the indexed mask element must have
+ for (unsigned i = 0; i < NumElems; i += 2)
+ if (!isUndefOrEqual(N->getMaskElt(i), i) ||
+ !isUndefOrEqual(N->getMaskElt(i+1), i))
return false;
- bool HasHi = false;
- for (unsigned i = 2; i < 4; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt >= 0 && Elt != 2)
+ return true;
+}
+
+/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPYMask(ShuffleVectorSDNode *N,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ int NumElts = VT.getVectorNumElements();
+ bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF;
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 ||
+ !V2IsUndef || NumElts != 4)
+ return false;
+
+ for (int i = 0; i != NumElts/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), 0))
return false;
- if (Elt == 2)
- HasHi = true;
- }
- // Don't use movsldup if it can be done with a shufps.
- return HasHi;
+ for (int i = NumElts/2; i != NumElts; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2))
+ return false;
+ return true;
}
/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// version of MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
- int e = N->getValueType(0).getVectorNumElements() / 2;
+ EVT VT = N->getValueType(0);
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+ int e = VT.getVectorNumElements() / 2;
for (int i = 0; i < e; ++i)
if (!isUndefOrEqual(N->getMaskElt(i), i))
return false;
@@ -3627,6 +4102,7 @@ unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
if (Val >= 0)
break;
}
+ assert(Val - i > 0 && "PALIGNR imm should be positive");
return (Val - i) * EltSize;
}
@@ -3644,7 +4120,6 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
EVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
return Index / NumElemsPerChunk;
}
@@ -3662,7 +4137,6 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
EVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
return Index / NumElemsPerChunk;
}
@@ -3716,7 +4190,10 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
- if (Op->getValueType(0).getVectorNumElements() != 4)
+ EVT VT = Op->getValueType(0);
+ if (VT.getSizeInBits() != 128)
+ return false;
+ if (VT.getVectorNumElements() != 4)
return false;
for (unsigned i = 0, e = 2; i != e; ++i)
if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
@@ -3748,6 +4225,10 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
ShuffleVectorSDNode *Op) {
+ EVT VT = Op->getValueType(0);
+ if (VT.getSizeInBits() != 128)
+ return false;
+
if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
return false;
// Is V2 is a vector load, don't do this transformation. We will try to use
@@ -3755,7 +4236,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
if (ISD::isNON_EXTLoad(V2))
return false;
- unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
if (NumElems != 2 && NumElems != 4)
return false;
@@ -3811,7 +4292,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
+static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
@@ -3819,7 +4300,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
if (VT.getSizeInBits() == 128) { // SSE
- if (HasSSE2) { // SSE2
+ if (HasXMMInt) { // SSE2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
} else { // SSE1
@@ -3838,21 +4319,25 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
}
/// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32> bitcasted to
-/// their original type, ensuring they get CSE'd.
+/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
+/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
+/// original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
assert((VT.is128BitVector() || VT.is256BitVector())
&& "Expected a 128-bit or 256-bit vector type");
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ Cst, Cst, Cst, Cst);
- SDValue Vec;
if (VT.is256BitVector()) {
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
- } else
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+ Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Vec = Insert128BitVector(InsV, Vec,
+ DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ }
+
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
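The new getOnesVector builds a single v4i32 all-ones block and, for 256-bit types, inserts it into both 128-bit halves before the final bitcast. A value-level sketch of the resulting layout (not the DAG construction itself; the helper name is illustrative):

    #include <cstdint>
    #include <cstring>
    // Fill a 256-bit buffer the way the lowered constant ends up laid out:
    // the same 128-bit all-ones block in the low and high halves.
    static void buildOnes256(uint32_t out[8]) {
      const uint32_t ones[4] = { ~0u, ~0u, ~0u, ~0u };  // the v4i32 constant
      std::memcpy(out + 0, ones, sizeof(ones));         // insert at index 0
      std::memcpy(out + 4, ones, sizeof(ones));         // insert at NumElems/2 == 4
    }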
@@ -3902,7 +4387,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
@@ -3915,31 +4400,95 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
- EVT PVT = MVT::v4f32;
- EVT VT = SV->getValueType(0);
- DebugLoc dl = SV->getDebugLoc();
- SDValue V1 = SV->getOperand(0);
+// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target supported shuffles.
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+ EVT VT = V.getValueType();
int NumElems = VT.getVectorNumElements();
- int EltNo = SV->getSplatIndex();
+ DebugLoc dl = V.getDebugLoc();
- // unpack elements to the correct location
while (NumElems > 4) {
if (EltNo < NumElems/2) {
- V1 = getUnpackl(DAG, dl, VT, V1, V1);
+ V = getUnpackl(DAG, dl, VT, V, V);
} else {
- V1 = getUnpackh(DAG, dl, VT, V1, V1);
+ V = getUnpackh(DAG, dl, VT, V, V);
EltNo -= NumElems/2;
}
NumElems >>= 1;
}
+ return V;
+}
+
+/// getLegalSplat - Generate a legal splat with supported x86 shuffles
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+ EVT VT = V.getValueType();
+ DebugLoc dl = V.getDebugLoc();
+ assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+ && "Vector size not supported");
+
+ if (VT.getSizeInBits() == 128) {
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
+ int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+ V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+ &SplatMask[0]);
+ } else {
+ // To use VPERMILPS to splat scalars, the second half of indices must
+ // refer to the higher part, which is a duplication of the lower one,
+ // because VPERMILPS can only handle in-lane permutations.
+ int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+ EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
+
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+ V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+ &SplatMask[0]);
+ }
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+ EVT SrcVT = SV->getValueType(0);
+ SDValue V1 = SV->getOperand(0);
+ DebugLoc dl = SV->getDebugLoc();
+
+ int EltNo = SV->getSplatIndex();
+ int NumElems = SrcVT.getVectorNumElements();
+ unsigned Size = SrcVT.getSizeInBits();
+
+ assert(((Size == 128 && NumElems > 4) || Size == 256) &&
+ "Unknown how to promote splat for type");
+
+ // Extract the 128-bit part containing the splat element and update
+ // the splat element index when it refers to the higher register.
+ if (Size == 256) {
+ unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+ V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
+ if (Idx > 0)
+ EltNo -= NumElems/2;
+ }
+
+ // i16 and i8 vector types can't be used directly by a generic shuffle
+ // instruction because the target has no such instruction. Generate shuffles
+ // which repeat i16 and i8 several times until they fit in i32, and then can
+ // be manipulated by target supported shuffles.
+ EVT EltVT = SrcVT.getVectorElementType();
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ V1 = PromoteSplati8i16(V1, DAG, EltNo);
+
+ // Recreate the 256-bit vector and place the same 128-bit vector
+ // into the low and high part. This is necessary because we want
+ // to use VPERM* to shuffle the vectors.
+ if (Size == 256) {
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ V1 = Insert128BitVector(InsV, V1,
+ DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ }
- // Perform the splat.
- int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
- V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1);
- V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
- return DAG.getNode(ISD::BITCAST, dl, VT, V1);
+ return getLegalSplat(DAG, V1, EltNo);
}
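A rough sketch of just the index bookkeeping in the 256-bit path of PromoteSplat/getLegalSplat above, assuming an 8 x 32-bit vector; note it uses >= for the half test where the hunk uses a strict >, and the DAG extract, insert, and bitcast steps are omitted (names are illustrative):

    #include <vector>
    static std::vector<int> splatMaskFor256(int eltNo, int numElems) {
      int half = numElems / 2;
      if (eltNo >= half)   // splat element lives in the high 128-bit half
        eltNo -= half;     // rebase after extracting that half
      // The same 128-bit vector is re-inserted into both halves, so the
      // VPERMILPS-style mask repeats in-lane: <e, e, e, e, e+4, e+4, e+4, e+4>.
      std::vector<int> mask;
      for (int i = 0; i != 4; ++i) mask.push_back(eltNo);
      for (int i = 0; i != 4; ++i) mask.push_back(eltNo + 4);
      return mask;
    }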
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
@@ -3947,11 +4496,11 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
- bool isZero, bool HasSSE2,
- SelectionDAG &DAG) {
+ bool isZero, bool HasXMMInt,
+ SelectionDAG &DAG) {
EVT VT = V2.getValueType();
SDValue V1 = isZero
- ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec;
for (unsigned i = 0; i != NumElems; ++i)
@@ -4005,6 +4554,8 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
break;
case X86ISD::UNPCKHPS:
case X86ISD::UNPCKHPD:
+ case X86ISD::VUNPCKHPSY:
+ case X86ISD::VUNPCKHPDY:
DecodeUNPCKHPMask(NumElems, ShuffleMask);
break;
case X86ISD::PUNPCKLBW:
@@ -4015,8 +4566,6 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
break;
case X86ISD::UNPCKLPS:
case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPS:
- case X86ISD::VUNPCKLPD:
case X86ISD::VUNPCKLPSY:
case X86ISD::VUNPCKLPDY:
DecodeUNPCKLPMask(VT, ShuffleMask);
@@ -4052,8 +4601,41 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
+ case X86ISD::VPERMILPS:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERMILPSY:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERMILPD:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERMILPDY:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERM2F128:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVLPS:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::PALIGN:
+ return SDValue(); // Not yet implemented.
default:
- assert("not implemented for target shuffle node");
+ assert(0 && "unknown target shuffle node");
return SDValue();
}
@@ -4205,6 +4787,11 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ // Although the logic below supports any bit width, there are no
+ // shift instructions which handle more than 128-bit vectors.
+ if (SVOp->getValueType(0).getSizeInBits() > 128)
+ return false;
+
if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
return true;
@@ -4295,6 +4882,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, DebugLoc dl) {
+ assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
@@ -4333,42 +4921,52 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
return SDValue();
}
+ // FIXME: 256-bit vector instructions don't require a strict alignment;
+ // improve this code to support it better.
+ unsigned RequiredAlign = VT.getSizeInBits()/8;
SDValue Chain = LD->getChain();
- // Make sure the stack object alignment is at least 16.
+ // Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- if (DAG.InferPtrAlignment(Ptr) < 16) {
+ if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
if (MFI->isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
// If someone *really* cares about this. That's the way to implement it.
return SDValue();
} else {
- MFI->setObjectAlignment(FI, 16);
+ MFI->setObjectAlignment(FI, RequiredAlign);
}
}
- // (Offset % 16) must be multiple of 4. Then address is then
+ // (Offset % 16 or 32) must be a multiple of 4. The address is then
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
- if ((Offset % 16) & 3)
+ if ((Offset % RequiredAlign) & 3)
return SDValue();
- int64_t StartOffset = Offset & ~15;
+ int64_t StartOffset = Offset & ~(RequiredAlign-1);
if (StartOffset)
Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
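+ // E.g. with a 16-byte requirement and Offset == 20, StartOffset is 16 and
+ // EltNo is 1: the load is rebased to Ptr+16 and element 1 is then splatted.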
- int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
- EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
- SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,
+ int NumElems = VT.getVectorNumElements();
+
+ EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+ SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, 0);
- // Canonicalize it to a v4i32 shuffle.
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getVectorShuffle(MVT::v4i32, dl, V1,
- DAG.getUNDEF(MVT::v4i32),&Mask[0]));
+
+ // Canonicalize it to a v4i32 or v8i32 shuffle.
+ SmallVector<int, 8> Mask;
+ for (int i = 0; i < NumElems; ++i)
+ Mask.push_back(EltNo);
+
+ V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
+ return DAG.getNode(ISD::BITCAST, dl, NVT,
+ DAG.getVectorShuffle(CanonVT, dl, V1,
+ DAG.getUNDEF(CanonVT),&Mask[0]));
}
return SDValue();
@@ -4428,12 +5026,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
LDBase->getPointerInfo(),
LDBase->isVolatile(), LDBase->isNonTemporal(),
LDBase->getAlignment());
- } else if (NumElems == 4 && LastLoadedElt == 1) {
+ } else if (NumElems == 4 && LastLoadedElt == 1 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
- SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
- Ops, 2, MVT::i32,
- LDBase->getMemOperand());
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
+ LDBase->getPointerInfo(),
+ LDBase->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
}
return SDValue();
@@ -4445,47 +5047,26 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
EVT ExtVT = VT.getVectorElementType();
-
unsigned NumElems = Op.getNumOperands();
- // For AVX-length vectors, build the individual 128-bit pieces and
- // use shuffles to put them in place.
- if (VT.getSizeInBits() > 256 &&
- Subtarget->hasAVX() &&
- !ISD::isBuildVectorAllZeros(Op.getNode())) {
- SmallVector<SDValue, 8> V;
- V.resize(NumElems);
- for (unsigned i = 0; i < NumElems; ++i) {
- V[i] = Op.getOperand(i);
- }
-
- EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
-
- // Build the lower subvector.
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
- // Build the upper subvector.
- SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
- NumElems/2);
+ // Vectors containing all zeros can be matched by pxor and xorps later
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+ // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+ if (Op.getValueType() == MVT::v4i32 ||
+ Op.getValueType() == MVT::v8i32)
+ return Op;
- return ConcatVectors(Lower, Upper, DAG);
+ return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl);
}
- // All zero's:
- // - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX)
- // All one's:
- // - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX)
- if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
- ISD::isBuildVectorAllOnes(Op.getNode())) {
- // Canonicalize this to <4 x i32> or <8 x 32> (SSE) to
- // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
- // eliminated on x86-32 hosts.
- if (Op.getValueType() == MVT::v4i32 ||
- Op.getValueType() == MVT::v8i32)
+ // Vectors containing all ones can be matched by pcmpeqd on 128-bit wide
+ // vectors or broken into v4i32 operations on 256-bit vectors.
+ if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (Op.getValueType() == MVT::v4i32)
return Op;
- if (ISD::isBuildVectorAllOnes(Op.getNode()))
- return getOnesVector(Op.getValueType(), DAG, dl);
- return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+ return getOnesVector(Op.getValueType(), DAG, dl);
}
unsigned EVTBits = ExtVT.getSizeInBits();
@@ -4538,7 +5119,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
// Now we have our 32-bit value zero extended in the low element of
// a vector. If Idx != 0, swizzle it into place.
@@ -4566,7 +5147,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
- return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
+ return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(),
DAG);
} else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
@@ -4574,7 +5155,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
EVT MiddleVT = MVT::v4i32;
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
}
}
@@ -4603,7 +5184,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(i == Idx ? 0 : 1);
@@ -4631,6 +5212,27 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
+ // For AVX-length vectors, build the individual 128-bit pieces and use
+ // shuffles to put them in place.
+ if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
+ SmallVector<SDValue, 32> V;
+ for (unsigned i = 0; i < NumElems; ++i)
+ V.push_back(Op.getOperand(i));
+
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+ // Build both the lower and upper subvector.
+ SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
+ SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
+ NumElems/2);
+
+ // Recreate the wider vector with the lower and upper part.
+ SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
@@ -4639,7 +5241,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
}
return SDValue();
}
@@ -4664,7 +5266,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
if (isZero)
- V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
else
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
@@ -4708,7 +5310,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return LD;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41()) {
+ if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -4758,13 +5360,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue
-X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
- // We support concatenate two MMX registers and place them in a MMX
- // register. This is better than doing a stack convert.
+// LowerMMXCONCAT_VECTORS - We support concatenating two MMX registers and
+// placing them in an MMX register. This is better than doing a stack convert.
+static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
EVT ResVT = Op.getValueType();
- assert(Op.getNumOperands() == 2);
+
assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
int Mask[2];
@@ -4785,6 +5386,42 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
}
+// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ EVT ResVT = Op.getValueType();
+
+ assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = ResVT.getVectorNumElements();
+
+ SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+}
+
+SDValue
+X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+ EVT ResVT = Op.getValueType();
+
+ assert(Op.getNumOperands() == 2);
+ assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
+ "Unsupported CONCAT_VECTORS for value type");
+
+ // We support concatenating two MMX registers and placing them in an MMX
+ // register. This is better than doing a stack convert.
+ if (ResVT.is128BitVector())
+ return LowerMMXCONCAT_VECTORS(Op, DAG);
+
+ // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+ // from two other 128-bit ones.
+ return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
@@ -4844,7 +5481,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
BestLoQuad = InputQuads.find_first();
BestHiQuad = InputQuads.find_next(BestLoQuad);
@@ -4917,7 +5554,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
@@ -4985,7 +5622,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFLWImmediate(NewV.getNode()),
@@ -5013,7 +5651,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFHWImmediate(NewV.getNode()),
@@ -5079,7 +5718,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
- if (TLI.getSubtarget()->hasSSSE3()) {
+ if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
@@ -5276,15 +5915,109 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
OpVT, SrcOp)));
}
-/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
-/// shuffles.
+/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
+/// shuffle node refers to only one lane in the sources.
+static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ int HalfSize = NumElems/2;
+ SmallVector<int, 16> M;
+ SVOp->getMask(M);
+ bool MatchA = false, MatchB = false;
+
+ for (int l = 0; l < NumElems*2; l += HalfSize) {
+ if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
+ MatchA = true;
+ break;
+ }
+ }
+
+ for (int l = 0; l < NumElems*2; l += HalfSize) {
+ if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
+ MatchB = true;
+ break;
+ }
+ }
+
+ return MatchA && MatchB;
+}
+
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
+/// which could not be matched by any known target specific shuffle.
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
+ // If each half of a vector shuffle node refers to only one lane in the
+ // source vectors, extract each used 128-bit lane and shuffle them using
+ // 128-bit shuffles. Then, concatenate the results. Otherwise leave
+ // the work to the legalizer.
+ DebugLoc dl = SVOp->getDebugLoc();
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ int HalfSize = NumElems/2;
+
+ // Extract the reference for each half
+ int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
+ int FstVecOpNum = 0, SndVecOpNum = 0;
+ for (int i = 0; i < HalfSize; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ FstVecOpNum = Elt/NumElems;
+ FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+ break;
+ }
+ for (int i = HalfSize; i < NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ SndVecOpNum = Elt/NumElems;
+ SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+ break;
+ }
+
+ // Extract the subvectors
+ SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
+ DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
+ SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
+ DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
+
+ // Generate 128-bit shuffles
+ SmallVector<int, 16> MaskV1, MaskV2;
+ for (int i = 0; i < HalfSize; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ }
+ for (int i = HalfSize; i < NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ }
+
+ EVT NVT = V1.getValueType();
+ V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
+ V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
+
+ // Concatenate the result back
+ SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
+ return SDValue();
+}
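+// Example: a v8f32 shuffle with mask <2,3,2,3,13,12,15,14> takes its low half
+// entirely from lane 0 of V1 and its high half from lane 1 of V2, so it is
+// lowered as two v4f32 shuffles (<2,3,2,3> and <1,0,3,2>) on the extracted
+// halves followed by a 128-bit insert back into a 256-bit result.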
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
static SDValue
-LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
EVT VT = SVOp->getValueType(0);
+ assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+
SmallVector<std::pair<int, int>, 8> Locs;
Locs.resize(4);
SmallVector<int, 8> Mask1(4U, -1);
@@ -5542,18 +6275,21 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
static
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
- bool HasSSE2) {
+ bool HasXMMInt) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
assert(VT != MVT::v2i64 && "unsupported shuffle type");
- if (HasSSE2 && VT == MVT::v2f64)
+ if (HasXMMInt && VT == MVT::v2f64)
return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
- // v4f32 or v4i32
- return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
+ // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
}
static
@@ -5572,8 +6308,24 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}
+static inline unsigned getSHUFPOpcode(EVT VT) {
+ switch(VT.getSimpleVT().SimpleTy) {
+ case MVT::v8i32: // Use fp unit for int shuffle.
+ case MVT::v8f32:
+ case MVT::v4i32: // Use fp unit for int shuffle.
+ case MVT::v4f32: return X86ISD::SHUFPS;
+ case MVT::v4i64: // Use fp unit for int shuffle.
+ case MVT::v4f64:
+ case MVT::v2i64: // Use fp unit for int shuffle.
+ case MVT::v2f64: return X86ISD::SHUFPD;
+ default:
+ llvm_unreachable("Unknown type for shufp*");
+ }
+ return 0;
+}
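+// The integer types are mapped to SHUFPS/SHUFPD as well because a shuffle only
+// rearranges lanes and copies the bits untouched, at the cost of a possible
+// int/fp domain-crossing penalty on some implementations.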
+
static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
@@ -5602,7 +6354,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
CanFoldLoad = false;
if (CanFoldLoad) {
- if (HasSSE2 && NumElems == 2)
+ if (HasXMMInt && NumElems == 2)
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
@@ -5616,28 +6368,30 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
// this is horrible, but will stay like this until we move all shuffle
// matching to x86 specific nodes. Note that for the 1st condition all
// types are matched with movsd.
- if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
- return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
- else if (HasSSE2)
+ if (HasXMMInt) {
+ // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+ // so as to remove this logic from here as much as possible
+ if (NumElems == 2 || !X86::isMOVLMask(SVOp))
+ return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
-
+ }
assert(VT != MVT::v4i32 && "unsupported shuffle type");
// Invert the operand order and use SHUFPS to match it.
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1,
X86::getShuffleSHUFImmediate(SVOp), DAG);
}
-static inline unsigned getUNPCKLOpcode(EVT VT, const X86Subtarget *Subtarget) {
+static inline unsigned getUNPCKLOpcode(EVT VT) {
switch(VT.getSimpleVT().SimpleTy) {
case MVT::v4i32: return X86ISD::PUNPCKLDQ;
case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
- case MVT::v4f32:
- return Subtarget->hasAVX() ? X86ISD::VUNPCKLPS : X86ISD::UNPCKLPS;
- case MVT::v2f64:
- return Subtarget->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD;
+ case MVT::v4f32: return X86ISD::UNPCKLPS;
+ case MVT::v2f64: return X86ISD::UNPCKLPD;
+ case MVT::v8i32: // Use fp unit for int unpack.
case MVT::v8f32: return X86ISD::VUNPCKLPSY;
+ case MVT::v4i64: // Use fp unit for int unpack.
case MVT::v4f64: return X86ISD::VUNPCKLPDY;
case MVT::v16i8: return X86ISD::PUNPCKLBW;
case MVT::v8i16: return X86ISD::PUNPCKLWD;
@@ -5653,6 +6407,10 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
case MVT::v4f32: return X86ISD::UNPCKHPS;
case MVT::v2f64: return X86ISD::UNPCKHPD;
+ case MVT::v8i32: // Use fp unit for int unpack.
+ case MVT::v8f32: return X86ISD::VUNPCKHPSY;
+ case MVT::v4i64: // Use fp unit for int unpack.
+ case MVT::v4f64: return X86ISD::VUNPCKHPDY;
case MVT::v16i8: return X86ISD::PUNPCKHBW;
case MVT::v8i16: return X86ISD::PUNPCKHWD;
default:
@@ -5661,6 +6419,68 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
return 0;
}
+static inline unsigned getVPERMILOpcode(EVT VT) {
+ switch(VT.getSimpleVT().SimpleTy) {
+ case MVT::v4i32:
+ case MVT::v4f32: return X86ISD::VPERMILPS;
+ case MVT::v2i64:
+ case MVT::v2f64: return X86ISD::VPERMILPD;
+ case MVT::v8i32:
+ case MVT::v8f32: return X86ISD::VPERMILPSY;
+ case MVT::v4i64:
+ case MVT::v4f64: return X86ISD::VPERMILPDY;
+ default:
+ llvm_unreachable("Unknown type for vpermil");
+ }
+ return 0;
+}
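+// The VPERMILPS immediate encodes one 2-bit selector per element and the same
+// 8-bit immediate is applied to both 128-bit lanes of the 256-bit form, e.g.
+// 0xE4 (binary 11 10 01 00) selects <0,1,2,3> within each lane, the identity.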
+
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
+/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
+static bool isVectorBroadcast(SDValue &Op) {
+ EVT VT = Op.getValueType();
+ bool Is256 = VT.getSizeInBits() == 256;
+
+ assert((VT.getSizeInBits() == 128 || Is256) &&
+ "Unsupported type for vbroadcast node");
+
+ SDValue V = Op;
+ if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ if (Is256 && !(V.hasOneUse() &&
+ V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(0).getOpcode() == ISD::UNDEF))
+ return false;
+
+ if (Is256)
+ V = V.getOperand(1);
+
+ if (!V.hasOneUse())
+ return false;
+
+ // Check the source scalar_to_vector type. 256-bit broadcasts are
+ // supported for 32/64-bit sizes, while 128-bit ones are only supported
+ // for 32-bit scalars.
+ if (V.getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+
+ unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
+ if (ScalarSize != 32 && ScalarSize != 64)
+ return false;
+ if (!Is256 && ScalarSize == 64)
+ return false;
+
+ V = V.getOperand(0);
+ if (!MayFoldLoad(V))
+ return false;
+
+ // Return the load node
+ Op = V;
+ return true;
+}
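+// The 256-bit pattern matched above is, roughly,
+//   bitcast (insert_subvector undef, (scalar_to_vector (load addr)), idx)
+// i.e. a 32/64-bit scalar loaded from memory and spread into one half of the
+// vector, which VBROADCASTSS/VBROADCASTSD can materialize directly.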
+
static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI,
@@ -5672,23 +6492,29 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
SDValue V2 = Op.getOperand(1);
if (isZeroShuffle(SVOp))
- return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
// Handle splat operations
if (SVOp->isSplat()) {
- // Special case, this is the only place now where it's
- // allowed to return a vector_shuffle operation without
- // using a target specific node, because *hopefully* it
- // will be optimized away by the dag combiner.
- if (VT.getVectorNumElements() <= 4 &&
- CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
+ unsigned NumElem = VT.getVectorNumElements();
+ int Size = VT.getSizeInBits();
+ // Special case, this is the only place now where it's allowed to return
+ // a vector_shuffle operation without using a target specific node, because
+ // *hopefully* it will be optimized away by the dag combiner. FIXME: should
+ // this be moved to DAGCombine instead?
+ if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
return Op;
- // Handle splats by matching through known masks
- if (VT.getVectorNumElements() <= 4)
+ // Use vbroadcast whenever the splat comes from a foldable load
+ if (Subtarget->hasAVX() && isVectorBroadcast(V1))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
+
+ // Handle splats by matching through known shuffle masks
+ if ((Size == 128 && NumElem <= 4) ||
+ (Size == 256 && NumElem < 8))
return SDValue();
- // Canonicalize all of the remaining to v4f32.
+ // All remaining splats are promoted to target-supported vector shuffles.
return PromoteSplat(SVOp, DAG);
}
@@ -5698,7 +6524,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
- } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ } else if ((VT == MVT::v4i32 ||
+ (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) {
// FIXME: Figure out a cleaner way to do this.
// Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
@@ -5731,9 +6558,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
bool V1IsSplat = false;
bool V2IsSplat = false;
- bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
- bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
- bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
+ bool HasXMMInt = Subtarget->hasXMMInt();
MachineFunction &MF = DAG.getMachineFunction();
bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
@@ -5765,21 +6590,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
// unpckh_undef). Only use pshufd if speed is more important than size.
if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
- if (VT != MVT::v2i64 && VT != MVT::v2f64)
- return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
- if (VT != MVT::v2i64 && VT != MVT::v2f64)
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
- if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
- RelaxedMayFoldVectorLoad(V1))
+ if (X86::isMOVDDUPMask(SVOp) &&
+ (Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ V2IsUndef && RelaxedMayFoldVectorLoad(V1))
return getMOVDDup(Op, dl, V1, DAG);
if (X86::isMOVHLPS_v_undef_Mask(SVOp))
return getMOVHighToLow(Op, dl, DAG);
// Use to match splats
- if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
+ if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
(VT == MVT::v2f64 || VT == MVT::v2i64))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
@@ -5792,24 +6616,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
- if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+ if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32))
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
- if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
- TargetMask, DAG);
-
- if (VT == MVT::v4f32)
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
- TargetMask, DAG);
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1,
+ TargetMask, DAG);
}
// Check if this can be converted into a logical shift.
bool isLeft = false;
unsigned ShAmt = 0;
SDValue ShVal;
- bool isShift = getSubtarget()->hasSSE2() &&
- isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ bool isShift = getSubtarget()->hasXMMInt() &&
+ isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
if (isShift && ShVal.hasOneUse()) {
// If the shifted value has multiple uses, it may be cheaper to use
// v_set0 + movlhps or movhlps, etc.
@@ -5824,7 +6643,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (ISD::isBuildVectorAllZeros(V1.getNode()))
return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
if (!X86::isMOVLPMask(SVOp)) {
- if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+ if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64))
return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
if (VT == MVT::v4i32 || VT == MVT::v4f32)
@@ -5834,19 +6653,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// FIXME: fold these into legal mask.
if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
- return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+ return getMOVLowToHigh(Op, dl, DAG, HasXMMInt);
if (X86::isMOVHLPSMask(SVOp))
return getMOVHighToLow(Op, dl, DAG);
- if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
+ if (X86::isMOVSHDUPMask(SVOp, Subtarget))
return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
- if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
+ if (X86::isMOVSLDUPMask(SVOp, Subtarget))
return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
if (X86::isMOVLPMask(SVOp))
- return getMOVLP(Op, dl, DAG, HasSSE2);
+ return getMOVLP(Op, dl, DAG, HasXMMInt);
if (ShouldXformToMOVHLPS(SVOp) ||
ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
@@ -5887,8 +6706,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
}
if (X86::isUNPCKLMask(SVOp))
- return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
- dl, VT, V1, V2, DAG);
+ return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
if (X86::isUNPCKHMask(SVOp))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
@@ -5915,8 +6733,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
if (X86::isUNPCKLMask(NewSVOp))
- return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
- dl, VT, V2, V1, DAG);
+ return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
if (X86::isUNPCKHMask(NewSVOp))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
@@ -5932,18 +6749,15 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
SmallVector<int, 16> M;
SVOp->getMask(M);
- if (isPALIGNRMask(M, VT, HasSSSE3))
+ if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()))
return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
X86::getShufflePALIGNRImmediate(SVOp),
DAG);
if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
SVOp->getSplatIndex() == 0 && V2IsUndef) {
- if (VT == MVT::v2f64) {
- X86ISD::NodeType Opcode =
- getSubtarget()->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD;
- return getTargetShuffleNode(Opcode, dl, VT, V1, V1, DAG);
- }
+ if (VT == MVT::v2f64)
+ return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG);
if (VT == MVT::v2i64)
return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG);
}
@@ -5958,23 +6772,54 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
X86::getShufflePSHUFLWImmediate(SVOp),
DAG);
- if (isSHUFPMask(M, VT)) {
- unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
- if (VT == MVT::v4f32 || VT == MVT::v4i32)
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2,
- TargetMask, DAG);
- if (VT == MVT::v2f64 || VT == MVT::v2i64)
- return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2,
- TargetMask, DAG);
- }
+ if (isSHUFPMask(M, VT))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ X86::getShuffleSHUFImmediate(SVOp), DAG);
if (X86::isUNPCKL_v_undef_Mask(SVOp))
- if (VT != MVT::v2i64 && VT != MVT::v2f64)
- return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
- dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
if (X86::isUNPCKH_v_undef_Mask(SVOp))
- if (VT != MVT::v2i64 && VT != MVT::v2f64)
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+
+ //===--------------------------------------------------------------------===//
+ // Generate target specific nodes for 128 or 256-bit shuffles only
+ // supported in the AVX instruction set.
+ //
+
+ // Handle VMOVDDUPY permutations
+ if (isMOVDDUPYMask(SVOp, Subtarget))
+ return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+
+ // Handle VPERMILPS* permutations
+ if (isVPERMILPSMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+ getShuffleVPERMILPSImmediate(SVOp), DAG);
+
+ // Handle VPERMILPD* permutations
+ if (isVPERMILPDMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+ getShuffleVPERMILPDImmediate(SVOp), DAG);
+
+ // Handle VPERM2F128 permutations
+ if (isVPERM2F128Mask(M, VT, Subtarget))
+ return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
+ getShuffleVPERM2F128Immediate(SVOp), DAG);
+
+ // Handle VSHUFPSY permutations
+ if (isVSHUFPSYMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ getShuffleVSHUFPSYImmediate(SVOp), DAG);
+
+ // Handle VSHUFPDY permutations
+ if (isVSHUFPDYMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ getShuffleVSHUFPDYImmediate(SVOp), DAG);
+
+ //===--------------------------------------------------------------------===//
+ // Since no target specific shuffle was selected for this generic one,
+ // lower it into other known shuffles. FIXME: this isn't true yet, but
+ // this is the plan.
+ //
// Handle v8i16 specifically since SSE can do byte extraction and insertion.
if (VT == MVT::v8i16) {
@@ -5989,9 +6834,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return NewOp;
}
- // Handle all 4 wide cases with a number of shuffles.
- if (NumElems == 4)
- return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+ // Handle all 128-bit wide vectors with 4 elements, and match them with
+ // several different shuffle types.
+ if (NumElems == 4 && VT.getSizeInBits() == 128)
+ return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+
+ // Handle general 256-bit shuffles
+ if (VT.is256BitVector())
+ return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
return SDValue();
}
@@ -6001,6 +6851,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
+
+ if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
+ return SDValue();
+
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
@@ -6060,36 +6914,26 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
EVT VecVT = Vec.getValueType();
- // If this is a 256-bit vector result, first extract the 128-bit
- // vector and then extract from the 128-bit vector.
- if (VecVT.getSizeInBits() > 128) {
+ // If this is a 256-bit vector result, first extract the 128-bit vector and
+ // then extract the element from the 128-bit vector.
+ if (VecVT.getSizeInBits() == 256) {
DebugLoc dl = Op.getNode()->getDebugLoc();
unsigned NumElems = VecVT.getVectorNumElements();
SDValue Idx = Op.getOperand(1);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
-
- unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
- bool Upper = IdxVal >= ExtractNumElems;
- Vec = Extract128BitVector(Vec, Idx, DAG, dl);
-
- // Extract from it.
- SDValue ScaledIdx = Idx;
- if (Upper)
- ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx,
- DAG.getConstant(ExtractNumElems,
- Idx.getValueType()));
+ bool Upper = IdxVal >= NumElems/2;
+ Vec = Extract128BitVector(Vec,
+ DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
+
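+ // E.g. extracting element 5 of a v8i32 takes the upper 128-bit half and
+ // then extracts element 5 - 4 = 1 from it.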
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- ScaledIdx);
+ Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
}
assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
- if (Subtarget->hasSSE41()) {
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
if (Res.getNode())
return Res;
@@ -6120,7 +6964,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return Op;
// SHUFPS the element to the lowest double word, then movss.
- int Mask[4] = { Idx, -1, -1, -1 };
+ int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
EVT VVT = Op.getOperand(0).getValueType();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
@@ -6159,6 +7003,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
+ if (VT.getSizeInBits() == 256)
+ return SDValue();
+
if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
isa<ConstantSDNode>(N2)) {
unsigned Opc;
@@ -6206,35 +7053,28 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- // If this is a 256-bit vector result, first insert into a 128-bit
- // vector and then insert into the 256-bit vector.
- if (VT.getSizeInBits() > 128) {
+ // If this is a 256-bit vector result, first extract the 128-bit vector,
+ // insert the element into the extracted half and then place it back.
+ if (VT.getSizeInBits() == 256) {
if (!isa<ConstantSDNode>(N2))
return SDValue();
- // Get the 128-bit vector.
+ // Get the desired 128-bit vector half.
unsigned NumElems = VT.getVectorNumElements();
unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
- bool Upper = IdxVal >= NumElems / 2;
-
- SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl);
+ bool Upper = IdxVal >= NumElems/2;
+ SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
+ SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
- // Insert into it.
- SDValue ScaledN2 = N2;
- if (Upper)
- ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2,
- DAG.getConstant(NumElems /
- (VT.getSizeInBits() / 128),
- N2.getValueType()));
- Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0,
- N1, ScaledN2);
+ // Insert the element into the desired half.
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
+ N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
- // Insert the 128-bit vector
- // FIXME: Why UNDEF?
- return Insert128BitVector(N0, Op, N2, DAG, dl);
+ // Insert the changed part back to the 256-bit vector
+ return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
}
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX())
return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
if (EltVT == MVT::i8)
@@ -6405,12 +7245,17 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel))
+ (M == CodeModel::Small || M == CodeModel::Kernel)) {
+ if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+ OpFlag = X86II::MO_GOTPCREL;
WrapperKind = X86ISD::WrapperRIP;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleStubPIC())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOT;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+ } else if (Subtarget->isPICStyleStubNoDynamic()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY;
+ }
SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
@@ -6427,6 +7272,12 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
Result);
}
+ // For symbols that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlag))
+ Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, 0);
+
return Result;
}
@@ -6676,7 +7527,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+ return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+ Chain.getValue(1));
}
assert(false &&
@@ -6922,9 +7774,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
// Load the 32-bit value into an XMM register.
SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
- DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Op.getOperand(0),
- DAG.getIntPtrConstant(0)));
+ Op.getOperand(0));
+
+ // Zero out the upper parts of the register.
+ Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(),
+ DAG);
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
@@ -7513,6 +8367,9 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
+
assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -7563,6 +8420,39 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}
+// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
+ "Unsupported value type for operation");
+
+ int NumElems = VT.getVectorNumElements();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC = Op.getOperand(2);
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+ // Issue the operation on the smaller types and concatenate the result back
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
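+// AVX alone has no 256-bit integer compares, so e.g. a v8i32 setcc becomes two
+// v4i32 compares (pcmpeqd/pcmpgtd) whose results are concatenated back with a
+// vinsertf128.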
+
+
SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond;
SDValue Op0 = Op.getOperand(0);
@@ -7575,11 +8465,21 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
if (isFP) {
unsigned SSECC = 8;
- EVT VT0 = Op0.getValueType();
- assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
- unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+ EVT EltVT = Op0.getValueType().getVectorElementType();
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+
+ unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
bool Swap = false;
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
switch (SetCCOpcode) {
default: break;
case ISD::SETOEQ:
@@ -7624,6 +8524,10 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
}
+ // Break 256-bit integer vector compare into smaller ones.
+ if (!isFP && VT.getSizeInBits() == 256)
+ return Lower256IntVSETCC(Op, DAG);
+
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
@@ -7654,6 +8558,13 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Swap)
std::swap(Op0, Op1);
+ // Check that the operation in question is available (most are plain SSE2,
+ // but PCMPGTQ and PCMPEQQ have different requirements).
+ if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX())
+ return SDValue();
+ if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX())
+ return SDValue();
+
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
@@ -8014,9 +8925,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) &&
- "This should be used only on Windows targets");
- assert(!Subtarget->isTargetEnvMacho());
+ assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
+ EnableSegmentedStacks) &&
+ "This should be used only on Windows targets or when segmented stacks "
+ "are being used");
+ assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
DebugLoc dl = Op.getDebugLoc();
// Get the inputs.
@@ -8024,23 +8937,49 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Size = Op.getOperand(1);
// FIXME: Ensure alignment here
- SDValue Flag;
+ bool Is64Bit = Subtarget->is64Bit();
+ EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
- EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
- unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+ if (EnableSegmentedStacks) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
- Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
- Flag = Chain.getValue(1);
+ if (Is64Bit) {
+ // The 64 bit implementation of segmented stacks needs to clobber both r10
+ // and r11. This makes it impossible to use it along with nested parameters.
+ const Function *F = MF.getFunction();
+
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; I++)
+ if (I->hasNestAttr())
+ report_fatal_error("Cannot use segmented stacks with functions that "
+ "have nested arguments.");
+ }
+
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+ unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ SDValue Ops1[2] = { Value, Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+ } else {
+ SDValue Flag;
+ unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
- SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+ Flag = Chain.getValue(1);
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- Flag = Chain.getValue(1);
+ Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+ Flag = Chain.getValue(1);
- Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+ Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
- SDValue Ops1[2] = { Chain.getValue(0), Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ SDValue Ops1[2] = { Chain.getValue(0), Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+ }
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -8118,7 +9057,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
EVT ArgVT = Op.getNode()->getValueType(0);
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
uint8_t ArgMode;
@@ -8292,6 +9231,19 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ // Arithmetic intrinsics.
+ case Intrinsic::x86_sse3_hadd_ps:
+ case Intrinsic::x86_sse3_hadd_pd:
+ case Intrinsic::x86_avx_hadd_ps_256:
+ case Intrinsic::x86_avx_hadd_pd_256:
+ return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse3_hsub_ps:
+ case Intrinsic::x86_sse3_hsub_pd:
+ case Intrinsic::x86_avx_hsub_ps_256:
+ case Intrinsic::x86_avx_hsub_pd_256:
+ return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
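+ // The horizontal forms operate pairwise within each source, e.g. haddps on
+ // 128-bit operands a and b computes <a0+a1, a2+a3, b0+b1, b2+b3>.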
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
@@ -8535,8 +9487,13 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}
-SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
+ return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
@@ -8552,8 +9509,8 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
- const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
- const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
+ const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
+ const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
@@ -8600,9 +9557,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
MachinePointerInfo(TrmpAddr, 22),
false, false, 0);
- SDValue Ops[] =
- { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
@@ -8619,7 +9574,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
- const FunctionType *FTy = Func->getFunctionType();
+ FunctionType *FTy = Func->getFunctionType();
const AttrListPtr &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
@@ -8657,7 +9612,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
- const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
+ const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
OutChains[0] = DAG.getStore(Root, dl,
DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr),
@@ -8682,9 +9637,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
MachinePointerInfo(TrmpAddr, 6),
false, false, 1);
- SDValue Ops[] =
- { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
}
}
@@ -8822,8 +9775,58 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the result back.
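+// For example (sketch): a v8i32 ADD becomes
+//   lo = add v4i32 <a0,a1,a2,a3>, <b0,b1,b2,b3>
+//   hi = add v4i32 <a4,a5,a6,a7>, <b4,b5,b6,b7>
+//   result = concat_vectors lo, hi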
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+ "Unsupported value type for operation");
+
+ int NumElems = VT.getVectorNumElements();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType().getSizeInBits() == 256 &&
+ Op.getValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType().getSizeInBits() == 256 &&
+ Op.getValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.getSizeInBits() == 256)
+ return Lower256IntArith(Op, DAG);
+
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
DebugLoc dl = Op.getDebugLoc();
@@ -8872,11 +9875,51 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
-
LLVMContext *Context = DAG.getContext();
- // Must have SSE2.
- if (!Subtarget->hasSSE2()) return SDValue();
+ if (!Subtarget->hasXMMInt())
+ return SDValue();
+
+ // Decompose 256-bit shifts into smaller 128-bit shifts.
+ if (VT.getSizeInBits() == 256) {
+ int NumElems = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ // Extract the two vectors
+ SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+
+ // Recreate the shift amount vectors
+ SDValue Amt1, Amt2;
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ // Constant shift amount
+ SmallVector<SDValue, 4> Amt1Csts;
+ SmallVector<SDValue, 4> Amt2Csts;
+ for (int i = 0; i < NumElems/2; ++i)
+ Amt1Csts.push_back(Amt->getOperand(i));
+ for (int i = NumElems/2; i < NumElems; ++i)
+ Amt2Csts.push_back(Amt->getOperand(i));
+
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt1Csts[0], NumElems/2);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt2Csts[0], NumElems/2);
+ } else {
+ // Variable shift amount
+ Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
+ // Issue new vector shifts for the smaller types
+ V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+ V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+ // Concatenate the result back
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+ }
// Optimize shl/srl/sra with constant shift amount.
if (isSplatVector(Amt.getNode())) {
@@ -8927,9 +9970,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
}
// Lower SHL with variable shift amount.
- // Cannot lower SHL without SSE2 or later.
- if (!Subtarget->hasSSE2()) return SDValue();
-
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
@@ -8971,7 +10011,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
DAG.getConstant(4, MVT::i32));
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
@@ -8986,13 +10026,13 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
DAG.getConstant(2, MVT::i32));
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
// return pblendv(r, r+r, a);
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT,
- R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op,
+ R, DAG.getNode(ISD::ADD, dl, VT, R, R));
return R;
}
return SDValue();
@@ -9057,8 +10097,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(X86::COND_O, MVT::i32),
SDValue(Sum.getNode(), 2));
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
- return Sum;
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}
@@ -9071,8 +10110,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(Cond, MVT::i32),
SDValue(Sum.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
- return Sum;
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{
@@ -9080,8 +10118,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
SDNode* Node = Op.getNode();
EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
EVT VT = Node->getValueType(0);
-
- if (Subtarget->hasSSE2() && VT.isVector()) {
+ if (Subtarget->hasXMMInt() && VT.isVector()) {
unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
ExtraVT.getScalarType().getSizeInBits();
SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
@@ -9091,11 +10128,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
switch (VT.getSimpleVT().SimpleTy) {
default:
return SDValue();
- case MVT::v2i64: {
- SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q;
- SRAIntrinsicsID = 0;
- break;
- }
case MVT::v4i32: {
SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
@@ -9115,12 +10147,9 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
// In case of 1 bit sext, no need to shr
if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
- if (SRAIntrinsicsID) {
- Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(SRAIntrinsicsID, MVT::i32),
- Tmp1, ShAmt);
- }
- return Tmp1;
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(SRAIntrinsicsID, MVT::i32),
+ Tmp1, ShAmt);
}
return SDValue();
@@ -9132,7 +10161,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
// Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
// There isn't any reason to disable it if the target processor supports it.
- if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
+ if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) {
SDValue Chain = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, MVT::i32);
SDValue Ops[] = {
@@ -9172,6 +10201,45 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}
+SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+ SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+ // The only fence that needs an instruction is a sequentially-consistent
+ // cross-thread fence.
+ if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
+ // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ // no-sse2). There isn't any reason to disable it if the target processor
+ // supports it.
+ if (Subtarget->hasXMMInt() || Subtarget->is64Bit())
+ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
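+ // Without SSE2 on 32-bit, fall back to a locked no-op read-modify-write on
+ // the stack, "lock orl $0, (%esp)", a well-known full-fence idiom (built
+ // below via X86::OR32mrLocked).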
+ SDValue Chain = Op.getOperand(0);
+ SDValue Zero = DAG.getConstant(0, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(0, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i32), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res =
+ DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
+ array_lengthof(Ops));
+ return SDValue(Res, 0);
+ }
+
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+
SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
EVT T = Op.getValueType();
DebugLoc DL = Op.getDebugLoc();
@@ -9227,7 +10295,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
SelectionDAG &DAG) const {
EVT SrcVT = Op.getOperand(0).getValueType();
EVT DstVT = Op.getValueType();
- assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+ assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() &&
Subtarget->hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
@@ -9255,7 +10323,34 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
Node->getOperand(0),
Node->getOperand(1), negOp,
cast<AtomicSDNode>(Node)->getSrcValue(),
- cast<AtomicSDNode>(Node)->getAlignment());
+ cast<AtomicSDNode>(Node)->getAlignment(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert seq_cst store -> xchg
+ // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+ // FIXME: On 32-bit, store -> fist or movq would be more efficient
+ // (The only way to get a 16-byte store is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
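+ // Sketch of the intended mapping:
+ //   store atomic i32 %v, i32* %p seq_cst --> xchg (implicitly locked)
+ //   store atomic i64 %v, i64* %p on x86-32 --> ATOMIC_SWAP, which in turn
+ //   becomes a cmpxchg8b loop, since the type isn't legal there.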
+ if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ return Swap.getValue(1);
+ }
+ // Other atomic stores have a simple pattern.
+ return Op;
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
@@ -9291,8 +10386,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
default: llvm_unreachable("Should not custom lower this!");
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG);
case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -9318,7 +10415,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
- case ISD::VSETCC: return LowerVSETCC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
@@ -9332,11 +10428,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
- case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
- case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, DAG);
@@ -9352,15 +10449,38 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::ADD: return LowerADD(Op, DAG);
+ case ISD::SUB: return LowerSUB(Op, DAG);
}
}
+static void ReplaceATOMIC_LOAD(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert wide load -> cmpxchg8b/cmpxchg16b
+ // FIXME: On 32-bit, load -> fild or movq would be more efficient
+ // (The only way to get a 16-byte load is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
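+ // Sketch: the load becomes a cmpxchg comparing against 0 and swapping in 0;
+ // whether or not the compare succeeds, the current memory value is returned
+ // (in EDX:EAX for the i64 case), which is all we need here.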
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
+ Node->getOperand(0),
+ Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ Results.push_back(Swap.getValue(0));
+ Results.push_back(Swap.getValue(1));
+}
+
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG, unsigned NewOp) const {
- EVT T = Node->getValueType(0);
DebugLoc dl = Node->getDebugLoc();
- assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+ assert (Node->getValueType(0) == MVT::i64 &&
+ "Only know how to expand i64 atomics");
SDValue Chain = Node->getOperand(0);
SDValue In1 = Node->getOperand(1);
@@ -9423,37 +10543,48 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::ATOMIC_CMP_SWAP: {
EVT T = N->getValueType(0);
- assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
+ assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+ bool Regs64bit = T == MVT::i128;
+ EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
- cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
- DAG.getConstant(0, MVT::i32));
- cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
- DAG.getConstant(1, MVT::i32));
- cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
- cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
- cpInL.getValue(1));
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(0, HalfT));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(1, HalfT));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
- swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
- DAG.getConstant(0, MVT::i32));
- swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
- DAG.getConstant(1, MVT::i32));
- swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
- cpInH.getValue(1));
- swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
- swapInL.getValue(1));
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(0, HalfT));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(1, HalfT));
+ swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ swapInL, cpInH.getValue(1));
+ swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
+ Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, swapInL.getValue(1));
SDValue Ops[] = { swapInH.getValue(0),
N->getOperand(1),
swapInH.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
- SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys,
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
+ X86ISD::LCMPXCHG8_DAG;
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
Ops, 3, T, MMO);
- SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
- MVT::i32, Result.getValue(1));
- SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
- MVT::i32, cpOutL.getValue(2));
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ HalfT, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
Results.push_back(cpOutH.getValue(1));
return;
}
@@ -9478,6 +10609,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_SWAP:
ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
return;
+ case ISD::ATOMIC_LOAD:
+ ReplaceATOMIC_LOAD(N, Results, DAG);
}
}
@@ -9527,11 +10660,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PSIGNB: return "X86ISD::PSIGNB";
case X86ISD::PSIGNW: return "X86ISD::PSIGNW";
case X86ISD::PSIGND: return "X86ISD::PSIGND";
- case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::FHADD: return "X86ISD::FHADD";
+ case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
@@ -9570,6 +10704,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::ANDN: return "X86ISD::ANDN";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
@@ -9596,9 +10731,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MOVSS: return "X86ISD::MOVSS";
case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS";
case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD";
- case X86ISD::VUNPCKLPS: return "X86ISD::VUNPCKLPS";
- case X86ISD::VUNPCKLPD: return "X86ISD::VUNPCKLPD";
- case X86ISD::VUNPCKLPSY: return "X86ISD::VUNPCKLPSY";
case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY";
case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS";
case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD";
@@ -9610,16 +10742,24 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
+ case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS";
+ case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY";
+ case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD";
+ case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY";
+ case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
+ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
+ case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
}
}
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const {
+ Type *Ty) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
Reloc::Model R = getTargetMachine().getRelocationModel();
@@ -9671,7 +10811,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
}
-bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
@@ -9691,7 +10831,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
return true;
}
-bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}
@@ -9715,7 +10855,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
EVT VT) const {
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSizeInBits() == 64)
- return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
+ return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX());
// FIXME: pshufb, blends, shifts.
return (VT.getVectorNumElements() == 2 ||
@@ -9725,7 +10865,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
isPSHUFDMask(M, VT) ||
isPSHUFHWMask(M, VT) ||
isPSHUFLWMask(M, VT) ||
- isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
+ isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) ||
isUNPCKLMask(M, VT) ||
isUNPCKHMask(M, VT) ||
isUNPCKL_v_undef_Mask(M, VT) ||
@@ -10158,7 +11298,9 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
- BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+ BuildMI(*BB, MI, dl,
+ TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
+ MI->getOperand(0).getReg())
.addReg(X86::XMM0);
MI->eraseFromParent();
@@ -10513,6 +11655,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MBB->addSuccessor(EndMBB);
}
+ unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
@@ -10521,7 +11664,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
- BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
+ BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
.addReg(/*IndexReg=*/0)
@@ -10565,17 +11708,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const MachineFunction *MF = BB->getParent();
- const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
- BitVector ReservedRegs = TRI->getReservedRegs(*MF);
-
- for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
- const MachineOperand &MO = MI->getOperand(I);
- if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
- unsigned Reg = MO.getReg();
- if (Reg != X86::EFLAGS) continue;
- copy0MBB->addLiveIn(Reg);
- sinkMBB->addLiveIn(Reg);
+ if (!MI->killsRegister(X86::EFLAGS)) {
+ copy0MBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer the remainder of BB and its successor edges to sinkMBB.
@@ -10611,6 +11746,119 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
}
MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
+ bool Is64Bit) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *MF = BB->getParent();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+ assert(EnableSegmentedStacks);
+
+ unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+
+ // BB:
+ // ... [Till the alloca]
+ // If stacklet is not large enough, jump to mallocMBB
+ //
+ // bumpMBB:
+ // Allocate by subtracting from RSP
+ // Jump to continueMBB
+ //
+ // mallocMBB:
+ // Allocate by call to runtime
+ //
+ // continueMBB:
+ // ...
+ // [rest of original BB]
+ //
+
+ MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+
+ unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI->getOperand(1).getReg(),
+ physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+
+ MachineFunction::iterator MBBIter = BB;
+ ++MBBIter;
+
+ MF->insert(MBBIter, bumpMBB);
+ MF->insert(MBBIter, mallocMBB);
+ MF->insert(MBBIter, continueMBB);
+
+ continueMBB->splice(continueMBB->begin(), BB, llvm::next
+ (MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add code to the main basic block to check if the stack limit has been hit,
+ // and if so, jump to mallocMBB; otherwise fall through to bumpMBB.
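+ // Roughly (sketch, 64-bit case): tmpSP = RSP - size; cmp fs:0x70, tmpSP;
+ // jg mallocMBB, i.e. take the slow path when the stacklet limit kept in
+ // TLS is above the would-be stack pointer.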
+ BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg)
+ .addReg(tmpSPVReg).addReg(sizeVReg);
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(tmpSPVReg);
+ BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+
+ // bumpMBB simply decreases the stack pointer, since we know the current
+ // stacklet has enough space.
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+ .addReg(tmpSPVReg);
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+ .addReg(tmpSPVReg);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+ // Calls into a routine in libgcc to allocate more space from the heap.
+ if (Is64Bit) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI);
+ } else {
+ BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+ .addImm(12);
+ BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space");
+ }
+
+ if (!Is64Bit)
+ BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+ .addImm(16);
+
+ BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+ .addReg(Is64Bit ? X86::RAX : X86::EAX);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+ // Set up the CFG correctly.
+ BB->addSuccessor(bumpMBB);
+ BB->addSuccessor(mallocMBB);
+ mallocMBB->addSuccessor(continueMBB);
+ bumpMBB->addSuccessor(continueMBB);
+
+ // Take care of the PHI nodes.
+ BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(mallocPtrVReg).addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+ // Delete the original pseudo instruction.
+ MI->eraseFromParent();
+
+ // And we're done.
+ return continueMBB;
+}
+
+MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -10718,11 +11966,11 @@ MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
switch (MI->getOpcode()) {
- default: assert(false && "Unexpected instr type to insert");
+ default: assert(0 && "Unexpected instr type to insert");
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
- assert(!"TAILJMP64 would not be touched here.");
+ assert(0 && "TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::TCRETURNmi64:
@@ -10745,6 +11993,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return BB;
case X86::WIN_ALLOCA:
return EmitLoweredWinAlloca(MI, BB);
+ case X86::SEG_ALLOCA_32:
+ return EmitLoweredSegAlloca(MI, BB, false);
+ case X86::SEG_ALLOCA_64:
+ return EmitLoweredSegAlloca(MI, BB, true);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
@@ -10754,6 +12006,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::CMOV_V4F32:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
@@ -11074,6 +12329,33 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
Mask.getBitWidth() - 1);
break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned NumLoBits = 0;
+ switch (IntId) {
+ default: break;
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse2_pmovmskb_128: {
+ // High bits of movmskp{s|d}, pmovmskb are known zero.
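+ // E.g. movmskps only produces a 4-bit mask in its 32-bit result, so bits
+ // [31:4] can be reported as known zero.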
+ switch (IntId) {
+ case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
+ case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
+ case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
+ case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
+ }
+ KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(),
+ Mask.getBitWidth() - NumLoBits);
+ break;
+ }
+ }
+ break;
+ }
}
}
@@ -11102,23 +12384,132 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
-/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
-/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
-/// if the load addresses are consecutive, non-overlapping, and in the right
-/// order.
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of a 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector.
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+ SVOp->getMaskElt(j) >= 0)
+ return false;
+
+ return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of a 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector.
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+ SVOp->getMaskElt(j) >= 0)
+ return false;
+
+ return true;
+}
+
+/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ DebugLoc dl = N->getDebugLoc();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+ V2.getOpcode() == ISD::CONCAT_VECTORS) {
+ //
+ // 0,0,0,...
+ // |
+ // V UNDEF BUILD_VECTOR UNDEF
+ // \ / \ /
+ // CONCAT_VECTOR CONCAT_VECTOR
+ // \ /
+ // \ /
+ // RESULT: V + zero extended
+ //
+ if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
+ V2.getOperand(1).getOpcode() != ISD::UNDEF ||
+ V1.getOperand(1).getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
+ return SDValue();
+
+ // To match the shuffle mask, the first half of the mask should
+ // be exactly the first vector, and all the rest a splat with the
+ // first element of the second one.
+ for (int i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
+ !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+ return SDValue();
+
+ // Emit a zeroed vector and insert the desired subvector on its
+ // first half.
+ SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+ SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Combine some shuffles into subvector extracts and inserts:
+ //
+
+ // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (isShuffleHigh128VectorInsertLow(SVOp)) {
+ SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+ V, DAG.getConstant(0, MVT::i32), DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (isShuffleLow128VectorInsertHigh(SVOp)) {
+ SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+ V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ return SDValue();
+}
+
+/// PerformShuffleCombine - Performs several different shuffle combines.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
- if (VT.getSizeInBits() != 128)
- return SDValue();
-
// Don't create instructions with illegal types after legalize types has run.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
return SDValue();
+ // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+ if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE)
+ return PerformShuffleCombine256(N, DAG, DCI);
+
+ // Only handle 128-bit wide vectors from here on.
+ if (VT.getSizeInBits() != 128)
+ return SDValue();
+
+ // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+ // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+ // consecutive, non-overlapping, and in the right order.
SmallVector<SDValue, 16> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
@@ -11209,7 +12600,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
@@ -11217,14 +12609,16 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Get the LHS/RHS of the select.
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
+ EVT VT = LHS.getValueType();
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
- if (Subtarget->hasSSE2() &&
- (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
- Cond.getOpcode() == ISD::SETCC) {
+ if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+ VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ (Subtarget->hasXMMInt() ||
+ (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
@@ -11267,7 +12661,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS))
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
break;
Opcode = X86ISD::FMAX;
break;
@@ -11680,7 +13074,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
// all elements are shifted by the same amount. We can't do this in legalize
// because a constant vector is typically transformed to a constant pool
// so we have no knowledge of the shift amount.
- if (!Subtarget->hasSSE2())
+ if (!Subtarget->hasXMMInt())
return SDValue();
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
@@ -11796,7 +13190,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
- if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
@@ -11864,6 +13258,36 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ // Match direct AllOnes for 128 and 256-bit vectors
+ if (ISD::isBuildVectorAllOnes(N))
+ return true;
+
+ // Look through a bit convert.
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ // Sometimes the operand may come from an insert_subvector building a 256-bit
+ // all-ones vector.
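+ // e.g. (sketch): insert_subvector (insert_subvector undef, allones128, 0),
+ // allones128, 4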
+ if (VT.getSizeInBits() == 256 &&
+ N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+
+ if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+ ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+ ISD::isBuildVectorAllOnes(V2.getNode()))
+ return true;
+ }
+
+ return false;
+}
+
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@@ -11874,11 +13298,28 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (R.getNode())
return R;
+ EVT VT = N->getValueType(0);
+
+ // Create ANDN instructions
+ if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check LHS for not
+ if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
+ // Check RHS for not
+ if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
+
+ return SDValue();
+ }
+
// Want to form ANDNP nodes:
// 1) In the hopes of then easily combining them with OR and AND nodes
// to form PBLEND/PSIGN.
// 2) To match ANDN packed intrinsics
- EVT VT = N->getValueType(0);
if (VT != MVT::v2i64 && VT != MVT::v4i64)
return SDValue();
@@ -11888,12 +13329,14 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
// Check RHS for vnot
if (N1.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
return SDValue();
@@ -11917,7 +13360,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
// look for psign/blend
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
if (VT == MVT::v2i64) {
// Canonicalize pandn to RHS
if (N0.getOpcode() == X86ISD::ANDNP)
@@ -11990,13 +13433,13 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
}
}
// PBLENDVB only available on SSE 4.1
- if (!Subtarget->hasSSE41())
+ if (!(Subtarget->hasSSE41() || Subtarget->hasAVX()))
return SDValue();
X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
- Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y);
return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
}
}
@@ -12057,24 +13500,211 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ EVT RegVT = Ld->getValueType(0);
+ EVT MemVT = Ld->getMemoryVT();
+ DebugLoc dl = Ld->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+
+ // If this is a vector EXT Load then attempt to optimize it using a
+ // shuffle. We need SSE4 for the shuffles.
+ // TODO: It is possible to support ZExt by zeroing the undef values
+ // during the shuffle phase or after the shuffle.
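+ // E.g. (sketch): an EXTLOAD of <4 x i8> into v4i32 becomes one scalar i32
+ // load, a bitcast to v16i8 and a shuffle <0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u>,
+ // placing each byte in the low byte of its lane; the high bits stay
+ // undefined, which EXTLOAD allows.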
+ if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+ assert(MemVT != RegVT && "Cannot extend to the same type");
+ assert(MemVT.isVector() && "Must load a vector from memory");
+
+ unsigned NumElems = RegVT.getVectorNumElements();
+ unsigned RegSz = RegVT.getSizeInBits();
+ unsigned MemSz = MemVT.getSizeInBits();
+ assert(RegSz > MemSz && "Register size must be greater than the mem size");
+ // All sizes must be a power of two
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
+
+ // Attempt to load the original value using a single load op.
+ // Find a scalar type which is equal to the loaded word size.
+ MVT SclrLoadTy = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+ SclrLoadTy = Tp;
+ break;
+ }
+ }
+
+ // Proceed if a load word is found.
+ if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+
+ EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+ RegSz/SclrLoadTy.getSizeInBits());
+
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ RegSz/MemVT.getScalarType().getSizeInBits());
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->getAlignment());
+
+ // Insert the word loaded into a vector.
+ SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ LoadUnitVecVT, ScalarLoad);
+
+ // Bitcast the loaded value to a vector of the original element type, in
+ // the size of the target vector type.
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+ unsigned SizeRatio = RegSz/MemSz;
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(SlicedVec.getValueType()),
+ ShuffleVec.data());
+
+ // Bitcast to the requested type.
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+ // Replace the original load with the new sequence
+ // and return the new chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
+ return SDValue(ScalarLoad.getNode(), 1);
+ }
+
+ return SDValue();
+}
+
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ EVT VT = St->getValue().getValueType();
+ EVT StVT = St->getMemoryVT();
+ DebugLoc dl = St->getDebugLoc();
+ SDValue StoredVal = St->getOperand(1);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // If we are saving a concatenation of two XMM registers, perform two stores.
+ // This is better on Sandy Bridge because one 256-bit mem op is done via two
+ // 128-bit ones. If in the future the cost becomes only one memory access, the
+ // first version would be better.
+ if (VT.getSizeInBits() == 256 &&
+ StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+ StoredVal.getNumOperands() == 2) {
+
+ SDValue Value0 = StoredVal.getOperand(0);
+ SDValue Value1 = StoredVal.getOperand(1);
+
+ SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+ SDValue Ptr0 = St->getBasePtr();
+ SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+ SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ }
+
+ // Optimize trunc store (of multiple scalars) to shuffle and store.
+ // First, pack all of the elements in one place. Next, store to memory
+ // in fewer chunks.
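+ // E.g. (sketch, where i64 is legal): a truncating store of v8i16 as v8i8
+ // becomes a v16i8 shuffle <0,2,4,6,8,10,12,14,u,...> that packs the low
+ // bytes together, followed by a single i64 store of the packed half.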
+ if (St->isTruncatingStore() && VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+ // From/To sizes and ElemCount must be powers of two
+ if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+ unsigned SizeRatio = FromSz / ToSz;
+
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type
+ if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVec.getValueType()),
+ ShuffleVec.data());
+ // At this point all of the data is stored at the bottom of the
+ // register. We now need to save it to mem.
+
+ // Find the largest store unit
+ MVT StoreType = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+ StoreType = Tp;
+ }
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Ptr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ StoreType, ShuffWide,
+ DAG.getIntPtrConstant(i));
+ SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ Chains.push_back(Ch);
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
+ }
+
+
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
- StoreSDNode *St = cast<StoreSDNode>(N);
- EVT VT = St->getValue().getValueType();
if (VT.getSizeInBits() != 64)
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
- && Subtarget->hasSSE2();
+ && Subtarget->hasXMMInt();
if ((VT.isVector() ||
(VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
@@ -12172,6 +13802,150 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS. A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector. For example, if
+/// A = < float a0, float a1, float a2, float a3 >
+/// and
+/// B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
+ // Look for the following pattern: if
+ // A = < float a0, float a1, float a2, float a3 >
+ // B = < float b0, float b1, float b2, float b3 >
+ // and
+ // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+ // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+ // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+ // which is A horizontal-op B.
+
+ // At least one of the operands should be a vector shuffle.
+ if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+ RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ EVT VT = LHS.getValueType();
+ unsigned N = VT.getVectorNumElements();
+
+ // View LHS in the form
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // If LHS is not a shuffle then pretend it is the shuffle
+ // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+ // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+ // type VT.
+ SDValue A, B;
+ SmallVector<int, 8> LMask(N);
+ if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ A = LHS.getOperand(0);
+ if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ B = LHS.getOperand(1);
+ cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+ } else {
+ if (LHS.getOpcode() != ISD::UNDEF)
+ A = LHS;
+ for (unsigned i = 0; i != N; ++i)
+ LMask[i] = i;
+ }
+
+ // Likewise, view RHS in the form
+ // RHS = VECTOR_SHUFFLE C, D, RMask
+ SDValue C, D;
+ SmallVector<int, 8> RMask(N);
+ if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ C = RHS.getOperand(0);
+ if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ D = RHS.getOperand(1);
+ cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+ } else {
+ if (RHS.getOpcode() != ISD::UNDEF)
+ C = RHS;
+ for (unsigned i = 0; i != N; ++i)
+ RMask[i] = i;
+ }
+
+ // Check that the shuffles are both shuffling the same vectors.
+ if (!(A == C && B == D) && !(A == D && B == C))
+ return false;
+
+ // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+ if (!A.getNode() && !B.getNode())
+ return false;
+
+ // If A and B occur in reverse order in RHS, then "swap" them (which means
+ // rewriting the mask).
+ if (A != C)
+ for (unsigned i = 0; i != N; ++i) {
+ unsigned Idx = RMask[i];
+ if (Idx < N)
+ RMask[i] += N;
+ else if (Idx < 2*N)
+ RMask[i] -= N;
+ }
+
+ // At this point LHS and RHS are equivalent to
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // RHS = VECTOR_SHUFFLE A, B, RMask
+ // Check that the masks correspond to performing a horizontal operation.
+ for (unsigned i = 0; i != N; ++i) {
+ unsigned LIdx = LMask[i], RIdx = RMask[i];
+
+ // Ignore any UNDEF components.
+ if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N))
+ || (!B.getNode() && (LIdx >= N || RIdx >= N)))
+ continue;
+
+ // Check that successive elements are being operated on. If not, this is
+ // not a horizontal operation.
+ if (!(LIdx == 2*i && RIdx == 2*i + 1) &&
+ !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i))
+ return false;
+ }
+
+ LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+ return true;
+}
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+ isHorizontalBinOp(LHS, RHS, true))
+ return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+ return SDValue();
+}
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal subs from subs of shuffles.
+ if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+ isHorizontalBinOp(LHS, RHS, false))
+ return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+ return SDValue();
+}
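
The two combines above can be pictured with SSE3 intrinsics. This is an illustrative sketch assuming a v4f32 operand and a hypothetical helper name; the combine itself matches generic ISD::FADD/ISD::FSUB over ISD::VECTOR_SHUFFLE nodes, not intrinsic calls.

#include <pmmintrin.h>   // SSE3: _mm_hadd_ps / _mm_hsub_ps

// What X86ISD::FHADD computes for v4f32, built from the shuffle+add
// pattern isHorizontalBinOp recognizes.
__m128 hadd_via_shuffles(__m128 a, __m128 b) {
  __m128 lhs = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); // a0,a2,b0,b2
  __m128 rhs = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); // a1,a3,b1,b3
  return _mm_add_ps(lhs, rhs);        // same result as _mm_hadd_ps(a, b)
}

The FSUB case is identical except that isCommutative is false, so only the exact (even, odd) pairing is accepted and the result corresponds to _mm_hsub_ps.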
+
/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
@@ -12326,7 +14100,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
// (add Y, (setne X, 0)) -> sbb -1, Y
// (sub (sete X, 0), Y) -> sbb 0, Y
// (sub (setne X, 0), Y) -> adc -1, Y
-static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) {
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
DebugLoc DL = N->getDebugLoc();
// Look through ZExts.
@@ -12362,6 +14136,31 @@ static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}
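
The rewrites listed above rest on two's-complement carry identities. A scalar check of the first rule follows; the helper is hypothetical and assumes the carry flag comes from comparing X against 1, so CF == (X == 0).

// (add Y, (setne X, 0)) -> sbb -1, Y:  y + (x != 0) == y - (-1) - (x == 0)
static unsigned add_setne_model(unsigned x, unsigned y) {
  unsigned cf = (x == 0);          // cmp x, 1 borrows exactly when x == 0
  return y - (unsigned)-1 - cf;    // sbb Y, -1  ==  Y + 1 - CF
}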
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // X86 can't encode an immediate LHS of a sub. See if we can push the
+ // negation into a preceding instruction.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+ // If the RHS of the sub is a XOR with one use and a constant, invert the
+ // immediate. Then add one to the LHS of the sub so we can turn
+ // X-Y -> X+~Y+1, saving one register.
+ if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+ isa<ConstantSDNode>(Op1.getOperand(1))) {
+ APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+ EVT VT = Op0.getValueType();
+ SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+ Op1.getOperand(0),
+ DAG.getConstant(~XorC, VT));
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+ DAG.getConstant(C->getAPIntValue()+1, VT));
+ }
+ }
+
+ return OptimizeConditionalInDecrement(N, DAG);
+}
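
A scalar sanity check of the identity used above (illustrative only; the constants are arbitrary stand-ins for C and XorC): since z ^ ~c == ~(z ^ c) and -v == ~v + 1 in two's complement, c0 - (z ^ c1) == (z ^ ~c1) + (c0 + 1).

static unsigned sub_combine_model(unsigned z) {
  const unsigned c0 = 5, c1 = 3;           // stand-ins for C and XorC
  unsigned before = c0 - (z ^ c1);         // original sub
  unsigned after  = (z ^ ~c1) + (c0 + 1);  // what the combine emits
  return before == after;                  // always 1
}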
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12369,10 +14168,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
+ case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
- case ISD::ADD:
- case ISD::SUB: return OptimizeConditonalInDecrement(N, DAG);
+ case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG);
+ case ISD::SUB: return PerformSubCombine(N, DAG);
case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
case ISD::SHL:
@@ -12380,8 +14180,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
+ case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
@@ -12398,14 +14201,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PUNPCKHQDQ:
case X86ISD::UNPCKHPS:
case X86ISD::UNPCKHPD:
+ case X86ISD::VUNPCKHPSY:
+ case X86ISD::VUNPCKHPDY:
case X86ISD::PUNPCKLBW:
case X86ISD::PUNPCKLWD:
case X86ISD::PUNPCKLDQ:
case X86ISD::PUNPCKLQDQ:
case X86ISD::UNPCKLPS:
case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPS:
- case X86ISD::VUNPCKLPD:
case X86ISD::VUNPCKLPSY:
case X86ISD::VUNPCKLPDY:
case X86ISD::MOVHLPS:
@@ -12415,7 +14218,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
- case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
+ case X86ISD::VPERM2F128:
+ case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
}
return SDValue();
@@ -12551,7 +14359,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces[1] == "${0:q}")) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12572,7 +14380,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces[1] == "~{dirflag}" &&
AsmPieces[2] == "~{flags}" &&
AsmPieces[3] == "~{fpsr}") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12603,7 +14411,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces[1] == "~{dirflag}" &&
AsmPieces[2] == "~{flags}" &&
AsmPieces[3] == "~{fpsr}") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12629,7 +14437,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
SplitString(AsmPieces[2], Words, " \t,");
if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
Words[2] == "%edx") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
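
Roughly the kind of source-level inline asm these checks were written for (illustrative; the exact asm template and clobber strings matter, and the "xchgl %eax, %edx" form handles a 64-bit bswap on 32-bit targets):

// With the "=r,0" constraint mentioned above, asm like this can be
// replaced by a call to the llvm.bswap intrinsic of matching width.
static unsigned byteswap32(unsigned x) {
  __asm__("bswap %0" : "=r"(x) : "0"(x));
  return x;
}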
@@ -12700,7 +14508,7 @@ TargetLowering::ConstraintWeight
// but allow it at the lowest weight.
if (CallOperandVal == NULL)
return CW_Default;
- const Type *type = CallOperandVal->getType();
+ Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default: