| field | value |
|---|---|
| author | dim <dim@FreeBSD.org>, 2012-04-14 13:54:10 +0000 |
| committer | dim <dim@FreeBSD.org>, 2012-04-14 13:54:10 +0000 |
| commit | 1fc08f5e9ef733ef1ce6f363fecedc2260e78974 |
| tree | 19c69a04768629f2d440944b71cbe90adae0b615 /lib/Target/X86/X86ISelLowering.cpp |
| parent | 07637c87f826cdf411f0673595e9bc92ebd793f2 |
Vendor import of llvm trunk r154661:
http://llvm.org/svn/llvm-project/llvm/trunk@r154661
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')

| | file | lines changed |
|---|---|---|
| -rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 5371 |

1 file changed, 3109 insertions(+), 2262 deletions(-)
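Most of the mechanical churn in the diff below comes from two API changes in this import: the X86Subtarget feature predicates hasXMM()/hasXMMInt() were renamed to hasSSE1()/hasSSE2(), and codegen flags such as UseSoftFloat, UnsafeFPMath, and GuaranteedTailCallOpt moved from global variables into TargetOptions, reachable through the TargetMachine. A minimal sketch of the new-style queries, assuming the r154661-era headers (the helper function itself is hypothetical; the calls it makes are the ones this diff introduces):

```cpp
#include "X86Subtarget.h"
#include "X86TargetMachine.h"

// Hypothetical helper: decide whether f64 can be handled in SSE registers.
// Post-import style: feature tests via hasSSE2() (formerly hasXMMInt()),
// and -soft-float via TM.Options.UseSoftFloat (formerly a global flag).
static bool useSSE2ForF64(const llvm::X86TargetMachine &TM) {
  const llvm::X86Subtarget &ST = TM.getSubtarget<llvm::X86Subtarget>();
  if (TM.Options.UseSoftFloat)
    return false;
  return ST.hasSSE2();
}
```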
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7c8ce17..9b83aad 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13,9 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "x86-isel"
+#include "X86ISelLowering.h"
 #include "X86.h"
 #include "X86InstrBuilder.h"
-#include "X86ISelLowering.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "Utils/X86ShuffleDecode.h"
@@ -35,25 +35,21 @@
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/VectorExtras.h"
+#include "llvm/ADT/VariadicFunction.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
+#include <bitset>
 
 using namespace llvm;
-using namespace dwarf;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
@@ -61,17 +57,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
 
-static SDValue Insert128BitVector(SDValue Result,
-                                  SDValue Vec,
-                                  SDValue Idx,
-                                  SelectionDAG &DAG,
-                                  DebugLoc dl);
-
-static SDValue Extract128BitVector(SDValue Vec,
-                                   SDValue Idx,
-                                   SelectionDAG &DAG,
-                                   DebugLoc dl);
-
 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
 /// sets things up to match to an AVX VEXTRACTF128 instruction or a
 /// simple subregister reference. Idx is an index in the 128 bits we
@@ -169,8 +154,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   : TargetLowering(TM, createTLOF(TM)) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
-  X86ScalarSSEf64 = Subtarget->hasXMMInt();
-  X86ScalarSSEf32 = Subtarget->hasXMM();
+  X86ScalarSSEf64 = Subtarget->hasSSE2();
+  X86ScalarSSEf32 = Subtarget->hasSSE1();
   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
 
   RegInfo = TM.getRegisterInfo();
@@ -186,8 +171,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // For 64-bit since we have so many registers use the ILP scheduler, for
   // 32-bit code use the register pressure specific scheduling.
+  // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling.
   if (Subtarget->is64Bit())
     setSchedulingPreference(Sched::ILP);
+  else if (Subtarget->isAtom())
+    setSchedulingPreference(Sched::Hybrid);
   else
     setSchedulingPreference(Sched::RegPressure);
   setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -199,15 +187,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setLibcallName(RTLIB::SREM_I64, "_allrem");
     setLibcallName(RTLIB::UREM_I64, "_aullrem");
     setLibcallName(RTLIB::MUL_I64, "_allmul");
-    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
-    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
+
+    // The _ftol2 runtime function has an unusual calling conv, which
+    // is modeled by a special pseudo-instruction.
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
   }
 
   if (Subtarget->isTargetDarwin()) {
@@ -256,8 +247,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
-    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
-  } else if (!UseSoftFloat) {
+    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+  } else if (!TM.Options.UseSoftFloat) {
     // We have an algorithm for SSE2->double, and we turn this into a
     // 64-bit FILD followed by conditional FADD for other targets.
     setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
@@ -271,7 +262,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
   setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
 
-  if (!UseSoftFloat) {
+  if (!TM.Options.UseSoftFloat) {
     // SSE has no i16 to fp conversion, only i32
     if (X86ScalarSSEf32) {
       setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
@@ -314,7 +305,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
     setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
-  } else if (!UseSoftFloat) {
+  } else if (!TM.Options.UseSoftFloat) {
     // Since AVX is a superset of SSE3, only check for SSE here.
     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
       // Expand FP_TO_UINT into a select.
@@ -327,6 +318,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
   }
 
+  if (isTargetFTOL()) {
+    // Use the _ftol2 runtime function, which has a pseudo-instruction
+    // to handle its weird calling convention.
+    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+  }
+
   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
   if (!X86ScalarSSEf64) {
     setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
@@ -379,10 +376,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::FREM , MVT::f80 , Expand);
   setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
 
+  // Promote the i8 variants and force them on up to i32 which has a shorter
+  // encoding.
+  setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+  AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote);
+  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32);
   if (Subtarget->hasBMI()) {
-    setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
   } else {
-    setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
     setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
     setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
     if (Subtarget->is64Bit())
@@ -390,13 +395,27 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   }
 
   if (Subtarget->hasLZCNT()) {
+    // When promoting the i8 variants, force them to i32 for a shorter
+    // encoding.
     setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+    AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote);
+    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
   } else {
     setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
     setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
     setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
-    if (Subtarget->is64Bit())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
+    if (Subtarget->is64Bit()) {
       setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+    }
   }
 
   if (Subtarget->hasPOPCNT()) {
@@ -459,7 +478,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
   }
 
-  if (Subtarget->hasXMM())
+  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
 
   setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
@@ -538,14 +557,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Custom);
-  else if (EnableSegmentedStacks)
+  else if (TM.Options.EnableSegmentedStacks)
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Custom);
   else
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Expand);
 
-  if (!UseSoftFloat && X86ScalarSSEf64) {
+  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
     // f32 and f64 use SSE.
     // Set up the FP register classes.
     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
@@ -577,7 +596,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // cases we handle.
     addLegalFPImmediate(APFloat(+0.0)); // xorpd
     addLegalFPImmediate(APFloat(+0.0f)); // xorps
-  } else if (!UseSoftFloat && X86ScalarSSEf32) {
+  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
     // Use SSE for f32, x87 for f64.
     // Set up the FP register classes.
     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
@@ -606,11 +625,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 
-    if (!UnsafeFPMath) {
+    if (!TM.Options.UnsafeFPMath) {
       setOperationAction(ISD::FSIN , MVT::f64 , Expand);
       setOperationAction(ISD::FCOS , MVT::f64 , Expand);
     }
-  } else if (!UseSoftFloat) {
+  } else if (!TM.Options.UseSoftFloat) {
     // f32 and f64 in x87.
     // Set up the FP register classes.
     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
@@ -621,7 +640,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 
-    if (!UnsafeFPMath) {
+    if (!TM.Options.UnsafeFPMath) {
       setOperationAction(ISD::FSIN , MVT::f64 , Expand);
       setOperationAction(ISD::FCOS , MVT::f64 , Expand);
     }
@@ -640,7 +659,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::FMA, MVT::f32, Expand);
 
   // Long double always uses X87.
-  if (!UseSoftFloat) {
+  if (!TM.Options.UseSoftFloat) {
     addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
     setOperationAction(ISD::UNDEF, MVT::f80, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -659,11 +678,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
       addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
     }
 
-    if (!UnsafeFPMath) {
+    if (!TM.Options.UnsafeFPMath) {
       setOperationAction(ISD::FSIN , MVT::f80 , Expand);
       setOperationAction(ISD::FCOS , MVT::f80 , Expand);
     }
 
+    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+    setOperationAction(ISD::FRINT, MVT::f80, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
     setOperationAction(ISD::FMA, MVT::f80, Expand);
   }
 
@@ -715,7 +739,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
@@ -749,7 +775,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
   // with -msoft-float, disable use of MMX as well.
-  if (!UseSoftFloat && Subtarget->hasMMX()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
     addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
     // No operations on x86mmx supported, everything uses intrinsics.
   }
 
@@ -786,7 +812,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
   setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
 
-  if (!UseSoftFloat && Subtarget->hasXMM()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
     addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
 
     setOperationAction(ISD::FADD, MVT::v4f32, Legal);
@@ -803,7 +829,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
   }
 
-  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
     addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
 
     // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
@@ -909,7 +935,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
       setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
   }
 
-  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
+  if (Subtarget->hasSSE41()) {
     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
@@ -924,10 +950,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL, MVT::v4i32, Legal);
 
-    // Can turn SHL into an integer multiply.
-    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
-    setOperationAction(ISD::SHL, MVT::v16i8, Custom);
-
     setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
     setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
     setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
@@ -948,30 +970,47 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 
+    // FIXME: these should be Legal but thats only for the case where
+    // the index is constant. For now custom expand to deal with that.
     if (Subtarget->is64Bit()) {
-      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
     }
   }
 
-  if (Subtarget->hasXMMInt()) {
-    setOperationAction(ISD::SRL, MVT::v2i64, Custom);
-    setOperationAction(ISD::SRL, MVT::v4i32, Custom);
-    setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+  if (Subtarget->hasSSE2()) {
     setOperationAction(ISD::SRL, MVT::v8i16, Custom);
+    setOperationAction(ISD::SRL, MVT::v16i8, Custom);
 
-    setOperationAction(ISD::SHL, MVT::v2i64, Custom);
-    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
     setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+    setOperationAction(ISD::SHL, MVT::v16i8, Custom);
 
-    setOperationAction(ISD::SRA, MVT::v4i32, Custom);
     setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+    setOperationAction(ISD::SRA, MVT::v16i8, Custom);
+
+    if (Subtarget->hasAVX2()) {
+      setOperationAction(ISD::SRL, MVT::v2i64, Legal);
+      setOperationAction(ISD::SRL, MVT::v4i32, Legal);
+
+      setOperationAction(ISD::SHL, MVT::v2i64, Legal);
+      setOperationAction(ISD::SHL, MVT::v4i32, Legal);
+
+      setOperationAction(ISD::SRA, MVT::v4i32, Legal);
+    } else {
+      setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+      setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+
+      setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+      setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+
+      setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+    }
   }
 
-  if (Subtarget->hasSSE42() || Subtarget->hasAVX())
+  if (Subtarget->hasSSE42())
     setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
 
-  if (!UseSoftFloat && Subtarget->hasAVX()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
     addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
     addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
     addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
@@ -1008,18 +1047,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
 
-    setOperationAction(ISD::SRL, MVT::v4i64, Custom);
-    setOperationAction(ISD::SRL, MVT::v8i32, Custom);
     setOperationAction(ISD::SRL, MVT::v16i16, Custom);
     setOperationAction(ISD::SRL, MVT::v32i8, Custom);
 
-    setOperationAction(ISD::SHL, MVT::v4i64, Custom);
-    setOperationAction(ISD::SHL, MVT::v8i32, Custom);
     setOperationAction(ISD::SHL, MVT::v16i16, Custom);
     setOperationAction(ISD::SHL, MVT::v32i8, Custom);
 
-    setOperationAction(ISD::SRA, MVT::v8i32, Custom);
     setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+    setOperationAction(ISD::SRA, MVT::v32i8, Custom);
 
     setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
     setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
@@ -1030,25 +1065,60 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
 
-    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+
+    if (Subtarget->hasAVX2()) {
+      setOperationAction(ISD::ADD, MVT::v4i64, Legal);
+      setOperationAction(ISD::ADD, MVT::v8i32, Legal);
+      setOperationAction(ISD::ADD, MVT::v16i16, Legal);
+      setOperationAction(ISD::ADD, MVT::v32i8, Legal);
+
+      setOperationAction(ISD::SUB, MVT::v4i64, Legal);
+      setOperationAction(ISD::SUB, MVT::v8i32, Legal);
+      setOperationAction(ISD::SUB, MVT::v16i16, Legal);
+      setOperationAction(ISD::SUB, MVT::v32i8, Legal);
+
+      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+      setOperationAction(ISD::MUL, MVT::v8i32, Legal);
+      setOperationAction(ISD::MUL, MVT::v16i16, Legal);
+      // Don't lower v32i8 because there is no 128-bit byte mul
+
+      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+      setOperationAction(ISD::SRL, MVT::v4i64, Legal);
+      setOperationAction(ISD::SRL, MVT::v8i32, Legal);
+
+      setOperationAction(ISD::SHL, MVT::v4i64, Legal);
+      setOperationAction(ISD::SHL, MVT::v8i32, Legal);
+
+      setOperationAction(ISD::SRA, MVT::v8i32, Legal);
+    } else {
+      setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+      setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+      setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+      setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+      setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+      setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+      setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+      setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+      setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+      setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+      setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+      // Don't lower v32i8 because there is no 128-bit byte mul
 
-    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
-    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
-    setOperationAction(ISD::ADD, MVT::v16i16, Custom);
-    setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+      setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+      setOperationAction(ISD::SRL, MVT::v8i32, Custom);
 
-    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
-    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
-    setOperationAction(ISD::SUB, MVT::v16i16, Custom);
-    setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+      setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+      setOperationAction(ISD::SHL, MVT::v8i32, Custom);
 
-    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
-    setOperationAction(ISD::MUL, MVT::v8i32, Custom);
-    setOperationAction(ISD::MUL, MVT::v16i16, Custom);
-    // Don't lower v32i8 because there is no 128-bit byte mul
+      setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+    }
 
     // Custom lower several nodes for 256-bit types.
     for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
@@ -1099,7 +1169,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   // of this type with custom code.
   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
        VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
-    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+                       Custom);
   }
 
   // We want to custom lower some of our intrinsics.
@@ -1137,7 +1208,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
-  setTargetDAGCombine(ISD::BUILD_VECTOR);
   setTargetDAGCombine(ISD::VSELECT);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SHL);
@@ -1152,9 +1222,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
+  if (Subtarget->hasBMI())
+    setTargetDAGCombine(ISD::XOR);
 
   computeRegisterProperties();
 
@@ -1166,10 +1240,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
-  setPrefLoopAlignment(16);
+  setPrefLoopAlignment(4); // 2^4 bytes.
   benefitFromCodePlacementOpt = true;
 
-  setPrefFunctionAlignment(4);
+  setPrefFunctionAlignment(4); // 2^4 bytes.
 }
 
@@ -1219,7 +1293,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
   }
 
   unsigned Align = 4;
-  if (Subtarget->hasXMM())
+  if (Subtarget->hasSSE1())
     getMaxByValAlign(Ty, Align);
   return Align;
 }
@@ -1230,7 +1304,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
 /// means there isn't a need to check it against alignment requirement,
 /// probably because the source does not need to be loaded. If
-/// 'NonScalarIntSafe' is true, that means it's safe to return a
+/// 'IsZeroVal' is true, that means it's safe to return a
 /// non-scalar-integer type, e.g. empty string source, constant, or loaded
 /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
 /// constant so it does not need to be loaded.
@@ -1239,31 +1313,34 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
 EVT
 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                        unsigned DstAlign, unsigned SrcAlign,
-                                       bool NonScalarIntSafe,
+                                       bool IsZeroVal,
                                        bool MemcpyStrSrc,
                                        MachineFunction &MF) const {
   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
   // linux. This is because the stack realignment code can't handle certain
   // cases like PR2962. This should be removed when PR2962 is fixed.
   const Function *F = MF.getFunction();
-  if (NonScalarIntSafe &&
+  if (IsZeroVal &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
-      if (Subtarget->hasAVX() &&
-          Subtarget->getStackAlignment() >= 32)
-        return MVT::v8f32;
-      if (Subtarget->hasXMMInt())
+      if (Subtarget->getStackAlignment() >= 32) {
+        if (Subtarget->hasAVX2())
+          return MVT::v8i32;
+        if (Subtarget->hasAVX())
+          return MVT::v8f32;
+      }
+      if (Subtarget->hasSSE2())
        return MVT::v4i32;
-      if (Subtarget->hasXMM())
+      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
-              Subtarget->hasXMMInt()) {
+              Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
@@ -1428,14 +1505,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
     // or SSE or MMX vectors.
     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
-        (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
+        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
     // Likewise we can't return F64 values with SSE1 only. gcc does so, but
     // llvm-gcc has never done it right and no one has noticed, so this
     // should be OK for now.
     if (ValVT == MVT::f64 &&
-        (Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
+        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
       report_fatal_error("SSE2 register return with SSE2 disabled");
 
     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
@@ -1461,7 +1538,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                                   ValToCopy);
           // If we don't have SSE2 available, convert to v4f32 so the generated
           // register is legal.
-          if (!Subtarget->hasXMMInt())
+          if (!Subtarget->hasSSE2())
             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
         }
       }
@@ -1501,15 +1578,21 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                      MVT::Other, &RetOps[0], RetOps.size());
 }
 
-bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   if (N->getNumValues() != 1)
     return false;
   if (!N->hasNUsesOfValue(1, 0))
     return false;
 
+  SDValue TCChain = Chain;
   SDNode *Copy = *N->use_begin();
-  if (Copy->getOpcode() != ISD::CopyToReg &&
-      Copy->getOpcode() != ISD::FP_EXTEND)
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
     return false;
 
   bool HasRet = false;
@@ -1520,7 +1603,11 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
       HasRet = true;
     }
 
-  return HasRet;
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
 }
 
 EVT
@@ -1561,7 +1648,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
-        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
 
@@ -1651,7 +1738,7 @@ static bool IsTailCallConvention(CallingConv::ID CC) {
 }
 
 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
-  if (!CI->isTailCall())
+  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
    return false;
 
   CallSite CS(CI);
@@ -1664,7 +1751,8 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
 
 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
 /// a tailcall target by changing its ABI.
-static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
+                                   bool GuaranteedTailCallOpt) {
   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
 }
 
@@ -1678,7 +1766,8 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
                                     unsigned i) const {
   // Create the nodes corresponding to a load from this parameter slot.
   ISD::ArgFlagsTy Flags = Ins[i].Flags;
-  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
+  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
+                              getTargetMachine().Options.GuaranteedTailCallOpt);
   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   EVT ValVT;
@@ -1704,7 +1793,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
     return DAG.getLoad(ValVT, dl, Chain, FIN,
                        MachinePointerInfo::getFixedStack(FI),
-                       false, false, 0);
+                       false, false, false, 0);
   }
 }
 
@@ -1728,6 +1817,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   MachineFrameInfo *MFI = MF.getFrameInfo();
   bool Is64Bit = Subtarget->is64Bit();
+  bool IsWindows = Subtarget->isTargetWindows();
   bool IsWin64 = Subtarget->isTargetWin64();
 
   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
@@ -1758,7 +1848,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
-      TargetRegisterClass *RC = NULL;
+      const TargetRegisterClass *RC;
       if (RegVT == MVT::i32)
         RC = X86::GR32RegisterClass;
       else if (Is64Bit && RegVT == MVT::i64)
@@ -1807,7 +1897,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       // If value is passed via pointer - do a load.
       if (VA.getLocInfo() == CCValAssign::Indirect)
         ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
-                               MachinePointerInfo(), false, false, 0);
+                               MachinePointerInfo(), false, false, false, 0);
 
     InVals.push_back(ArgValue);
   }
@@ -1828,7 +1918,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   unsigned StackSize = CCInfo.getNextStackOffset();
   // Align stack specially for tail calls.
-  if (FuncIsMadeTailCallSafe(CallConv))
+  if (FuncIsMadeTailCallSafe(CallConv,
+                             MF.getTarget().Options.GuaranteedTailCallOpt))
     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
 
   // If the function takes variable number of arguments, make a frame index for
@@ -1842,17 +1933,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
 
     // FIXME: We should really autogenerate these arrays
-    static const unsigned GPR64ArgRegsWin64[] = {
+    static const uint16_t GPR64ArgRegsWin64[] = {
       X86::RCX, X86::RDX, X86::R8, X86::R9
     };
-    static const unsigned GPR64ArgRegs64Bit[] = {
+    static const uint16_t GPR64ArgRegs64Bit[] = {
      X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
    };
-    static const unsigned XMMArgRegs64Bit[] = {
+    static const uint16_t XMMArgRegs64Bit[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
-    const unsigned *GPR64ArgRegs;
+    const uint16_t *GPR64ArgRegs;
    unsigned NumXMMRegs = 0;
 
    if (IsWin64) {
@@ -1865,17 +1956,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
       GPR64ArgRegs = GPR64ArgRegs64Bit;
-      NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs);
+      NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
+                                              TotalNumXMMRegs);
     }
     unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                      TotalNumIntRegs);
 
     bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
-    assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
+    assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");
-    assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
+    assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
+             NoImplicitFloatOps) &&
          "SSE register cannot be used when SSE is disabled!");
-    if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
+    if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+        !Subtarget->hasSSE1())
      // Kernel mode asks for SSE to be disabled, so don't push them
      // on the stack.
      TotalNumXMMRegs = 0;
@@ -1892,8 +1986,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
     } else {
       // For X86-64, if there are vararg parameters that are passed via
-      // registers, then we must store them to their spots on the stack so they
-      // may be loaded by deferencing the result of va_next.
+      // registers, then we must store them to their spots on the stack so
+      // they may be loaded by deferencing the result of va_next.
       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
       FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
       FuncInfo->setRegSaveFrameIndex(
@@ -1953,12 +2047,14 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   }
 
   // Some CCs need callee pop.
-  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) {
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   } else {
     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
     // If this is an sret function, the return should pop the hidden pointer.
-    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
+    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+        ArgsAreStructReturn(Ins))
       FuncInfo->setBytesToPopOnReturn(4);
   }
 
@@ -2006,7 +2102,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
   // Load the "old" Return address.
   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
-                           false, false, 0);
+                           false, false, false, 0);
   return SDValue(OutRetAddr.getNode(), 1);
 }
 
@@ -2033,7 +2129,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
 SDValue
 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                              CallingConv::ID CallConv, bool isVarArg,
-                             bool &isTailCall,
+                             bool doesNotRet, bool &isTailCall,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -2042,9 +2138,13 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   MachineFunction &MF = DAG.getMachineFunction();
   bool Is64Bit = Subtarget->is64Bit();
   bool IsWin64 = Subtarget->isTargetWin64();
+  bool IsWindows = Subtarget->isTargetWindows();
   bool IsStructRet = CallIsStructReturn(Outs);
   bool IsSibcall = false;
 
+  if (MF.getTarget().Options.DisableTailCalls)
+    isTailCall = false;
+
   if (isTailCall) {
     // Check if it's really possible to do a tail call.
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
@@ -2053,7 +2153,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
     // Sibcalls are automatically detected tailcalls which do not require
     // ABI changes.
-    if (!GuaranteedTailCallOpt && isTailCall)
+    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
       IsSibcall = true;
 
     if (isTailCall)
@@ -2081,7 +2181,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // This is a sibcall. The memory operands are available in caller's
     // own caller's stack.
     NumBytes = 0;
-  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
+  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+           IsTailCallConvention(CallConv))
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
   int FPDiff = 0;
@@ -2231,12 +2332,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // registers used and is in the range 0 - 8 inclusive.
 
     // Count the number of XMM registers allocated.
-    static const unsigned XMMArgRegs[] = {
+    static const uint16_t XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
-    assert((Subtarget->hasXMM() || !NumXMMRegs)
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
            && "SSE registers cannot be used when SSE is disabled");
 
     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
@@ -2260,7 +2361,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   int FI = 0;
   // Do not flag preceding copytoreg stuff together with the following stuff.
   InFlag = SDValue();
-  if (GuaranteedTailCallOpt) {
+  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
       CCValAssign &VA = ArgLocs[i];
       if (VA.isRegLoc())
@@ -2368,7 +2469,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       if (ExtraLoad)
         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
                              MachinePointerInfo::getGOT(),
-                             false, false, 0);
+                             false, false, false, 0);
     }
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     unsigned char OpFlags = 0;
@@ -2421,6 +2522,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   if (Is64Bit && isVarArg && !IsWin64)
     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
 
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
@@ -2440,12 +2547,15 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Create the CALLSEQ_END node.
   unsigned NumBytesForCalleeToPush;
-  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt))
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       getTargetMachine().Options.GuaranteedTailCallOpt))
     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
-  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
+  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+           IsStructRet)
     // If this is a call to a struct-return function, the callee
     // pops the hidden struct pointer, so we have to push it back.
     // This is common for Darwin/X86, Linux & Mingw32 targets.
+    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
     NumBytesForCalleeToPush = 4;
   else
     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
@@ -2598,7 +2708,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   CallingConv::ID CallerCC = CallerF->getCallingConv();
   bool CCMatch = CallerCC == CalleeCC;
 
-  if (GuaranteedTailCallOpt) {
+  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
     if (IsTailCallConvention(CalleeCC) && CCMatch)
       return true;
     return false;
@@ -2641,9 +2751,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
       return false;
   }
 
-  // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
-  // Therefore if it's not used by the call it is not safe to optimize this into
-  // a sibcall.
+  // If the call result is in ST0 / ST1, it needs to be popped off the x87
+  // stack. Therefore, if it's not used by the call it is not safe to optimize
+  // this into a sibcall.
   bool Unused = false;
   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
     if (!Ins[i].Used) {
@@ -2785,9 +2895,8 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
-  case X86ISD::SHUFPD:
+  case X86ISD::SHUFP:
   case X86ISD::PALIGN:
-  case X86ISD::SHUFPS:
   case X86ISD::MOVLHPS:
   case X86ISD::MOVLHPD:
   case X86ISD::MOVHLPS:
@@ -2798,34 +2907,16 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::MOVDDUP:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
-  case X86ISD::UNPCKLPS:
-  case X86ISD::UNPCKLPD:
-  case X86ISD::VUNPCKLPSY:
-  case X86ISD::VUNPCKLPDY:
-  case X86ISD::PUNPCKLWD:
-  case X86ISD::PUNPCKLBW:
-  case X86ISD::PUNPCKLDQ:
-  case X86ISD::PUNPCKLQDQ:
-  case X86ISD::UNPCKHPS:
-  case X86ISD::UNPCKHPD:
-  case X86ISD::VUNPCKHPSY:
-  case X86ISD::VUNPCKHPDY:
-  case X86ISD::PUNPCKHWD:
-  case X86ISD::PUNPCKHBW:
-  case X86ISD::PUNPCKHDQ:
-  case X86ISD::PUNPCKHQDQ:
-  case X86ISD::VPERMILPS:
-  case X86ISD::VPERMILPSY:
-  case X86ISD::VPERMILPD:
-  case X86ISD::VPERMILPDY:
-  case X86ISD::VPERM2F128:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERM2X128:
     return true;
   }
-  return false;
 }
 
 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-                                    SDValue V1, SelectionDAG &DAG) {
+                                    SDValue V1, SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
   case X86ISD::MOVSHDUP:
@@ -2833,39 +2924,32 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::MOVDDUP:
     return DAG.getNode(Opc, dl, VT, V1);
   }
-
-  return SDValue();
 }
 
 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-                                    SDValue V1, unsigned TargetMask, SelectionDAG &DAG) {
+                                    SDValue V1, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
-  case X86ISD::VPERMILPS:
-  case X86ISD::VPERMILPSY:
-  case X86ISD::VPERMILPD:
-  case X86ISD::VPERMILPDY:
+  case X86ISD::VPERMILP:
    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
  }
-
-  return SDValue();
 }
 
 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-                                    SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) {
+                                    SDValue V1, SDValue V2, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
   case X86ISD::PALIGN:
-  case X86ISD::SHUFPD:
-  case X86ISD::SHUFPS:
-  case X86ISD::VPERM2F128:
+  case X86ISD::SHUFP:
+  case X86ISD::VPERM2X128:
    return DAG.getNode(Opc, dl, VT, V1, V2,
                       DAG.getConstant(TargetMask, MVT::i8));
  }
-  return SDValue();
}
 
 static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
@@ -2879,25 +2963,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::MOVLPD:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
-  case X86ISD::UNPCKLPS:
-  case X86ISD::UNPCKLPD:
-  case X86ISD::VUNPCKLPSY:
-  case X86ISD::VUNPCKLPDY:
-  case X86ISD::PUNPCKLWD:
-  case X86ISD::PUNPCKLBW:
-  case X86ISD::PUNPCKLDQ:
-  case X86ISD::PUNPCKLQDQ:
-  case X86ISD::UNPCKHPS:
-  case X86ISD::UNPCKHPD:
-  case X86ISD::VUNPCKHPSY:
-  case X86ISD::VUNPCKHPDY:
-  case X86ISD::PUNPCKHWD:
-  case X86ISD::PUNPCKHBW:
-  case X86ISD::PUNPCKHDQ:
-  case X86ISD::PUNPCKHQDQ:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
    return DAG.getNode(Opc, dl, VT, V1, V2);
  }
-  return SDValue();
 }
 
 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
@@ -3092,17 +3161,6 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
   return (Val < 0) || (Val >= Low && Val < Hi);
 }
 
-/// isUndefOrInRange - Return true if every element in Mask, begining
-/// from position Pos and ending in Pos+Size, falls within the specified
-/// range (L, L+Pos]. or is undef.
-static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
-                             int Pos, int Size, int Low, int Hi) {
-  for (int i = Pos, e = Pos+Size; i != e; ++i)
-    if (!isUndefOrInRange(Mask[i], Low, Hi))
-      return false;
-  return true;
-}
-
 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
 /// specified value.
 static bool isUndefOrEqual(int Val, int CmpVal) {
@@ -3114,7 +3172,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
 /// isSequentialOrUndefInRange - Return true if every element in Mask, begining
 /// from position Pos and ending in Pos+Size, falls within the specified
 /// sequential range (L, L+Pos]. or is undef.
-static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                        int Pos, int Size, int Low) {
   for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
     if (!isUndefOrEqual(Mask[i], Low))
@@ -3125,7 +3183,7 @@ static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
 /// the second operand.
-static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -3133,302 +3191,188 @@ static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
   return false;
 }
 
-bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isPSHUFDMask(M, N->getValueType(0));
-}
-
 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
   if (VT != MVT::v8i16)
     return false;
 
   // Lower quadword copied in order or undef.
-  for (int i = 0; i != 4; ++i)
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return false;
+  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
+    return false;
 
   // Upper quadword shuffled.
-  for (int i = 4; i != 8; ++i)
+  for (unsigned i = 4; i != 8; ++i)
     if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
       return false;
 
   return true;
 }
 
-bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isPSHUFHWMask(M, N->getValueType(0));
-}
-
 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
   if (VT != MVT::v8i16)
     return false;
 
   // Upper quadword copied in order.
-  for (int i = 4; i != 8; ++i)
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return false;
+  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
+    return false;
 
   // Lower quadword shuffled.
-  for (int i = 0; i != 4; ++i)
+  for (unsigned i = 0; i != 4; ++i)
     if (Mask[i] >= 4)
       return false;
 
   return true;
 }
 
-bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isPSHUFLWMask(M, N->getValueType(0));
-}
-
 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                          bool hasSSSE3OrAVX) {
-  int i, e = VT.getVectorNumElements();
-  if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
+static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+                          const X86Subtarget *Subtarget) {
+  if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
+      (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
     return false;
 
-  // Do not handle v2i64 / v2f64 shuffles with palignr.
-  if (e < 4 || !hasSSSE3OrAVX)
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  // Do not handle 64-bit element shuffles with palignr.
+  if (NumLaneElts == 2)
     return false;
 
-  for (i = 0; i != e; ++i)
-    if (Mask[i] >= 0)
-      break;
+  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
+    unsigned i;
+    for (i = 0; i != NumLaneElts; ++i) {
+      if (Mask[i+l] >= 0)
+        break;
+    }
 
-  // All undef, not a palignr.
-  if (i == e)
-    return false;
+    // Lane is all undef, go to next lane
+    if (i == NumLaneElts)
+      continue;
 
-  // Make sure we're shifting in the right direction.
-  if (Mask[i] <= i)
-    return false;
+    int Start = Mask[i+l];
 
-  int s = Mask[i] - i;
+    // Make sure its in this lane in one of the sources
+    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
+        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
+      return false;
 
-  // Check the rest of the elements to see if they are consecutive.
-  for (++i; i != e; ++i) {
-    int m = Mask[i];
-    if (m >= 0 && m != s+i)
+    // If not lane 0, then we must match lane 0
+    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
       return false;
-  }
-  return true;
-}
 
-/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// VSHUFPSY.
-static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                           const X86Subtarget *Subtarget) {
-  int NumElems = VT.getVectorNumElements();
+    // Correct second source to be contiguous with first source
+    if (Start >= (int)NumElts)
+      Start -= NumElts - NumLaneElts;
 
-  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
-    return false;
+    // Make sure we're shifting in the right direction.
+    if (Start <= (int)(i+l))
+      return false;
 
-  if (NumElems != 8)
-    return false;
+    Start -= i;
 
-  // VSHUFPSY divides the resulting vector into 4 chunks.
-  // The sources are also splitted into 4 chunks, and each destination
-  // chunk must come from a different source chunk.
-  //
-  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
-  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
-  //
-  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
-  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
-  //
-  int QuarterSize = NumElems/4;
-  int HalfSize = QuarterSize*2;
-  for (int i = 0; i < QuarterSize; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
-      return false;
-  for (int i = QuarterSize; i < QuarterSize*2; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
-      return false;
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != NumLaneElts; ++i) {
+      int Idx = Mask[i+l];
 
-  // The mask of the second half must be the same as the first but with
-  // the appropriate offsets. This works in the same way as VPERMILPS
-  // works with masks.
-  for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
-    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
-      return false;
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
-  }
-  for (int i = QuarterSize*3; i < NumElems; ++i) {
-    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
-      return false;
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
+      // Make sure its in this lane
+      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
+          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
+        return false;
+
+      // If not lane 0, then we must match lane 0
+      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
+        return false;
+
+      if (Idx >= (int)NumElts)
+        Idx -= NumElts - NumLaneElts;
+      if (!isUndefOrEqual(Idx, Start+i))
+        return false;
+
+    }
   }
 
   return true;
 }
 
-/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VSHUFPSY instruction.
-static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  EVT VT = SVOp->getValueType(0);
-  int NumElems = VT.getVectorNumElements();
-
-  assert(NumElems == 8 && VT.getSizeInBits() == 256 &&
-         "Only supports v8i32 and v8f32 types");
-
-  int HalfSize = NumElems/2;
-  unsigned Mask = 0;
-  for (int i = 0; i != NumElems ; ++i) {
-    if (SVOp->getMaskElt(i) < 0)
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
+                                     unsigned NumElems) {
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int idx = Mask[i];
+    if (idx < 0)
       continue;
-    // The mask of the first half must be equal to the second one.
-    unsigned Shamt = (i%HalfSize)*2;
-    unsigned Elt = SVOp->getMaskElt(i) % HalfSize;
-    Mask |= Elt << Shamt;
+    else if (idx < (int)NumElems)
+      Mask[i] = idx + NumElems;
+    else
+      Mask[i] = idx - NumElems;
  }
-
-  return Mask;
 }
 
-/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS
-/// version and the mask of the second half isn't binded with the first
-/// one.
-static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                           const X86Subtarget *Subtarget) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128/256-bit
+/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
+/// reverse of what x86 shuffles want.
+static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
+                        bool Commuted = false) {
+  if (!HasAVX && VT.getSizeInBits() == 256)
     return false;
 
-  if (NumElems != 4)
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElems = NumElems/NumLanes;
+
+  if (NumLaneElems != 2 && NumLaneElems != 4)
     return false;
 
   // VSHUFPSY divides the resulting vector into 4 chunks.
   // The sources are also splitted into 4 chunks, and each destination
   // chunk must come from a different source chunk.
   //
+  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
+  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
+  //
+  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
+  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
+  //
+  // VSHUFPDY divides the resulting vector into 4 chunks.
+  // The sources are also splitted into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
   //  SRC1 =>      X3       X2       X1       X0
   //  SRC2 =>      Y3       Y2       Y1       Y0
   //
-  //  DST  =>  Y2..Y3,  X2..X3,  Y1..Y0,  X1..X0
+  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   //
-  int QuarterSize = NumElems/4;
-  int HalfSize = QuarterSize*2;
-  for (int i = 0; i < QuarterSize; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
-      return false;
-  for (int i = QuarterSize; i < QuarterSize*2; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
-      return false;
-  for (int i = QuarterSize*2; i < QuarterSize*3; ++i)
-    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
-      return false;
-  for (int i = QuarterSize*3; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
-      return false;
-
-  return true;
-}
-
-/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VSHUFPDY instruction.
-static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  EVT VT = SVOp->getValueType(0);
-  int NumElems = VT.getVectorNumElements();
-
-  assert(NumElems == 4 && VT.getSizeInBits() == 256 &&
-         "Only supports v4i64 and v4f64 types");
-
-  int HalfSize = NumElems/2;
-  unsigned Mask = 0;
-  for (int i = 0; i != NumElems ; ++i) {
-    if (SVOp->getMaskElt(i) < 0)
-      continue;
-    int Elt = SVOp->getMaskElt(i) % HalfSize;
-    Mask |= Elt << i;
+  unsigned HalfLaneElems = NumLaneElems/2;
+  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
+    for (unsigned i = 0; i != NumLaneElems; ++i) {
+      int Idx = Mask[i+l];
+      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
+      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
+        return false;
+      // For VSHUFPSY, the mask of the second half must be the same as the
+      // first but with the appropriate offsets. This works in the same way as
+      // VPERMILPS works with masks.
+      if (NumElems != 8 || l == 0 || Mask[i] < 0)
+        continue;
+      if (!isUndefOrEqual(Idx, Mask[i]+l))
+        return false;
+    }
   }
 
-  return Mask;
-}
-
-/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128-bit
-/// SHUFPS and SHUFPD.
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (VT.getSizeInBits() != 128)
-    return false;
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-
-  int Half = NumElems / 2;
-  for (int i = 0; i < Half; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, NumElems))
-      return false;
-  for (int i = Half; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
-      return false;
-
-  return true;
-}
-
-bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return ::isSHUFPMask(M, N->getValueType(0));
-}
-
-/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
-/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
-/// half elements to come from vector 1 (which would equal the dest.) and
-/// the upper half to come from vector 2.
-static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-
-  int Half = NumElems / 2;
-  for (int i = 0; i < Half; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
-      return false;
-  for (int i = Half; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, NumElems))
-      return false;
   return true;
 }
 
-static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return isCommutedSHUFPMask(M, N->getValueType(0));
-}
-
 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
-  EVT VT = N->getValueType(0);
+static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
   unsigned NumElems = VT.getVectorNumElements();
 
   if (VT.getSizeInBits() != 128)
@@ -3438,17 +3382,16 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
     return false;
 
   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
-  return isUndefOrEqual(N->getMaskElt(0), 6) &&
-         isUndefOrEqual(N->getMaskElt(1), 7) &&
-         isUndefOrEqual(N->getMaskElt(2), 2) &&
-         isUndefOrEqual(N->getMaskElt(3), 3);
+  return isUndefOrEqual(Mask[0], 6) &&
+         isUndefOrEqual(Mask[1], 7) &&
+         isUndefOrEqual(Mask[2], 2) &&
+         isUndefOrEqual(Mask[3], 3);
 }
 
 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
 /// <2, 3, 2, 3>
-bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
-  EVT VT = N->getValueType(0);
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
   unsigned NumElems = VT.getVectorNumElements();
 
   if (VT.getSizeInBits() != 128)
@@ -3457,26 +3400,29 @@ bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
   if (NumElems != 4)
     return false;
 
-  return isUndefOrEqual(N->getMaskElt(0), 2) &&
-         isUndefOrEqual(N->getMaskElt(1), 3) &&
-         isUndefOrEqual(N->getMaskElt(2), 2) &&
-         isUndefOrEqual(N->getMaskElt(3), 3);
+  return isUndefOrEqual(Mask[0], 2) &&
+         isUndefOrEqual(Mask[1], 3) &&
+         isUndefOrEqual(Mask[2], 2) &&
+         isUndefOrEqual(Mask[3], 3);
 }
 
 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); +static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) { + if (VT.getSizeInBits() != 128) + return false; + + unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; - for (unsigned i = NumElems/2; i < NumElems; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + for (unsigned i = NumElems/2; i != NumElems; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; return true; @@ -3484,19 +3430,19 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. -bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { - unsigned NumElems = N->getValueType(0).getVectorNumElements(); +static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { + unsigned NumElems = VT.getVectorNumElements(); if ((NumElems != 2 && NumElems != 4) - || N->getValueType(0).getSizeInBits() > 128) + || VT.getSizeInBits() > 128) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; - for (unsigned i = 0; i < NumElems/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems)) return false; return true; @@ -3504,14 +3450,15 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. -static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool V2IsSplat = false) { - int NumElts = VT.getVectorNumElements(); +static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, + bool HasAVX2, bool V2IsSplat = false) { + unsigned NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3519,11 +3466,9 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; - unsigned Start = 0; - unsigned End = NumLaneElts; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = Start, j = s * NumLaneElts; - i != End; + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; @@ -3537,30 +3482,22 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, return false; } } - // Process the next 128 bits. 
- Start += NumLaneElts;
- End += NumLaneElts;
 }
 return true;
}
-bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
-}
-
/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool V2IsSplat = false) {
- int NumElts = VT.getVectorNumElements();
+static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
+ bool HasAVX2, bool V2IsSplat = false) {
+ unsigned NumElts = VT.getVectorNumElements();
 assert((VT.is128BitVector() || VT.is256BitVector()) &&
 "Unsupported vector type for unpckh");
- if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
 return false;
 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -3568,11 +3505,9 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
 unsigned NumLanes = VT.getSizeInBits()/128;
 unsigned NumLaneElts = NumElts/NumLanes;
- unsigned Start = 0;
- unsigned End = NumLaneElts;
 for (unsigned l = 0; l != NumLanes; ++l) {
- for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
- i != End; i += 2, ++j) {
+ for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != (l+1)*NumLaneElts; i += 2, ++j) {
 int BitI = Mask[i];
 int BitI1 = Mask[i+1];
 if (!isUndefOrEqual(BitI, j))
 return false;
@@ -3585,42 +3520,39 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
 return false;
 }
 }
- // Process the next 128 bits.
- Start += NumLaneElts;
- End += NumLaneElts;
 }
 return true;
}
-bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
-}
-
/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
- int NumElems = VT.getVectorNumElements();
- if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
+ bool HasAVX2) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
 return false;
 // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
 // FIXME: Need a better way to get rid of this, there's no latency difference
 // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
 // the former later. We should also remove the "_undef" special mask.
- if (NumElems == 4 && VT.getSizeInBits() == 256)
+ if (NumElts == 4 && VT.getSizeInBits() == 256)
 return false;
 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
 // independently on 128-bit lanes.
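For reference, the per-lane interleave pattern that isUNPCKLMask/isUNPCKHMask accept (undefs aside) can be generated directly. A hypothetical standalone generator, not LLVM code:

#include <cstdio>
#include <vector>

// Builds the reference unpack mask: within each 128-bit lane, alternate
// element j of V1 with element j of V2, starting at the low (or high)
// half of the lane.
static std::vector<int> makeUnpackMask(unsigned NumElts, unsigned NumLanes,
                                       bool High) {
  std::vector<int> Mask;
  unsigned NumLaneElts = NumElts / NumLanes;
  for (unsigned l = 0; l != NumLanes; ++l) {
    unsigned j = l * NumLaneElts + (High ? NumLaneElts / 2 : 0);
    for (unsigned i = 0; i != NumLaneElts; i += 2, ++j) {
      Mask.push_back(j);            // element j of V1
      Mask.push_back(j + NumElts);  // element j of V2
    }
  }
  return Mask;
}

int main() {
  // v8i32 on AVX: two 128-bit lanes handled independently.
  for (int M : makeUnpackMask(8, 2, /*High=*/false)) printf("%d ", M);
  printf("\n"); // 0 8 1 9 4 12 5 13
  for (int M : makeUnpackMask(8, 2, /*High=*/true)) printf("%d ", M);
  printf("\n"); // 2 10 3 11 6 14 7 15
}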
- unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumLaneElts = NumElems / NumLanes; + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; - for (unsigned s = 0; s < NumLanes; ++s) { - for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; - i != NumLaneElts * (s + 1); + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = l*NumLaneElts; + i != (l+1)*NumLaneElts; i += 2, ++j) { int BitI = Mask[i]; int BitI1 = Mask[i+1]; @@ -3635,81 +3567,77 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { return true; } -bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); -} - /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, /// <2, 2, 3, 3> -static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { - int NumElems = VT.getVectorNumElements(); - if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) +static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) { + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; - for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2; + i != (l+1)*NumLaneElts; i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } } return true; } -bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); -} - /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. -static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { +static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) { if (VT.getVectorElementType().getSizeInBits() < 32) return false; + if (VT.getSizeInBits() == 256) + return false; - int NumElts = VT.getVectorNumElements(); + unsigned NumElts = VT.getVectorNumElements(); if (!isUndefOrEqual(Mask[0], NumElts)) return false; - for (int i = 1; i < NumElts; ++i) + for (unsigned i = 1; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; return true; } -bool X86::isMOVLMask(ShuffleVectorSDNode *N) { - SmallVector<int, 8> M; - N->getMask(M); - return ::isMOVLMask(M, N->getValueType(0)); -} - -/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. 
As an example, the
/// shuffle below:
/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// The first half comes from the second half of V1 and the second half from
/// the second half of V2.
-static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
- const X86Subtarget *Subtarget) {
- if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
+ if (!HasAVX || VT.getSizeInBits() != 256)
 return false;
 // The shuffle result is divided into half A and half B. In total the two
 // sources have 4 halves, namely: C, D, E, F. The final values of A and
 // B must come from C, D, E or F.
- int HalfSize = VT.getVectorNumElements()/2;
+ unsigned HalfSize = VT.getVectorNumElements()/2;
 bool MatchA = false, MatchB = false;
 // Check if A comes from one of C, D, E, F.
- for (int Half = 0; Half < 4; ++Half) {
+ for (unsigned Half = 0; Half != 4; ++Half) {
 if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
 MatchA = true;
 break;
@@ -3717,7 +3645,7 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
 }
 // Check if B comes from one of C, D, E, F.
- for (int Half = 0; Half < 4; ++Half) {
+ for (unsigned Half = 0; Half != 4; ++Half) {
 if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
 MatchB = true;
 break;
@@ -3727,22 +3655,21 @@ static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
 return MatchA && MatchB;
}
-/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
-static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
 EVT VT = SVOp->getValueType(0);
- int HalfSize = VT.getVectorNumElements()/2;
+ unsigned HalfSize = VT.getVectorNumElements()/2;
- int FstHalf = 0, SndHalf = 0;
- for (int i = 0; i < HalfSize; ++i) {
+ unsigned FstHalf = 0, SndHalf = 0;
+ for (unsigned i = 0; i < HalfSize; ++i) {
 if (SVOp->getMaskElt(i) > 0) {
 FstHalf = SVOp->getMaskElt(i)/HalfSize;
 break;
 }
 }
- for (int i = HalfSize; i < HalfSize*2; ++i) {
+ for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
 if (SVOp->getMaskElt(i) > 0) {
 SndHalf = SVOp->getMaskElt(i)/HalfSize;
 break;
@@ -3752,141 +3679,56 @@ static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
 return (FstHalf | (SndHalf << 4));
}
-/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching is different depending on whether the underlying
/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
-/// with the same restriction that lanes can't be crossed.
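A worked example of the immediate computation above, as a standalone sketch with plain ints in place of the SDNode accessors:

#include <cassert>
#include <cstdio>
#include <vector>

// Each result half is described by which of the four source halves
// (V1 low = 0, V1 high = 1, V2 low = 2, V2 high = 3) its first defined
// element falls into; the two nibbles are packed into one immediate.
static unsigned vperm2x128Imm(const std::vector<int> &Mask) {
  unsigned HalfSize = Mask.size() / 2;
  unsigned FstHalf = 0, SndHalf = 0;
  for (unsigned i = 0; i != HalfSize; ++i)
    if (Mask[i] > 0) { FstHalf = Mask[i] / HalfSize; break; }
  for (unsigned i = HalfSize; i != 2 * HalfSize; ++i)
    if (Mask[i] > 0) { SndHalf = Mask[i] / HalfSize; break; }
  return FstHalf | (SndHalf << 4);
}

int main() {
  // The v8i32 shuffle <4,5,6,7,12,13,14,15> from the comment above:
  // low result half = V1's high half (1), high half = V2's high half (3).
  unsigned Imm = vperm2x128Imm({4, 5, 6, 7, 12, 13, 14, 15});
  assert(Imm == 0x31);
  printf("imm = 0x%x\n", Imm);
}

Note that, like the source, the sketch skips mask element 0 via the "> 0" test and falls back to half 0 when a result half is entirely undef.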
-static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) - return false; - - // Only match 256-bit with 64-bit types - if (VT.getSizeInBits() != 256 || NumElts != 4) +/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. +static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { + if (!HasAVX) return false; - // The mask on the high lane is independent of the low. Both can match - // any element in inside its own lane, but can't cross. - int LaneSize = NumElts/NumLanes; - for (int l = 0; l < NumLanes; ++l) - for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { - int LaneStart = l*LaneSize; - if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) - return false; - } - - return true; -} - -/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand -/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. -/// Note that VPERMIL mask matching is different depending whether theunderlying -/// type is 32 or 64. In the VPERMILPS the high half of the mask should point -/// to the same elements of the low, but to the higher half of the source. -/// In VPERMILPD the two lanes could be shuffled independently of each other -/// with the same restriction that lanes can't be crossed. -static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, - const X86Subtarget *Subtarget) { unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.getSizeInBits()/128; - - if (!Subtarget->hasAVX()) + // Only match 256-bit with 32/64-bit types + if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8)) return false; - // Only match 256-bit with 32-bit types - if (VT.getSizeInBits() != 256 || NumElts != 8) - return false; - - // The mask on the high lane should be the same as the low. Actually, - // they can differ if any of the corresponding index in a lane is undef - // and the other stays in range. - int LaneSize = NumElts/NumLanes; - for (int i = 0; i < LaneSize; ++i) { - int HighElt = i+LaneSize; - bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); - bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); - - if (!HighValid || !LowValid) - return false; - if (Mask[i] < 0 || Mask[HighElt] < 0) - continue; - if (Mask[HighElt]-Mask[i] != LaneSize) - return false; - } - - return true; -} - -/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_MASK mask with VPERMILPS* instructions. -static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VT = SVOp->getValueType(0); - - int NumElts = VT.getVectorNumElements(); - int NumLanes = VT.getSizeInBits()/128; - int LaneSize = NumElts/NumLanes; - - // Although the mask is equal for both lanes do it twice to get the cases - // where a mask will match because the same mask element is undef on the - // first half but valid on the second. This would get pathological cases - // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. 
- unsigned Mask = 0;
- for (int l = 0; l < NumLanes; ++l) {
- for (int i = 0; i < LaneSize; ++i) {
- int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
- if (MaskElt < 0)
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned LaneSize = NumElts/NumLanes;
+ for (unsigned l = 0; l != NumElts; l += LaneSize) {
+ for (unsigned i = 0; i != LaneSize; ++i) {
+ if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+ return false;
+ if (NumElts != 8 || l == 0)
 continue;
- if (MaskElt >= LaneSize)
- MaskElt -= LaneSize;
- Mask |= MaskElt << (i*2);
- }
- }
-
- return Mask;
-}
-
-/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
-static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- EVT VT = SVOp->getValueType(0);
-
- int NumElts = VT.getVectorNumElements();
- int NumLanes = VT.getSizeInBits()/128;
-
- unsigned Mask = 0;
- int LaneSize = NumElts/NumLanes;
- for (int l = 0; l < NumLanes; ++l)
- for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
- int MaskElt = SVOp->getMaskElt(i);
- if (MaskElt < 0)
+ // VPERMILPS handling
+ if (Mask[i] < 0)
 continue;
- Mask |= (MaskElt-l*LaneSize) << i;
+ if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
+ return false;
 }
+ }
- return Mask;
+ return true;
}
-/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
+/// isCommutedMOVLMask - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants. X86 movss requires the lowest element to be the
/// lowest element of vector 2 and the other elements to come from vector 1 in
/// order.
-static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
 bool V2IsSplat = false, bool V2IsUndef = false) {
- int NumOps = VT.getVectorNumElements();
+ unsigned NumOps = VT.getVectorNumElements();
+ if (VT.getSizeInBits() == 256)
+ return false;
 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
 return false;
 if (!isUndefOrEqual(Mask[0], 0))
 return false;
- for (int i = 1; i < NumOps; ++i)
+ for (unsigned i = 1; i != NumOps; ++i)
 if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
@@ -3895,26 +3737,14 @@ static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
 return true;
}
-static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
- bool V2IsUndef = false) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
-}
-
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> -bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) - return false; - - // The second vector must be undef - if (N->getOperand(1).getOpcode() != ISD::UNDEF) +static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) return false; - EVT VT = N->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if ((VT.getSizeInBits() == 128 && NumElems != 4) || @@ -3922,9 +3752,9 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, return false; // "i+1" is the value the indexed mask element must have - for (unsigned i = 0; i < NumElems; i += 2) - if (!isUndefOrEqual(N->getMaskElt(i), i+1) || - !isUndefOrEqual(N->getMaskElt(i+1), i+1)) + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i+1) || + !isUndefOrEqual(Mask[i+1], i+1)) return false; return true; @@ -3933,16 +3763,11 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> -bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) - return false; - - // The second vector must be undef - if (N->getOperand(1).getOpcode() != ISD::UNDEF) +static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3()) return false; - EVT VT = N->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); if ((VT.getSizeInBits() == 128 && NumElems != 4) || @@ -3950,9 +3775,9 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, return false; // "i" is the value the indexed mask element must have - for (unsigned i = 0; i < NumElems; i += 2) - if (!isUndefOrEqual(N->getMaskElt(i), i) || - !isUndefOrEqual(N->getMaskElt(i+1), i)) + for (unsigned i = 0; i != NumElems; i += 2) + if (!isUndefOrEqual(Mask[i], i) || + !isUndefOrEqual(Mask[i+1], i)) return false; return true; @@ -3961,21 +3786,17 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. 
-static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, - const X86Subtarget *Subtarget) { - EVT VT = N->getValueType(0); - int NumElts = VT.getVectorNumElements(); - bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; +static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { + unsigned NumElts = VT.getVectorNumElements(); - if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || - !V2IsUndef || NumElts != 4) + if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4) return false; - for (int i = 0; i != NumElts/2; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), 0)) + for (unsigned i = 0; i != NumElts/2; ++i) + if (!isUndefOrEqual(Mask[i], 0)) return false; - for (int i = NumElts/2; i != NumElts; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) + for (unsigned i = NumElts/2; i != NumElts; ++i) + if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; } @@ -3983,18 +3804,16 @@ static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. -bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); - +static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) { if (VT.getSizeInBits() != 128) return false; - int e = VT.getVectorNumElements() / 2; - for (int i = 0; i < e; ++i) - if (!isUndefOrEqual(N->getMaskElt(i), i)) + unsigned e = VT.getVectorNumElements() / 2; + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[i], i)) return false; - for (int i = 0; i < e; ++i) - if (!isUndefOrEqual(N->getMaskElt(e+i), i)) + for (unsigned i = 0; i != e; ++i) + if (!isUndefOrEqual(Mask[e+i], i)) return false; return true; } @@ -4039,31 +3858,43 @@ bool X86::isVINSERTF128Index(SDNode *N) { /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. -unsigned X86::getShuffleSHUFImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - int NumOperands = SVOp->getValueType(0).getVectorNumElements(); +/// Handles 128-bit and 256-bit. +static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for PSHUF/SHUFP"); - unsigned Shift = (NumOperands == 4) ? 2 : 1; + // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate + // independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + assert((NumLaneElts == 2 || NumLaneElts == 4) && + "Only supports 2 or 4 elements per lane"); + + unsigned Shift = (NumLaneElts == 4) ? 1 : 0; unsigned Mask = 0; - for (int i = 0; i < NumOperands; ++i) { - int Val = SVOp->getMaskElt(NumOperands-i-1); - if (Val < 0) Val = 0; - if (Val >= NumOperands) Val -= NumOperands; - Mask |= Val; - if (i != NumOperands - 1) - Mask <<= Shift; + for (unsigned i = 0; i != NumElts; ++i) { + int Elt = N->getMaskElt(i); + if (Elt < 0) continue; + Elt %= NumLaneElts; + unsigned ShAmt = i << Shift; + if (ShAmt >= 8) ShAmt -= 8; + Mask |= Elt << ShAmt; } + return Mask; } /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 
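The lane-replicated immediate built by getShuffleSHUFImmediate above can be illustrated in isolation. A minimal sketch (mask as a vector of ints; undefs contribute nothing):

#include <cstdio>
#include <vector>

static unsigned shufImm(const std::vector<int> &Mask, unsigned NumLaneElts) {
  unsigned Shift = (NumLaneElts == 4) ? 1 : 0; // 2 bits/elt vs 1 bit/elt
  unsigned Imm = 0;
  for (unsigned i = 0; i != Mask.size(); ++i) {
    int Elt = Mask[i];
    if (Elt < 0) continue;        // undef: leave those bits zero
    Elt %= NumLaneElts;           // indices repeat per 128-bit lane
    unsigned ShAmt = i << Shift;
    if (ShAmt >= 8) ShAmt -= 8;   // second lane reuses the same 8 bits
    Imm |= Elt << ShAmt;
  }
  return Imm;
}

int main() {
  // PSHUFD v4i32 <3,1,0,2>: 3 | 1<<2 | 0<<4 | 2<<6 = 0x87.
  printf("0x%x\n", shufImm({3, 1, 0, 2}, 4));
  // Lane-repeated v8f32 <1,0,3,2, 5,4,7,6>: both lanes encode 0xb1.
  printf("0x%x\n", shufImm({1, 0, 3, 2, 5, 4, 7, 6}, 4));
}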
-unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { unsigned Mask = 0; // 8 nodes, but we only care about the last 4. for (unsigned i = 7; i >= 4; --i) { - int Val = SVOp->getMaskElt(i); + int Val = N->getMaskElt(i); if (Val >= 0) Mask |= (Val - 4); if (i != 4) @@ -4074,12 +3905,11 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. -unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); +static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { unsigned Mask = 0; // 8 nodes, but we only care about the first 4. for (int i = 3; i >= 0; --i) { - int Val = SVOp->getMaskElt(i); + int Val = N->getMaskElt(i); if (Val >= 0) Mask |= Val; if (i != 0) @@ -4090,18 +3920,24 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. -unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - EVT VVT = N->getValueType(0); - unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; - int Val = 0; +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; - unsigned i, e; - for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { + int Val = 0; + unsigned i; + for (i = 0; i != NumElts; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; } + if (Val >= (int)NumElts) + Val -= NumElts - NumLaneElts; + assert(Val - i > 0 && "PALIGNR imm should be positive"); return (Val - i) * EltSize; } @@ -4170,36 +4006,20 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SVOp->getOperand(0), &MaskVec[0]); } -/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming -/// the two vector operands have swapped position. -static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { - unsigned NumElems = VT.getVectorNumElements(); - for (unsigned i = 0; i != NumElems; ++i) { - int idx = Mask[i]; - if (idx < 0) - continue; - else if (idx < (int)NumElems) - Mask[i] = idx + NumElems; - else - Mask[i] = idx - NumElems; - } -} - /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). 
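The MOVHLPS transform test described above accepts exactly the v4 pattern <2,3,6,7>, modulo undefs: the low result half comes from the high half of V1 and the high result half from the high half of V2. A standalone sketch of that check, not the LLVM helper:

#include <cstdio>
#include <vector>

static bool isUndefOrEqual(int Val, int CmpVal) {
  return Val < 0 || Val == CmpVal;
}

static bool shouldXformToMOVHLPS(const std::vector<int> &Mask) {
  if (Mask.size() != 4)
    return false;
  for (unsigned i = 0; i != 2; ++i)        // low half <- high half of V1
    if (!isUndefOrEqual(Mask[i], i + 2))
      return false;
  for (unsigned i = 2; i != 4; ++i)        // high half <- high half of V2
    if (!isUndefOrEqual(Mask[i], i + 4))
      return false;
  return true;
}

int main() {
  printf("%d\n", shouldXformToMOVHLPS({2, 3, 6, 7}));   // 1
  printf("%d\n", shouldXformToMOVHLPS({2, -1, 6, 7}));  // 1 (undef is fine)
  printf("%d\n", shouldXformToMOVHLPS({0, 1, 6, 7}));   // 0
}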
-static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
- EVT VT = Op->getValueType(0);
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
 if (VT.getSizeInBits() != 128)
 return false;
 if (VT.getVectorNumElements() != 4)
 return false;
 for (unsigned i = 0, e = 2; i != e; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
+ if (!isUndefOrEqual(Mask[i], i+2))
 return false;
 for (unsigned i = 2; i != 4; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
+ if (!isUndefOrEqual(Mask[i], i+4))
 return false;
 return true;
}
@@ -4218,14 +4038,36 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
 return true;
}
+// Test whether the given value is a vector value which will be legalized
+// into a load.
+static bool WillBeConstantPoolLoad(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ // Check for any non-constant elements.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ switch (N->getOperand(i).getNode()->getOpcode()) {
+ case ISD::UNDEF:
+ case ISD::ConstantFP:
+ case ISD::Constant:
+ break;
+ default:
+ return false;
+ }
+
+ // Vectors of all-zeros and all-ones are materialized with special
+ // instructions rather than being loaded.
+ return !ISD::isBuildVectorAllZeros(N) &&
+ !ISD::isBuildVectorAllOnes(N);
+}
+
/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
- ShuffleVectorSDNode *Op) {
- EVT VT = Op->getValueType(0);
+ ArrayRef<int> Mask, EVT VT) {
 if (VT.getSizeInBits() != 128)
 return false;
@@ -4233,7 +4075,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
 return false;
 // If V2 is a vector load, don't do this transformation. We will try to use
 // load folding shufps op.
- if (ISD::isNON_EXTLoad(V2))
+ if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
 return false;
 unsigned NumElems = VT.getVectorNumElements();
 if (NumElems != 2 && NumElems != 4)
 return false;
 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i))
+ if (!isUndefOrEqual(Mask[i], i))
 return false;
 for (unsigned i = NumElems/2; i != NumElems; ++i)
- if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
+ if (!isUndefOrEqual(Mask[i], i+NumElems))
 return false;
 return true;
}
@@ -4292,15 +4134,15 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
- DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, DebugLoc dl) {
 assert(VT.isVector() && "Expected a vector type");
 // Always build SSE zero vectors as <4 x i32> bitcasted
 // to their dest type. This ensures they get CSE'd.
SDValue Vec;
 if (VT.getSizeInBits() == 128) { // SSE
- if (HasXMMInt) { // SSE2
+ if (Subtarget->hasSSE2()) { // SSE2
 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
 } else { // SSE1
@@ -4308,34 +4150,46 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
 }
 } else if (VT.getSizeInBits() == 256) { // AVX
- // 256-bit logic and arithmetic instructions in AVX are
- // all floating-point, no support for integer ops. Default
- // to emitting fp zeroed vectors then.
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ if (Subtarget->hasAVX2()) { // AVX2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ } else {
+ // 256-bit logic and arithmetic instructions in AVX are all
+ // floating-point, no support for integer ops. Emit fp zeroed vectors.
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ }
 }
 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
/// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
-/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
-/// original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
+ DebugLoc dl) {
 assert(VT.isVector() && "Expected a vector type");
 assert((VT.is128BitVector() || VT.is256BitVector())
 && "Expected a 128-bit or 256-bit vector type");
 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
- SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
- Cst, Cst, Cst, Cst);
-
- if (VT.is256BitVector()) {
- SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
- Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
- Vec = Insert128BitVector(InsV, Vec,
- DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ SDValue Vec;
+ if (VT.getSizeInBits() == 256) {
+ if (HasAVX2) { // AVX2
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ } else { // AVX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+ Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Vec = Insert128BitVector(InsV, Vec,
+ DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ }
+ } else {
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
 }
 return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
@@ -4343,24 +4197,12 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
-static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- bool Changed = false;
- SmallVector<int, 8> MaskVec;
- SVOp->getMask(MaskVec);
-
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
 for (unsigned i = 0; i != NumElems; ++i) {
- if (MaskVec[i] > (int)NumElems) {
- MaskVec[i] = NumElems;
- Changed = true;
+ if (Mask[i] > (int)NumElems) {
+ Mask[i] = NumElems;
 }
 }
- if (Changed)
- return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
- SVOp->getOperand(1), &MaskVec[0]);
- return SDValue(SVOp, 0);
}
/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
@@ -4464,7 +4306,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
 // Extract the 128-bit part containing the splat element and update
 // the splat element index when it refers to the higher register.
 if (Size == 256) {
- unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+ unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
 V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
 if (Idx > 0)
 EltNo -= NumElems/2;
@@ -4496,11 +4338,12 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,7 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
- bool isZero, bool HasXMMInt,
+ bool IsZero,
+ const X86Subtarget *Subtarget,
 SelectionDAG &DAG) {
 EVT VT = V2.getValueType();
- SDValue V1 = isZero
- ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ SDValue V1 = IsZero
+ ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
 unsigned NumElems = VT.getVectorNumElements();
 SmallVector<int, 16> MaskVec;
 for (unsigned i = 0; i != NumElems; ++i)
@@ -4509,9 +4352,81 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}
+/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
+/// target specific opcode. Returns true if the Mask could be calculated.
+/// Sets IsUnary to true if it only uses one source.
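The mask built by getShuffleVectorZeroOrUndef above is easy to picture in isolation; a small sketch (plain C++, with the vector sizes hard-coded for the demo):

#include <cstdio>
#include <vector>

// Element Idx selects V2's element Idx (offset by NumElems); every other
// position picks the corresponding element of the zero/undef vector V1.
static std::vector<int> zeroOrUndefMask(unsigned NumElems, unsigned Idx) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElems; ++i)
    Mask.push_back(i == Idx ? (int)(i + NumElems) : (int)i);
  return Mask;
}

int main() {
  for (int M : zeroOrUndefMask(4, 0)) printf("%d ", M); // 4 1 2 3
  printf("\n");
  for (int M : zeroOrUndefMask(4, 3)) printf("%d ", M); // 0 1 2 7
  printf("\n");
}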
+static bool getTargetShuffleMask(SDNode *N, EVT VT, + SmallVectorImpl<int> &Mask, bool &IsUnary) { + unsigned NumElems = VT.getVectorNumElements(); + SDValue ImmN; + + IsUnary = false; + switch(N->getOpcode()) { + case X86ISD::SHUFP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, Mask); + break; + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, Mask); + break; + case X86ISD::MOVHLPS: + DecodeMOVHLPSMask(NumElems, Mask); + break; + case X86ISD::MOVLHPS: + DecodeMOVLHPSMask(NumElems, Mask); + break; + case X86ISD::PSHUFD: + case X86ISD::VPERMILP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFHW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFLW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::MOVSS: + case X86ISD::MOVSD: { + // The index 0 always comes from the first element of the second source, + // this is why MOVSS and MOVSD are used in the first place. The other + // elements come from the other positions of the first source vector + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) { + Mask.push_back(i); + } + break; + } + case X86ISD::VPERM2X128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::MOVDDUP: + case X86ISD::MOVLHPD: + case X86ISD::MOVLPD: + case X86ISD::MOVLPS: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::PALIGN: + // Not yet implemented + return false; + default: llvm_unreachable("unknown target shuffle node"); + } + + return true; +} + /// getShuffleScalarElt - Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. -static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, +static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. @@ -4522,129 +4437,34 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { - Index = SV->getMaskElt(Index); + int Elt = SV->getMaskElt(Index); - if (Index < 0) + if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - int NumElems = VT.getVectorNumElements(); - SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); - return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); + unsigned NumElems = VT.getVectorNumElements(); + SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) + : SV->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. 
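To see what a DecodePSHUFMask-style helper hands back to getTargetShuffleMask, here is a hypothetical standalone decoder, the inverse of the immediate encoding shown earlier (illustration only, not the Utils/X86ShuffleDecode implementation):

#include <cstdio>
#include <vector>

// Expands a PSHUFD-style immediate (2 bits per element) into a mask,
// replicated per 128-bit lane with lane-relative indices.
static std::vector<int> decodePSHUF(unsigned NumElts, unsigned NumLanes,
                                    unsigned Imm) {
  std::vector<int> Mask;
  unsigned NumLaneElts = NumElts / NumLanes;
  for (unsigned l = 0; l != NumElts; l += NumLaneElts)
    for (unsigned i = 0; i != NumLaneElts; ++i)
      Mask.push_back(l + ((Imm >> (2 * i)) & 3)); // lane-relative index
  return Mask;
}

int main() {
  // 0x87 was built from <3,1,0,2> (see getShuffleSHUFImmediate earlier).
  for (int M : decodePSHUF(4, 1, 0x87)) printf("%d ", M); // 3 1 0 2
  printf("\n");
  // The same immediate on a 2-lane v8i32 repeats per lane.
  for (int M : decodePSHUF(8, 2, 0x87)) printf("%d ", M); // 3 1 0 2 7 5 4 6
  printf("\n");
}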
if (isTargetShuffle(Opcode)) { - int NumElems = VT.getVectorNumElements(); - SmallVector<unsigned, 16> ShuffleMask; + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 16> ShuffleMask; SDValue ImmN; + bool IsUnary; - switch(Opcode) { - case X86ISD::SHUFPS: - case X86ISD::SHUFPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeSHUFPSMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - DecodePUNPCKHMask(NumElems, ShuffleMask); - break; - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - DecodeUNPCKHPMask(NumElems, ShuffleMask); - break; - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(VT, ShuffleMask); - break; - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: - DecodeUNPCKLPMask(VT, ShuffleMask); - break; - case X86ISD::MOVHLPS: - DecodeMOVHLPSMask(NumElems, ShuffleMask); - break; - case X86ISD::MOVLHPS: - DecodeMOVLHPSMask(NumElems, ShuffleMask); - break; - case X86ISD::PSHUFD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFMask(NumElems, - cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PSHUFHW: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PSHUFLW: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::MOVSS: - case X86ISD::MOVSD: { - // The index 0 always comes from the first element of the second source, - // this is why MOVSS and MOVSD are used in the first place. The other - // elements come from the other positions of the first source vector. - unsigned OpNum = (Index == 0) ? 1 : 0; - return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, - Depth+1); - } - case X86ISD::VPERMILPS: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPSY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPD: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERMILPDY: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::VPERM2F128: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::MOVDDUP: - case X86ISD::MOVLHPD: - case X86ISD::MOVLPD: - case X86ISD::MOVLPS: - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: - case X86ISD::PALIGN: - return SDValue(); // Not yet implemented. - default: - assert(0 && "unknown target shuffle node"); + if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary)) return SDValue(); - } - Index = ShuffleMask[Index]; - if (Index < 0) + int Elt = ShuffleMask[Index]; + if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - SDValue NewV = (Index < NumElems) ? 
N->getOperand(0) : N->getOperand(1);
- return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
+ SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
+ : N->getOperand(1);
+ return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
 Depth+1);
 }
@@ -4660,7 +4480,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
 return (Index == 0) ? V.getOperand(0)
- : DAG.getUNDEF(VT.getVectorElementType());
+ : DAG.getUNDEF(VT.getVectorElementType());
 if (V.getOpcode() == ISD::BUILD_VECTOR)
 return V.getOperand(Index);
@@ -4672,38 +4492,37 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
/// shuffle operation which come consecutively from a zero. The
/// search can start in two different directions, from left or right.
static
-unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
 bool ZerosFromLeft, SelectionDAG &DAG) {
- int i = 0;
-
- while (i < NumElems) {
+ unsigned i;
+ for (i = 0; i != NumElems; ++i) {
 unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
- SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
+ SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
 if (!(Elt.getNode() &&
 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
 break;
- ++i;
 }
 return i;
}
-/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to
-/// MaskE correspond consecutively to elements from one of the vector operands,
+/// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
+/// correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also tell OpNum which source vector operand.
static
-bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
- int OpIdx, int NumElems, unsigned &OpNum) {
+bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
+ unsigned MaskI, unsigned MaskE, unsigned OpIdx,
+ unsigned NumElems, unsigned &OpNum) {
 bool SeenV1 = false;
 bool SeenV2 = false;
- for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
+ for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
 int Idx = SVOp->getMaskElt(i);
 // Ignore undef indices
 if (Idx < 0)
 continue;
- if (Idx < NumElems)
+ if (Idx < (int)NumElems)
 SeenV1 = true;
 else
 SeenV2 = true;
@@ -4738,7 +4557,7 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
 //
 if (!isShuffleMaskConsecutive(SVOp,
 0, // Mask Start Index
- NumElems-NumZeros-1, // Mask End Index
+ NumElems-NumZeros, // Mask End Index (exclusive)
 NumZeros, // Where to start looking in the src vector
 NumElems, // Number of elements in vector
 OpSrc)) // Which source operand ?
@@ -4771,7 +4590,7 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
 //
 if (!isShuffleMaskConsecutive(SVOp,
 NumZeros, // Mask Start Index
- NumElems-1, // Mask End Index
+ NumElems, // Mask End Index (exclusive)
 0, // Where to start looking in the src vector
 NumElems, // Number of elements in vector
 OpSrc)) // Which source operand ?
@@ -4804,6 +4623,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); @@ -4815,7 +4635,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG, dl); + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; @@ -4851,6 +4671,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, + const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); @@ -4863,7 +4684,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (isNonZero) { if (First) { if (NumZero) - V = getZeroVector(MVT::v8i16, true, DAG, dl); + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; @@ -4884,7 +4705,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, const TargetLowering &TLI, DebugLoc dl) { assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; - unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; + unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, @@ -4952,21 +4773,16 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, int EltNo = (Offset - StartOffset) >> 2; int NumElems = VT.getVectorNumElements(); - EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32; EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), - false, false, 0); + false, false, false, 0); - // Canonicalize it to a v4i32 or v8i32 shuffle. 
SmallVector<int, 8> Mask;
 for (int i = 0; i < NumElems; ++i)
 Mask.push_back(EltNo);
- V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
- return DAG.getNode(ISD::BITCAST, dl, NVT,
- DAG.getVectorShuffle(CanonVT, dl, V1,
- DAG.getUNDEF(CanonVT),&Mask[0]));
+ return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
 }
 return SDValue();
@@ -5021,11 +4837,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
 LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->isInvariant(), 0);
 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
 LDBase->getPointerInfo(),
 LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->getAlignment());
+ LDBase->isInvariant(), LDBase->getAlignment());
 } else if (NumElems == 4 && LastLoadedElt == 1 &&
 DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
@@ -5041,6 +4858,137 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 return SDValue();
}
+/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
+/// to generate a splat value for the following cases:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+/// The VBROADCAST node is returned when a pattern is found,
+/// or SDValue() otherwise.
+SDValue
+X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue Ld;
+ bool ConstSplatVal;
+
+ switch (Op.getOpcode()) {
+ default:
+ // Unknown pattern found.
+ return SDValue();
+
+ case ISD::BUILD_VECTOR: {
+ // The BUILD_VECTOR node must be a splat.
+ if (!isSplatVector(Op.getNode()))
+ return SDValue();
+
+ Ld = Op.getOperand(0);
+ ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+ Ld.getOpcode() == ISD::ConstantFP);
+
+ // The suspected load node has several users. Make sure that all
+ // of its users are from the BUILD_VECTOR node.
+ // Constants may have multiple users.
+ if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+ return SDValue();
+ break;
+ }
+
+ case ISD::VECTOR_SHUFFLE: {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+ // Shuffles must have a splat mask where the first element is
+ // broadcasted.
+ if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+ return SDValue();
+
+ SDValue Sc = Op.getOperand(0);
+ if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return SDValue();
+
+ Ld = Sc.getOperand(0);
+ ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+ Ld.getOpcode() == ISD::ConstantFP);
+
+ // The scalar_to_vector node and the suspected
+ // load node must have exactly one user.
+ // Constants may have multiple users.
+ if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+ return SDValue();
+ break;
+ }
+ }
+
+ bool Is256 = VT.getSizeInBits() == 256;
+ bool Is128 = VT.getSizeInBits() == 128;
+
+ // Handle broadcasting a single constant scalar from the constant pool
+ // into a vector. On Sandybridge it is still better to load a constant vector
+ // from the constant pool and not to broadcast it from a scalar.
+ if (ConstSplatVal && Subtarget->hasAVX2()) { + EVT CVT = Ld.getValueType(); + assert(!CVT.isVector() && "Must not broadcast a vector type"); + unsigned ScalarSize = CVT.getSizeInBits(); + + if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || + (Is128 && (ScalarSize == 32))) { + + const Constant *C = 0; + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) + C = CI->getConstantIntValue(); + else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) + C = CF->getConstantFPValue(); + + assert(C && "Invalid constant type"); + + SDValue CP = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + } + + // The scalar source must be a normal load. + if (!ISD::isNormalLoad(Ld.getNode())) + return SDValue(); + + // Reject loads that have uses of the chain result + if (Ld->hasAnyUseOfValue(1)) + return SDValue(); + + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + + // VBroadcast to YMM + if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 32)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // The integer check is needed for the 64-bit into 128-bit so it doesn't match + // double since there is vbroadcastsd xmm + if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { + // VBroadcast to YMM + if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + + // Unsupported broadcast. + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -5053,22 +5001,26 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (Op.getValueType() == MVT::v4i32 || - Op.getValueType() == MVT::v8i32) + if (VT == MVT::v4i32 || VT == MVT::v8i32) return Op; - return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl); + return getZeroVector(VT, Subtarget, DAG, dl); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width - // vectors or broken into v4i32 operations on 256-bit vectors. + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use + // vpcmpeqd on 256-bit vectors. if (ISD::isBuildVectorAllOnes(Op.getNode())) { - if (Op.getValueType() == MVT::v4i32) + if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2())) return Op; - return getOnesVector(Op.getValueType(), DAG, dl); + return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); } + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; + unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -5118,8 +5070,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). 
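The eligibility rules spread through LowerVectorBroadcast above can be distilled into a single predicate. A hypothetical summary, assuming the scalar already passed the load and use-count checks (illustration only):

#include <cstdio>

static bool canUseVBroadcast(bool HasAVX, bool HasAVX2, bool Is256,
                             unsigned ScalarSize, bool IsInteger) {
  if (!HasAVX)
    return false;
  // vbroadcastss/vbroadcastsd cover 32/64-bit into ymm and 32-bit into xmm.
  if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
    return true;
  if (!Is256 && ScalarSize == 32)
    return true;
  // AVX2 extends this for integer scalars (vpbroadcastb/w/q territory).
  if (HasAVX2 && IsInteger) {
    if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
      return true;
    if (!Is256 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
      return true;
  }
  return false;
}

int main() {
  // i64 splat into a 128-bit vector: accepted only with AVX2.
  printf("%d\n", canUseVBroadcast(true, false, false, 64, true)); // 0
  printf("%d\n", canUseVBroadcast(true, true, false, 64, true));  // 1
}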
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasXMMInt(), DAG); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of // a vector. If Idx != 0, swizzle it into place. @@ -5132,7 +5083,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(Item.getValueType()), &Mask[0]); } - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); + return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5141,21 +5092,33 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { - if (NumZero == 0) { + if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || + + if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + if (VT.getSizeInBits() == 256) { + SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, + Item, DAG.getIntPtrConstant(0)); + } + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. - return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(), - DAG); - } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + + if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); - EVT MiddleVT = MVT::v4i32; - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, - Subtarget->hasXMMInt(), DAG); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + if (VT.getSizeInBits() == 256) { + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32), + DAG, dl); + } else { + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -5183,8 +5146,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a shuffle of zero and zero-extended scalar to vector. - Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, - Subtarget->hasXMMInt(), DAG); + Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; for (unsigned i = 0; i < NumElems; i++) MaskVec.push_back(i == Idx ? 0 : 1); @@ -5214,9 +5176,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. 
- if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { + if (VT.getSizeInBits() == 256) { SmallVector<SDValue, 32> V; - for (unsigned i = 0; i < NumElems; ++i) + for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); @@ -5240,8 +5202,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Idx = CountTrailingZeros_32(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); - return getShuffleVectorZeroOrUndef(V2, Idx, true, - Subtarget->hasXMMInt(), DAG); + return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } @@ -5249,24 +5210,23 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) { SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + Subtarget, *this); if (V.getNode()) return V; } if (EVTBits == 16 && NumElems == 8) { SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - *this); + Subtarget, *this); if (V.getNode()) return V; } // If element VT is == 32 bits, turn it into a number of shuffles. - SmallVector<SDValue, 8> V; - V.resize(NumElems); + SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1 << i)); if (isZero) - V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + V[i] = getZeroVector(VT, Subtarget, DAG, dl); else V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } @@ -5289,13 +5249,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } } - SmallVector<int, 8> MaskVec; - bool Reverse = (NonZeros & 0x3) == 2; - for (unsigned i = 0; i < 2; ++i) - MaskVec.push_back(Reverse ? 1-i : i); - Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; - for (unsigned i = 0; i < 2; ++i) - MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); + bool Reverse1 = (NonZeros & 0x3) == 2; + bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; + int MaskVec[] = { + Reverse1 ? 1 : 0, + Reverse1 ? 0 : 1, + static_cast<int>(Reverse2 ? NumElems+1 : NumElems), + static_cast<int>(Reverse2 ? NumElems : NumElems+1) + }; return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } @@ -5310,7 +5271,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return LD; // For SSE 4.1, use insertps to put the high elements into the low element. - if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) { + if (getSubtarget()->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -5422,6 +5383,85 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return LowerAVXCONCAT_VECTORS(Op, DAG); } +// Try to lower a shuffle node into a simple blend instruction. 
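The blend lowering that follows recognizes masks where every lane keeps its position and merely chooses between the two inputs, which is exactly what the SSE4.1 blend family encodes in an immediate. Reference semantics of the underlying instruction, where a set immediate bit selects the second source (shown via the intrinsic, not the X86ISD node convention):

```cpp
#include <smmintrin.h>

__m128 blend_example(__m128 a, __m128 b) {
  // imm = 0b0101: lanes 0 and 2 come from b, lanes 1 and 3 from a.
  return _mm_blend_ps(a, b, 0x5);       // { b0, a1, b2, a3 }
}
```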
+static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + EVT VT = Op.getValueType(); + EVT InVT = V1.getValueType(); + int MaskSize = VT.getVectorNumElements(); + int InSize = InVT.getVectorNumElements(); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (MaskSize != InSize) + return SDValue(); + + int ISDNo = 0; + MVT OpTy; + + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v8i16: + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v8i16; + break; + case MVT::v4i32: + case MVT::v4f32: + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v4f32; + break; + case MVT::v2i64: + case MVT::v2f64: + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v2f64; + break; + case MVT::v8i32: + case MVT::v8f32: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v8f32; + break; + case MVT::v4i64: + case MVT::v4f64: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v4f64; + break; + case MVT::v16i16: + if (!Subtarget->hasAVX2()) + return SDValue(); + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v16i16; + break; + } + assert(ISDNo && "Invalid Op Number"); + + unsigned MaskVals = 0; + + for (int i = 0; i < MaskSize; ++i) { + int EltIdx = SVOp->getMaskElt(i); + if (EltIdx == i || EltIdx == -1) + MaskVals |= (1<<i); + else if (EltIdx == (i + MaskSize)) + continue; // Bit is set to zero; + else return SDValue(); + } + + V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2); + SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2, + DAG.getConstant(MaskVals, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + // v8i16 shuffles - Prefer shuffles in the following order: // 1. [all] pshuflw, pshufhw, optional move // 2. [ssse3] 1 x pshufb @@ -5439,11 +5479,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // Determine if more than 1 of the words in each of the low and high quadwords // of the result come from the same quadword of one of the two inputs. Undef // mask values count as coming from any quadword, for better codegen. - SmallVector<unsigned, 4> LoQuad(4); - SmallVector<unsigned, 4> HiQuad(4); - BitVector InputQuads(4); + unsigned LoQuad[] = { 0, 0, 0, 0 }; + unsigned HiQuad[] = { 0, 0, 0, 0 }; + std::bitset<4> InputQuads; for (unsigned i = 0; i < 8; ++i) { - SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; + unsigned *Quad = i < 4 ? LoQuad : HiQuad; int EltIdx = SVOp->getMaskElt(i); MaskVals.push_back(EltIdx); if (EltIdx < 0) { @@ -5481,10 +5521,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3()) { if (InputQuads.count() == 2 && V1Used && V2Used) { - BestLoQuad = InputQuads.find_first(); - BestHiQuad = InputQuads.find_next(BestLoQuad); + BestLoQuad = InputQuads[0] ? 0 : 1; + BestHiQuad = InputQuads[2] ? 2 : 3; } if (InputQuads.count() > 2) { BestLoQuad = -1; @@ -5497,9 +5537,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // words from all 4 input quadwords. SDValue NewV; if (BestLoQuad >= 0 || BestHiQuad >= 0) { - SmallVector<int, 8> MaskV; - MaskV.push_back(BestLoQuad < 0 ? 
0 : BestLoQuad); - MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); + int MaskV[] = { + BestLoQuad < 0 ? 0 : BestLoQuad, + BestHiQuad < 0 ? 1 : BestHiQuad + }; NewV = DAG.getVectorShuffle(MVT::v2i64, dl, DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); @@ -5544,8 +5585,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, unsigned TargetMask = 0; NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); - TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): - X86::getShufflePSHUFLWImmediate(NewV.getNode()); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); + TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): + getShufflePSHUFLWImmediate(SVOp); V1 = NewV.getOperand(0); return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); } @@ -5554,7 +5596,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -5602,61 +5644,51 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, // and update MaskVals with new element order. - BitVector InOrder(8); + std::bitset<8> InOrder; if (BestLoQuad >= 0) { - SmallVector<int, 8> MaskV; + int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; for (int i = 0; i != 4; ++i) { int idx = MaskVals[i]; if (idx < 0) { - MaskV.push_back(-1); InOrder.set(i); } else if ((idx / 4) == BestLoQuad) { - MaskV.push_back(idx & 3); + MaskV[i] = idx & 3; InOrder.set(i); - } else { - MaskV.push_back(-1); } } - for (unsigned i = 4; i != 8; ++i) - MaskV.push_back(i); NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, - NewV.getOperand(0), - X86::getShufflePSHUFLWImmediate(NewV.getNode()), - DAG); + NewV.getOperand(0), + getShufflePSHUFLWImmediate(SVOp), DAG); + } } // If BestHi >= 0, generate a pshufhw to put the high elements in order, // and update MaskVals with the new element order. 
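The getShufflePSHUF{H,L}WImmediate calls above pack four 2-bit word selectors into a single byte; the other quadword passes through untouched. A sketch of the packing (not the actual helpers; undef lanes are treated as "keep position" here, which is one legal choice):

```cpp
// Pack a 4-element word mask into a PSHUFLW-style immediate; PSHUFHW is the
// mirror image for words 4-7.
static unsigned packPSHUFImm(const int M[4]) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i)
    Imm |= (M[i] < 0 ? i : (M[i] & 3)) << (2 * i);
  return Imm;                           // e.g. {1,0,3,2} -> 0xB1
}
```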
if (BestHiQuad >= 0) { - SmallVector<int, 8> MaskV; - for (unsigned i = 0; i != 4; ++i) - MaskV.push_back(i); + int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; for (unsigned i = 4; i != 8; ++i) { int idx = MaskVals[i]; if (idx < 0) { - MaskV.push_back(-1); InOrder.set(i); } else if ((idx / 4) == BestHiQuad) { - MaskV.push_back((idx & 3) + 4); + MaskV[i] = (idx & 3) + 4; InOrder.set(i); - } else { - MaskV.push_back(-1); } } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, - NewV.getOperand(0), - X86::getShufflePSHUFHWImmediate(NewV.getNode()), - DAG); + NewV.getOperand(0), + getShufflePSHUFHWImmediate(SVOp), DAG); + } } // In case BestHi & BestLo were both -1, which means each quadword has a word @@ -5698,8 +5730,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - SmallVector<int, 16> MaskVals; - SVOp->getMask(MaskVals); + ArrayRef<int> MaskVals = SVOp->getMask(); // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is @@ -5718,7 +5749,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. - if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) { + if (TLI.getSubtarget()->hasSSSE3()) { SmallVector<SDValue,16> pshufbMask; // If all result elements are from one input vector, then only translate @@ -5849,7 +5880,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, unsigned NewWidth = (NumElems == 4) ? 2 : 4; EVT NewVT; switch (VT.getSimpleVT().SimpleTy) { - default: assert(false && "Unexpected!"); + default: llvm_unreachable("Unexpected!"); case MVT::v4f32: NewVT = MVT::v2f64; break; case MVT::v4i32: NewVT = MVT::v2i64; break; case MVT::v8i16: NewVT = MVT::v4i32; break; @@ -5915,96 +5946,89 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, OpVT, SrcOp))); } -/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector -/// shuffle node referes to only one lane in the sources. -static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - int HalfSize = NumElems/2; - SmallVector<int, 16> M; - SVOp->getMask(M); - bool MatchA = false, MatchB = false; - - for (int l = 0; l < NumElems*2; l += HalfSize) { - if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { - MatchA = true; - break; - } - } - - for (int l = 0; l < NumElems*2; l += HalfSize) { - if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { - MatchB = true; - break; - } - } - - return MatchA && MatchB; -} - /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles /// which could not be matched by any known target speficic shuffle static SDValue LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - if (areShuffleHalvesWithinDisjointLanes(SVOp)) { - // If each half of a vector shuffle node referes to only one lane in the - // source vectors, extract each used 128-bit lane and shuffle them using - // 128-bit shuffles. 
Then, concatenate the results. Otherwise leave - // the work to the legalizer. - DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); - int NumElems = VT.getVectorNumElements(); - int HalfSize = NumElems/2; - - // Extract the reference for each half - int FstVecExtractIdx = 0, SndVecExtractIdx = 0; - int FstVecOpNum = 0, SndVecOpNum = 0; - for (int i = 0; i < HalfSize; ++i) { - int Elt = SVOp->getMaskElt(i); - if (SVOp->getMaskElt(i) < 0) - continue; - FstVecOpNum = Elt/NumElems; - FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; - break; - } - for (int i = HalfSize; i < NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (SVOp->getMaskElt(i) < 0) + EVT VT = SVOp->getValueType(0); + + unsigned NumElems = VT.getVectorNumElements(); + unsigned NumLaneElems = NumElems / 2; + + DebugLoc dl = SVOp->getDebugLoc(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); + SDValue Shufs[2]; + + SmallVector<int, 16> Mask; + for (unsigned l = 0; l < 2; ++l) { + // Build a shuffle mask for the output, discovering on the fly which + // input vectors to use as shuffle operands (recorded in InputUsed). + // If building a suitable shuffle vector proves too hard, then bail + // out with useBuildVector set. + int InputUsed[2] = { -1, -1 }; // Not yet discovered. + unsigned LaneStart = l * NumLaneElems; + for (unsigned i = 0; i != NumLaneElems; ++i) { + // The mask element. This indexes into the input. + int Idx = SVOp->getMaskElt(i+LaneStart); + if (Idx < 0) { + // the mask element does not index into any input vector. + Mask.push_back(-1); continue; - SndVecOpNum = Elt/NumElems; - SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; - break; - } + } - // Extract the subvectors - SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), - DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); - SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), - DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); + // The input vector this mask element indexes into. + int Input = Idx / NumLaneElems; - // Generate 128-bit shuffles - SmallVector<int, 16> MaskV1, MaskV2; - for (int i = 0; i < HalfSize; ++i) { - int Elt = SVOp->getMaskElt(i); - MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize); - } - for (int i = HalfSize; i < NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); + // Turn the index into an offset from the start of the input vector. + Idx -= Input * NumLaneElems; + + // Find or create a shuffle vector operand to hold this input. + unsigned OpNo; + for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { + if (InputUsed[OpNo] == Input) + // This input vector is already an operand. + break; + if (InputUsed[OpNo] < 0) { + // Create a new operand for this input vector. + InputUsed[OpNo] = Input; + break; + } + } + + if (OpNo >= array_lengthof(InputUsed)) { + // More than two input vectors used! Give up. + return SDValue(); + } + + // Add the mask index for the new shuffle vector. + Mask.push_back(Idx + OpNo * NumLaneElems); } - EVT NVT = V1.getValueType(); - V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); - V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); + if (InputUsed[0] < 0) { + // No input vectors were used! The result is undefined. 
+ Shufs[l] = DAG.getUNDEF(NVT); + } else { + SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), + DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // If only one input was used, use an undefined vector for the other. + SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : + Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), + DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // At least one input vector was used. Create a new shuffle vector. + Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); + } - // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, - DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), - DAG, dl); + Mask.clear(); } - return SDValue(); + // Concatenate the result back + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0], + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32), + DAG, dl); } /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with @@ -6018,11 +6042,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); - SmallVector<std::pair<int, int>, 8> Locs; - Locs.resize(4); - SmallVector<int, 8> Mask1(4U, -1); - SmallVector<int, 8> PermMask; - SVOp->getMask(PermMask); + std::pair<int, int> Locs[4]; + int Mask1[] = { -1, -1, -1, -1 }; + SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end()); unsigned NumHi = 0; unsigned NumLo = 0; @@ -6052,17 +6074,14 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // vector operands, put the elements into the right order. V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); - SmallVector<int, 8> Mask2(4U, -1); + int Mask2[] = { -1, -1, -1, -1 }; - for (unsigned i = 0; i != 4; ++i) { - if (Locs[i].first == -1) - continue; - else { + for (unsigned i = 0; i != 4; ++i) + if (Locs[i].first != -1) { unsigned Idx = (i < 2) ? 0 : 4; Idx += Locs[i].first * 2 + Locs[i].second; Mask2[i] = Idx; } - } return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); } else if (NumLo == 3 || NumHi == 3) { @@ -6075,7 +6094,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // from X. if (NumHi == 3) { // Normalize it so the 3 elements come from V1. - CommuteVectorShuffleMask(PermMask, VT); + CommuteVectorShuffleMask(PermMask, 4); std::swap(V1, V2); } @@ -6115,18 +6134,16 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } // Break it into (shuffle shuffle_hi, shuffle_lo). 
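The Insert128BitVector pair above is the DAG spelling of rebuilding a YMM value from two XMM halves, the first insert landing in an UNDEF vector. In AVX intrinsics (float lanes assumed) the same concatenation looks like:

```cpp
#include <immintrin.h>

__m256 concat128(__m128 lo, __m128 hi) {
  __m256 v = _mm256_castps128_ps256(lo);  // low lane defined, high lane undef
  return _mm256_insertf128_ps(v, hi, 1);  // vinsertf128 $1 fills the high lane
}
```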
- Locs.clear(); - Locs.resize(4); - SmallVector<int,8> LoMask(4U, -1); - SmallVector<int,8> HiMask(4U, -1); + int LoMask[] = { -1, -1, -1, -1 }; + int HiMask[] = { -1, -1, -1, -1 }; - SmallVector<int,8> *MaskPtr = &LoMask; + int *MaskPtr = LoMask; unsigned MaskIdx = 0; unsigned LoIdx = 0; unsigned HiIdx = 2; for (unsigned i = 0; i != 4; ++i) { if (i == 2) { - MaskPtr = &HiMask; + MaskPtr = HiMask; MaskIdx = 1; LoIdx = 0; HiIdx = 2; @@ -6136,26 +6153,21 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { Locs[i] = std::make_pair(-1, -1); } else if (Idx < 4) { Locs[i] = std::make_pair(MaskIdx, LoIdx); - (*MaskPtr)[LoIdx] = Idx; + MaskPtr[LoIdx] = Idx; LoIdx++; } else { Locs[i] = std::make_pair(MaskIdx, HiIdx); - (*MaskPtr)[HiIdx] = Idx; + MaskPtr[HiIdx] = Idx; HiIdx++; } } SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); - SmallVector<int, 8> MaskOps; - for (unsigned i = 0; i != 4; ++i) { - if (Locs[i].first == -1) { - MaskOps.push_back(-1); - } else { - unsigned Idx = Locs[i].first * 4 + Locs[i].second; - MaskOps.push_back(Idx); - } - } + int MaskOps[] = { -1, -1, -1, -1 }; + for (unsigned i = 0; i != 4; ++i) + if (Locs[i].first != -1) + MaskOps[i] = Locs[i].first * 4 + Locs[i].second; return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); } @@ -6164,6 +6176,10 @@ static bool MayFoldVectorLoad(SDValue V) { V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && + V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) + // BUILD_VECTOR (load), undef + V = V.getOperand(0); if (MayFoldLoad(V)) return true; return false; @@ -6186,82 +6202,6 @@ static bool RelaxedMayFoldVectorLoad(SDValue V) { return false; } -/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by -/// a vector extract, and if both can be later optimized into a single load. -/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked -/// here because otherwise a target specific shuffle node is going to be -/// emitted for this shuffle, and the optimization not done. -/// FIXME: This is probably not the best approach, but fix the problem -/// until the right path is decided. -static -bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, - const TargetLowering &TLI) { - EVT VT = V.getValueType(); - ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); - - // Be sure that the vector shuffle is present in a pattern like this: - // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) - if (!V.hasOneUse()) - return false; - - SDNode *N = *V.getNode()->use_begin(); - if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return false; - - SDValue EltNo = N->getOperand(1); - if (!isa<ConstantSDNode>(EltNo)) - return false; - - // If the bit convert changed the number of elements, it is unsafe - // to examine the mask. - bool HasShuffleIntoBitcast = false; - if (V.getOpcode() == ISD::BITCAST) { - EVT SrcVT = V.getOperand(0).getValueType(); - if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) - return false; - V = V.getOperand(0); - HasShuffleIntoBitcast = true; - } - - // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = VT.getVectorNumElements(); - unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > NumElems) ? 
-1 : SVOp->getMaskElt(Elt); - V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); - - // Skip one more bit_convert if necessary - if (V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (ISD::isNormalLoad(V.getNode())) { - // Is the original load suitable? - LoadSDNode *LN0 = cast<LoadSDNode>(V); - - // FIXME: avoid the multi-use bug that is preventing lots of - // of foldings to be detected, this is still wrong of course, but - // give the temporary desired behavior, and if it happens that - // the load has real more uses, during isel it will not fold, and - // will generate poor code. - if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() - return false; - - if (!HasShuffleIntoBitcast) - return true; - - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. - unsigned Align = LN0->getAlignment(); - unsigned NewAlign = - TLI.getTargetData()->getABITypeAlignment( - VT.getTypeForEVT(*DAG.getContext())); - - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) - return false; - } - - return true; -} - static SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { EVT VT = Op.getValueType(); @@ -6275,14 +6215,14 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { static SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, - bool HasXMMInt) { + bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); assert(VT != MVT::v2i64 && "unsupported shuffle type"); - if (HasXMMInt && VT == MVT::v2f64) + if (HasSSE2 && VT == MVT::v2f64) return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1) @@ -6308,24 +6248,8 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); } -static inline unsigned getSHUFPOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: - case MVT::v4i32: // Use fp unit for int unpack. - case MVT::v4f32: return X86ISD::SHUFPS; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: - case MVT::v2i64: // Use fp unit for int unpack. - case MVT::v2f64: return X86ISD::SHUFPD; - default: - llvm_unreachable("Unknown type for shufp*"); - } - return 0; -} - static -SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { +SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); EVT VT = Op.getValueType(); @@ -6346,32 +6270,30 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { // turns into: // (MOVLPSmr addr:$src1, VR128:$src2) // So, recognize this potential and also use MOVLPS or MOVLPD - if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) + else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) CanFoldLoad = true; - // Both of them can't be memory operations though. - if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) - CanFoldLoad = false; - + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); if (CanFoldLoad) { - if (HasXMMInt && NumElems == 2) + if (HasSSE2 && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); if (NumElems == 4) - return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); + // If we don't care about the second element, proceed to use movss.
+ if (SVOp->getMaskElt(1) != -1) + return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); } - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); // movl and movlp will both match v2i64, but v2i64 is never matched by // movl earlier because we make it strict to avoid messing with the movlp load // folding logic (see the code above getMOVLP call). Match it here then, // this is horrible, but will stay like this until we move all shuffle // matching to x86 specific nodes. Note that for the 1st condition all // types are matched with movsd. - if (HasXMMInt) { + if (HasSSE2) { // FIXME: isMOVLMask should be checked and matched before getMOVLP, // as to remove this logic from here, as much as possible - if (NumElems == 2 || !X86::isMOVLMask(SVOp)) + if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); } @@ -6379,112 +6301,12 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { assert(VT != MVT::v4i32 && "unsupported shuffle type"); // Invert the operand order and use SHUFPS to match it. - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1, - X86::getShuffleSHUFImmediate(SVOp), DAG); -} - -static inline unsigned getUNPCKLOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKLDQ; - case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v2f64: return X86ISD::UNPCKLPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKLPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKLPDY; - case MVT::v16i8: return X86ISD::PUNPCKLBW; - case MVT::v8i16: return X86ISD::PUNPCKLWD; - default: - llvm_unreachable("Unknown type for unpckl"); - } - return 0; -} - -static inline unsigned getUNPCKHOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: return X86ISD::PUNPCKHDQ; - case MVT::v2i64: return X86ISD::PUNPCKHQDQ; - case MVT::v4f32: return X86ISD::UNPCKHPS; - case MVT::v2f64: return X86ISD::UNPCKHPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKHPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKHPDY; - case MVT::v16i8: return X86ISD::PUNPCKHBW; - case MVT::v8i16: return X86ISD::PUNPCKHWD; - default: - llvm_unreachable("Unknown type for unpckh"); - } - return 0; -} - -static inline unsigned getVPERMILOpcode(EVT VT) { - switch(VT.getSimpleVT().SimpleTy) { - case MVT::v4i32: - case MVT::v4f32: return X86ISD::VPERMILPS; - case MVT::v2i64: - case MVT::v2f64: return X86ISD::VPERMILPD; - case MVT::v8i32: - case MVT::v8f32: return X86ISD::VPERMILPSY; - case MVT::v4i64: - case MVT::v4f64: return X86ISD::VPERMILPDY; - default: - llvm_unreachable("Unknown type for vpermil"); - } - return 0; -} - -/// isVectorBroadcast - Check if the node chain is suitable to be xformed to -/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming -/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. 
-static bool isVectorBroadcast(SDValue &Op) { - EVT VT = Op.getValueType(); - bool Is256 = VT.getSizeInBits() == 256; - - assert((VT.getSizeInBits() == 128 || Is256) && - "Unsupported type for vbroadcast node"); - - SDValue V = Op; - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (Is256 && !(V.hasOneUse() && - V.getOpcode() == ISD::INSERT_SUBVECTOR && - V.getOperand(0).getOpcode() == ISD::UNDEF)) - return false; - - if (Is256) - V = V.getOperand(1); - - if (!V.hasOneUse()) - return false; - - // Check the source scalar_to_vector type. 256-bit broadcasts are - // supported for 32/64-bit sizes, while 128-bit ones are only supported - // for 32-bit scalars. - if (V.getOpcode() != ISD::SCALAR_TO_VECTOR) - return false; - - unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); - if (ScalarSize != 32 && ScalarSize != 64) - return false; - if (!Is256 && ScalarSize == 64) - return false; - - V = V.getOperand(0); - if (!MayFoldLoad(V)) - return false; - - // Return the load node - Op = V; - return true; + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1, + getShuffleSHUFImmediate(SVOp), DAG); } -static -SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, - const TargetLowering &TLI, - const X86Subtarget *Subtarget) { +SDValue +X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); @@ -6492,22 +6314,17 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, SDValue V2 = Op.getOperand(1); if (isZeroShuffle(SVOp)) - return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + return getZeroVector(VT, Subtarget, DAG, dl); // Handle splat operations if (SVOp->isSplat()) { unsigned NumElem = VT.getVectorNumElements(); int Size = VT.getSizeInBits(); - // Special case, this is the only place now where it's allowed to return - // a vector_shuffle operation without using a target specific node, because - // *hopefully* it will be optimized away by the dag combiner. FIXME: should - // this be moved to DAGCombine instead? - if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) - return Op; // Use vbroadcast whenever the splat comes from a foldable load - if (Subtarget->hasAVX() && isVectorBroadcast(V1)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1); + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || @@ -6525,21 +6342,26 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || - (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) { + (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
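"Use movq to zero out the top part" refers to the register-to-register form of MOVQ, which copies the low 64 bits and clears the high 64; that is the vzext-movl pattern the narrowed shuffle is matched against. In intrinsics:

```cpp
#include <emmintrin.h>

__m128i vzext_movl(__m128i v) {
  return _mm_move_epi64(v);             // movq: { v.lo64, 0 }
}
```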
if (ISD::isBuildVectorAllZeros(V2.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) { - if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) - return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), + EVT NewVT = NewOp.getValueType(); + if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), + NewVT, true, false)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); - if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) - return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), - DAG, Subtarget, dl); + if (NewOp.getNode()) { + EVT NewVT = NewOp.getValueType(); + if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) + return getVZextMovL(VT, NewVT, NewOp.getOperand(1), + DAG, Subtarget, dl); + } } } return SDValue(); @@ -6553,18 +6375,22 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); - bool isMMX = VT.getSizeInBits() == 64; bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; bool V1IsSplat = false; bool V2IsSplat = false; - bool HasXMMInt = Subtarget->hasXMMInt(); + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasAVX = Subtarget->hasAVX(); + bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); - // Shuffle operations on MMX not supported. - if (isMMX) - return Op; + assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + assert(!V1IsUndef && "Op 1 of shuffle should not be undef"); // Vector shuffle lowering takes 3 steps: // @@ -6576,50 +6402,54 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // so the shuffle can be broken into other shuffles and the legalizer can // try the lowering again. // - // The general ideia is that no vector_shuffle operation should be left to + // The general idea is that no vector_shuffle operation should be left to // be matched during isel, all of them must be converted to a target specific // node here. // Normalize the input vectors. Here splats, zeroed vectors, profitable // narrowing and commutation of operands should be handled. The actual code // doesn't include all of those, work in progress... - SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); + SDValue NewOp = NormalizeVectorShuffle(Op, DAG); if (NewOp.getNode()) return NewOp; + SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end()); + // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. 
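The OptForSize preference in the note exists because the unpack encodings carry no immediate byte, unlike pshufd. Unpacking a register with itself duplicates one half's elements, which is why the unpck*_v_undef masks also serve as splat-like shuffles (sketch):

```cpp
#include <xmmintrin.h>

__m128 unpckl_self(__m128 v) {
  return _mm_unpacklo_ps(v, v);         // { v0, v0, v1, v1 }
}
```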
- if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); - if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isMOVDDUPMask(SVOp) && - (Subtarget->hasSSE3() || Subtarget->hasAVX()) && + if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); - if (X86::isMOVHLPS_v_undef_Mask(SVOp)) + if (isMOVHLPS_v_undef_Mask(M, VT)) return getMOVHighToLow(Op, dl, DAG); // Use to match splats - if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef && + if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); - if (X86::isPSHUFDMask(SVOp)) { + if (isPSHUFDMask(M, VT)) { // The actual implementation will match the mask in the if above and then // during isel it can match several different instructions, not only pshufd // as its name says, sad but true, emulate the behavior for now... - if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) - return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) + return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + + unsigned TargetMask = getShuffleSHUFImmediate(SVOp); - unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG); - if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32)) + if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1, + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, TargetMask, DAG); } @@ -6627,8 +6457,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool isLeft = false; unsigned ShAmt = 0; SDValue ShVal; - bool isShift = getSubtarget()->hasXMMInt() && - isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. 
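A shuffle that slides elements toward one end and feeds zeros in behind them is what isVectorShift recognizes; it becomes a whole-register byte shift. PSLLDQ/PSRLDQ count in bytes, which is why the element shift amount gets scaled by the element width before emitting the node. Illustratively:

```cpp
#include <emmintrin.h>

__m128i slide_one_i32(__m128i v) {
  return _mm_slli_si128(v, 4);          // pslldq $4: { 0, v0, v1, v2 }
}
```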
@@ -6637,13 +6466,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } - if (X86::isMOVLMask(SVOp)) { - if (V1IsUndef) - return V2; + if (isMOVLMask(M, VT)) { if (ISD::isBuildVectorAllZeros(V1.getNode())) return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); - if (!X86::isMOVLPMask(SVOp)) { - if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64)) + if (!isMOVLPMask(M, VT)) { + if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); if (VT == MVT::v4i32 || VT == MVT::v4f32) @@ -6652,27 +6479,27 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // FIXME: fold these into legal mask. - if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) - return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); + if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2)) + return getMOVLowToHigh(Op, dl, DAG, HasSSE2); - if (X86::isMOVHLPSMask(SVOp)) + if (isMOVHLPSMask(M, VT)) return getMOVHighToLow(Op, dl, DAG); - if (X86::isMOVSHDUPMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); - if (X86::isMOVSLDUPMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); - if (X86::isMOVLPMask(SVOp)) - return getMOVLP(Op, dl, DAG, HasXMMInt); + if (isMOVLPMask(M, VT)) + return getMOVLP(Op, dl, DAG, HasSSE2); - if (ShouldXformToMOVHLPS(SVOp) || - ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) + if (ShouldXformToMOVHLPS(M, VT) || + ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT)) return CommuteVectorShuffle(SVOp, DAG); if (isShift) { - // No better options. Use a vshl / vsrl. + // No better options. Use a vshldq / vsrldq. EVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); @@ -6685,17 +6512,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { V2IsSplat = isSplatVector(V2.getNode()); // Canonicalize the splat or undef, if present, to be on the RHS. - if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { - Op = CommuteVectorShuffle(SVOp, DAG); - SVOp = cast<ShuffleVectorSDNode>(Op); - V1 = SVOp->getOperand(0); - V2 = SVOp->getOperand(1); + if (!V2IsUndef && V1IsSplat && !V2IsSplat) { + CommuteVectorShuffleMask(M, NumElems); + std::swap(V1, V2); std::swap(V1IsSplat, V2IsSplat); - std::swap(V1IsUndef, V2IsUndef); Commuted = true; } - if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { + if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) { // Shuffling low element of v1 into undef, just return v1. if (V2IsUndef) return V1; @@ -6705,81 +6529,77 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKLMask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first // element then try to match unpck{h|l} again. 
If match, return a - // new vector_shuffle with the corrected mask. - SDValue NewMask = NormalizeMask(SVOp, DAG); - ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); - if (NSVOp != SVOp) { - if (X86::isUNPCKLMask(NSVOp, true)) { - return NewMask; - } else if (X86::isUNPCKHMask(NSVOp, true)) { - return NewMask; - } + // new vector_shuffle with the corrected mask. + SmallVector<int, 8> NewMask(M.begin(), M.end()); + NormalizeMask(NewMask, NumElems); + if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) { + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); + } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) { + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } } if (Commuted) { // Commute it back and try unpck* again. // FIXME: this seems wrong. - SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); - ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); + CommuteVectorShuffleMask(M, NumElems); + std::swap(V1, V2); + std::swap(V1IsSplat, V2IsSplat); + Commuted = false; - if (X86::isUNPCKLMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + if (isUNPCKLMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG); - if (X86::isUNPCKHMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); + if (isUNPCKHMask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG); } // Normalize the node to match x86 shuffle ops if needed - if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) + if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true))) return CommuteVectorShuffle(SVOp, DAG); // The checks below are all present in isShuffleMaskLegal, but they are // inlined here right now to enable us to directly emit target specific // nodes, and remove one by one until they don't return Op anymore.
- SmallVector<int, 16> M; - SVOp->getMask(M); - if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (isPALIGNRMask(M, VT, Subtarget)) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, - X86::getShufflePALIGNRImmediate(SVOp), + getShufflePALIGNRImmediate(SVOp), DAG); if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); - if (VT == MVT::v2i64) - return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); } if (isPSHUFHWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, - X86::getShufflePSHUFHWImmediate(SVOp), + getShufflePSHUFHWImmediate(SVOp), DAG); if (isPSHUFLWMask(M, VT)) return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, - X86::getShufflePSHUFLWImmediate(SVOp), + getShufflePSHUFLWImmediate(SVOp), DAG); - if (isSHUFPMask(M, VT)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - X86::getShuffleSHUFImmediate(SVOp), DAG); + if (isSHUFPMask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2, + getShuffleSHUFImmediate(SVOp), DAG); - if (X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); - if (X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG); + if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2)) + return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG); //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only @@ -6787,33 +6607,26 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // // Handle VMOVDDUPY permutations - if (isMOVDDUPYMask(SVOp, Subtarget)) + if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX)) return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); - // Handle VPERMILPS* permutations - if (isVPERMILPSMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPSImmediate(SVOp), DAG); - - // Handle VPERMILPD* permutations - if (isVPERMILPDMask(M, VT, Subtarget)) - return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, - getShuffleVPERMILPDImmediate(SVOp), DAG); - - // Handle VPERM2F128 permutations - if (isVPERM2F128Mask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, - getShuffleVPERM2F128Immediate(SVOp), DAG); + // Handle VPERMILPS/D* permutations + if (isVPERMILPMask(M, VT, HasAVX)) { + if (HasAVX2 && VT == MVT::v8i32) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); + return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + getShuffleSHUFImmediate(SVOp), DAG); + } - // Handle VSHUFPSY permutations - if (isVSHUFPSYMask(M, VT, Subtarget)) - return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPSYImmediate(SVOp), DAG); + // Handle VPERM2F128/VPERM2I128 permutations + if (isVPERM2X128Mask(M, VT, HasAVX)) + return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, + V2, getShuffleVPERM2X128Immediate(SVOp), DAG); - // Handle VSHUFPDY permutations - if (isVSHUFPDYMask(M, VT, Subtarget)) - return 
getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, - getShuffleVSHUFPDYImmediate(SVOp), DAG); + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, @@ -6896,8 +6709,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1)); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); - } else if (VT == MVT::i32) { - // ExtractPS works with constant index. + } else if (VT == MVT::i32 || VT == MVT::i64) { + // ExtractPS/pextrq works with constant index. if (isa<ConstantSDNode>(Op.getOperand(1))) return Op; } @@ -6933,7 +6746,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + if (Subtarget->hasSSE41()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); if (Res.getNode()) return Res; @@ -7036,7 +6849,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { + } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && + isa<ConstantSDNode>(N2)) { // PINSR* works with constant index. return Op; } @@ -7074,7 +6888,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); } - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) + if (Subtarget->hasSSE41()) return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); if (EltVT == MVT::i8) @@ -7276,7 +7090,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); return Result; } @@ -7344,7 +7158,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -7423,7 +7237,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0), - MachinePointerInfo(Ptr), false, false, 0); + MachinePointerInfo(Ptr), + false, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is @@ -7449,7 +7264,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, if (model == TLSModel::InitialExec) Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, 0); + MachinePointerInfo::getGOT(), false, false, false, 0); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. 
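The Windows branch added in the next hunk follows the documented implicit-TLS walk: thread environment block, then the per-module slot array, then a section-relative offset. A scalar model of that address computation (parameter names here are illustrative, not LLVM API):

```cpp
#include <stdint.h>

// tls_array models TEB->ThreadLocalStoragePointer (gs:0x58 on Win64,
// fs:__tls_array on Win32), tls_index the module's _tls_index, and
// secrel_off the variable's offset within the .tls section.
uintptr_t win_tls_address(uintptr_t *tls_array, uint32_t tls_index,
                          uintptr_t secrel_off) {
  uintptr_t module_block = tls_array[tls_index];
  return module_block + secrel_off;
}
```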
@@ -7471,8 +7286,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) GV = GA->resolveAliasedGlobal(false); - TLSModel::Model model - = getTLSModel(GV, getTargetMachine().getRelocationModel()); + TLSModel::Model model = getTargetMachine().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -7529,19 +7343,77 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), Chain.getValue(1)); - } + } else if (Subtarget->isTargetWindows()) { + // Just use the implicit TLS architecture + // Need to generate something similar to: + // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage + // ; from TEB + // mov ecx, dword [rel _tls_index]: Load index (from C runtime) + // mov rcx, qword [rdx+rcx*8] + // mov eax, .tls$:tlsvar + // [rax+rcx] contains the address + // Windows 64bit: gs:0x58 + // Windows 32bit: fs:__tls_array - assert(false && - "TLS not implemented for this target."); + // If GV is an alias then use the aliasee for determining + // thread-localness. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + GV = GA->resolveAliasedGlobal(false); + DebugLoc dl = GA->getDebugLoc(); + SDValue Chain = DAG.getEntryNode(); - llvm_unreachable("Unreachable"); - return SDValue(); + // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or + // %gs:0x58 (64-bit). + Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() + ? Type::getInt8PtrTy(*DAG.getContext(), + 256) + : Type::getInt32PtrTy(*DAG.getContext(), + 257)); + + SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, + Subtarget->is64Bit() + ? DAG.getIntPtrConstant(0x58) + : DAG.getExternalSymbol("_tls_array", + getPointerTy()), + MachinePointerInfo(Ptr), + false, false, false, 0); + + // Load the _tls_index variable + SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy()); + if (Subtarget->is64Bit()) + IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, + IDX, MachinePointerInfo(), MVT::i32, + false, false, 0); + else + IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), + false, false, false, 0); + + SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()), + getPointerTy()); + IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale); + + SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX); + res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(), + false, false, false, 0); + + // Get the offset of start of .tls section + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), X86II::MO_SECREL); + SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA); + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset); + } + + llvm_unreachable("TLS not implemented for this target."); } -/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and -/// take a 2 x i32 value to shift plus a shift amount. -SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values +/// and take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{ assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -7673,7 +7545,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, Op.getValueType(), MMO); Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + false, false, false, 0); } return Result; @@ -7682,85 +7554,65 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const { - // This algorithm is not obvious. Here it is in C code, more or less: + // This algorithm is not obvious. Here it is what we're trying to output: /* - double uint64_to_double( uint32_t hi, uint32_t lo ) { - static const __m128i exp = { 0x4330000045300000ULL, 0 }; - static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; - - // Copy ints to xmm registers. - __m128i xh = _mm_cvtsi32_si128( hi ); - __m128i xl = _mm_cvtsi32_si128( lo ); - - // Combine into low half of a single xmm register. - __m128i x = _mm_unpacklo_epi32( xh, xl ); - __m128d d; - double sd; - - // Merge in appropriate exponents to give the integer bits the right - // magnitude. - x = _mm_unpacklo_epi32( x, exp ); - - // Subtract away the biases to deal with the IEEE-754 double precision - // implicit 1. - d = _mm_sub_pd( (__m128d) x, bias ); - - // All conversions up to here are exact. The correctly rounded result is - // calculated using the current rounding mode using the following - // horizontal add. - d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); - _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this - // store doesn't really need to be here (except - // maybe to zero the other double) - return sd; - } + movq %rax, %xmm0 + punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } + subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } + #ifdef __SSE3__ + haddpd %xmm0, %xmm0 + #else + pshufd $0x4e, %xmm0, %xmm1 + addpd %xmm1, %xmm0 + #endif */ DebugLoc dl = Op.getDebugLoc(); LLVMContext *Context = DAG.getContext(); // Build some magic constants. 
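The assembly sequence sketched in the comment above is the classic exact uint64-to-double conversion. A compilable intrinsics model of the same steps (x86-64 and SSE3 for haddpd assumed):

```cpp
#include <pmmintrin.h>  // SSE3: _mm_hadd_pd
#include <stdint.h>

double u64_to_double(uint64_t u) {
  __m128i x    = _mm_cvtsi64_si128((long long)u);            // movq
  __m128i c0   = _mm_setr_epi32(0x43300000, 0x45300000, 0, 0);
  __m128d bias = _mm_setr_pd(0x1.0p52, 0x1.0p84);
  // punpckldq builds { 2^52 + lo32, 2^84 + hi32 * 2^32 } viewed as doubles.
  __m128d d = _mm_castsi128_pd(_mm_unpacklo_epi32(x, c0));
  d = _mm_sub_pd(d, bias);               // exact: { lo32, hi32 * 2^32 }
  d = _mm_hadd_pd(d, d);                 // the only rounding step
  return _mm_cvtsd_f64(d);
}
```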
- std::vector<Constant*> CV0; - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); - CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); - Constant *C0 = ConstantVector::get(CV0); + const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + Constant *C0 = ConstantDataVector::get(*Context, CV0); SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); - std::vector<Constant*> CV1; + SmallVector<Constant*,2> CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(1))); - SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, - Op.getOperand(0), - DAG.getIntPtrConstant(0))); - SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); + // Load the 64-bit value into an XMM register. + SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(), - false, false, 16); - SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); - SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); + false, false, false, 16); + SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1), + CLod0); + SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); + SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + SDValue Result; + + if (Subtarget->hasSSE3()) { + // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. + Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); + } else { + SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub); + SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, + S2F, 0x4E, DAG); + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle), + Sub); + } - // Add the halves; easiest way is to swap them into another reg first. - int ShufMask[2] = { 1, -1 }; - SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, - DAG.getUNDEF(MVT::v2f64), ShufMask); - SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0)); } @@ -7777,8 +7629,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, Op.getOperand(0)); // Zero out the upper parts of the register. 
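The i32 path that begins above builds, in vector form, the classic bias trick: glue the unsigned value into the mantissa of 2^52, then subtract the bias back out, with every step exact. A scalar model of the idea (not the DAG code):

```cpp
#include <stdint.h>
#include <string.h>

double u32_to_double(uint32_t u) {
  uint64_t bits = 0x4330000000000000ULL | u;  // double with value 2^52 + u
  double d;
  memcpy(&d, &bits, sizeof d);                // the BITCAST, in DAG terms
  return d - 0x1.0p52;                        // exactly (double)u
}
```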
- Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(), - DAG); + Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), @@ -7830,6 +7681,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return LowerUINT_TO_FP_i64(Op, DAG); else if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG); + else if (Subtarget->is64Bit() && + SrcVT == MVT::i64 && DstVT == MVT::f32) + return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); @@ -7849,7 +7703,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo(), + StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in @@ -7897,19 +7751,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } std::pair<SDValue,SDValue> X86TargetLowering:: -FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { +FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { DebugLoc DL = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); - if (!IsSigned) { + if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && - "Unknown FP_TO_SINT to lower!"); + "Unknown FP_TO_INT to lower!"); // These are really Legal. if (DstTy == MVT::i32 && @@ -7920,26 +7774,29 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary - // stack slot. + // We lower FP->int64 either into FISTP64 followed by a load from a temporary + // stack slot, or into the FTOL runtime function. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); - - unsigned Opc; - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + unsigned Opc; + if (!IsSigned && isIntegerTypeFTOL(DstTy)) + Opc = X86ISD::WIN_FTOL; + else + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); EVT TheVT = Op.getOperand(0).getValueType(); + // FIXME This causes a redundant load/store if the SSE-class value is already + // in memory, such as if it is on the callstack.
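
The i32 path above relies on the scalar form of the same exponent trick: OR the 32-bit value into the mantissa of 2^52 and subtract the bias, with every step exact. A minimal sketch (hypothetical helper, not from the patch):

#include <stdint.h>
#include <string.h>

static double uint32_to_double(uint32_t u) {
  uint64_t bits = 0x4330000000000000ULL | u;  /* bit pattern of 2^52 + u   */
  double d;
  memcpy(&d, &bits, sizeof d);                /* the BITCAST in the DAG    */
  return d - 0x1.0p52;                        /* exact, yields (double)u   */
}
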
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, @@ -7964,12 +7821,26 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), MachineMemOperand::MOStore, MemSize, MemSize); - // Build the FP_TO_INT*_IN_MEM - SDValue Ops[] = { Chain, Value, StackSlot }; - SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), - Ops, 3, DstTy, MMO); - - return std::make_pair(FIST, StackSlot); + if (Opc != X86ISD::WIN_FTOL) { + // Build the FP_TO_INT*_IN_MEM + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + Ops, 3, DstTy, MMO); + return std::make_pair(FIST, StackSlot); + } else { + SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, + DAG.getVTList(MVT::Other, MVT::Glue), + Chain, Value); + SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, + MVT::i32, ftol.getValue(1)); + SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, + MVT::i32, eax.getValue(2)); + SDValue Ops[] = { eax, edx }; + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2) + : DAG.getMergeValues(Ops, 2, DL); + return std::make_pair(pair, SDValue()); + } } SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, @@ -7977,25 +7848,37 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, if (Op.getValueType().isVector()) return SDValue(); - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (FIST.getNode() == 0) return Op; - // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), false, false, 0); + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + else + // The node is the result. + return FIST; } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const { - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; assert(FIST.getNode() && "Unexpected failure"); - // Load the result. - return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, MachinePointerInfo(), false, false, 0); + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + else + // The node is the result. 
+ return FIST; } SDValue X86TargetLowering::LowerFABS(SDValue Op, @@ -8006,23 +7889,18 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, EVT EltVT = VT; if (VT.isVector()) EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; + Constant *C; if (EltVT == MVT::f64) { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); - CV.push_back(C); - CV.push_back(C); + C = ConstantVector::getSplat(2, + ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); } else { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + C = ConstantVector::getSplat(4, + ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); } - Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -8031,31 +7909,28 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - std::vector<Constant*> CV; - if (EltVT == MVT::f64) { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); - CV.push_back(C); - CV.push_back(C); - } else { - Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); - CV.push_back(C); + NumElts = VT.getVectorNumElements(); } - Constant *C = ConstantVector::get(CV); + Constant *C; + if (EltVT == MVT::f64) + C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); + else + C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); + C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); if (VT.isVector()) { + MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, MVT::v2i64, - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getNode(ISD::XOR, dl, XORVT, + DAG.getNode(ISD::BITCAST, dl, XORVT, Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); + DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); } else { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } @@ -8084,7 +7959,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // type, and that won't be f80 since that is not custom lowered. // First get the sign bit of second operand. 
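
LowerFABS, LowerFNEG and LowerFCOPYSIGN all reduce to the constant-pool sign-bit masks built here; in scalar bit operations the three lowerings amount to the following (illustrative helpers, not part of the patch):

#include <stdint.h>
#include <string.h>

static double fabs_bits(double x) {      /* FAND with ~(1ULL << 63) */
  uint64_t b; memcpy(&b, &x, 8);
  b &= 0x7fffffffffffffffULL;
  memcpy(&x, &b, 8); return x;
}
static double fneg_bits(double x) {      /* FXOR with (1ULL << 63) */
  uint64_t b; memcpy(&b, &x, 8);
  b ^= 0x8000000000000000ULL;
  memcpy(&x, &b, 8); return x;
}
static double copysign_bits(double mag, double sgn) { /* FAND, FAND, FOR */
  uint64_t m, s; memcpy(&m, &mag, 8); memcpy(&s, &sgn, 8);
  m = (m & 0x7fffffffffffffffULL) | (s & 0x8000000000000000ULL);
  memcpy(&mag, &m, 8); return mag;
}
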
- std::vector<Constant*> CV; + SmallVector<Constant*,4> CV; if (SrcVT == MVT::f64) { CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); @@ -8098,7 +7973,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); // Shift sign bit right or left if the two operands have different types. @@ -8127,7 +8002,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, 16); + false, false, false, 16); SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); // Or the value with the sign bit. @@ -8191,8 +8066,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, // climbing the DAG back to the root, and it doesn't seem to be worth the // effort. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && + UI->getOpcode() != ISD::SETCC && + UI->getOpcode() != ISD::STORE) goto default_case; if (ConstantSDNode *C = @@ -8325,8 +8202,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { - APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; - DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); + APInt Zeros, Ones; + DAG.ComputeMaskedBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } @@ -8335,11 +8212,19 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; - if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { + + if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); } + + // Use BT if the immediate can't be encoded in a TEST instruction. + if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { + LHS = AndLHS; + RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); + } } if (LHS.getNode()) { @@ -8466,9 +8351,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (isFP) { unsigned SSECC = 8; EVT EltVT = Op0.getValueType().getVectorElementType(); - assert(EltVT == MVT::f32 || EltVT == MVT::f64); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT; - unsigned Opc = EltVT == MVT::f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; bool Swap = false; // SSE Condition code mapping: @@ -8508,61 +8392,57 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (SSECC == 8) { if (SetCCOpcode == ISD::SETUEQ) { SDValue UNORD, EQ; - UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); - EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); + UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(3, MVT::i8)); + EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(0, MVT::i8)); return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); - } - else if (SetCCOpcode == ISD::SETONE) { + } else if (SetCCOpcode == ISD::SETONE) { SDValue ORD, NEQ; - ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); - NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); + ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(7, MVT::i8)); + NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(4, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); } llvm_unreachable("Illegal FP comparison"); } // Handle all other FP comparisons here. - return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); + return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, MVT::i8)); } // Break 256-bit integer vector compare into smaller ones. - if (!isFP && VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntVSETCC(Op, DAG); // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. - unsigned Opc = 0, EQOpc = 0, GTOpc = 0; + unsigned Opc = 0; bool Swap = false, Invert = false, FlipSigns = false; - switch (VT.getSimpleVT().SimpleTy) { - default: break; - case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; - case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; - case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; - case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; - } - switch (SetCCOpcode) { default: break; case ISD::SETNE: Invert = true; - case ISD::SETEQ: Opc = EQOpc; break; + case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; case ISD::SETLT: Swap = true; - case ISD::SETGT: Opc = GTOpc; break; + case ISD::SETGT: Opc = X86ISD::PCMPGT; break; case ISD::SETGE: Swap = true; - case ISD::SETLE: Opc = GTOpc; Invert = true; break; + case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break; case ISD::SETULT: Swap = true; - case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; + case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break; case ISD::SETUGE: Swap = true; - case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; + case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break; } if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). 
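
Two of the mappings above, written out with intrinsics: SETUEQ has no single CMPP immediate and is composed as unord|eq, and unsigned integer compares fall back to the signed pcmpgt after the FlipSigns sign-bit XOR. A sketch, SSE2 only:

#include <emmintrin.h>

static __m128d cmp_ueq_pd(__m128d a, __m128d b) {
  return _mm_or_pd(_mm_cmpunord_pd(a, b),   /* CMPP imm 3 */
                   _mm_cmpeq_pd(a, b));     /* CMPP imm 0 */
}
static __m128i cmpgt_epu32(__m128i a, __m128i b) {
  const __m128i sign = _mm_set1_epi32((int)0x80000000);
  return _mm_cmpgt_epi32(_mm_xor_si128(a, sign),   /* flip signs, then */
                         _mm_xor_si128(b, sign));  /* signed compare   */
}
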
- if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42()) return SDValue(); - if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41()) return SDValue(); // Since SSE has no unsigned integer comparisons, we need to flip the sign @@ -8679,8 +8559,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. - if (Cond.getOpcode() == X86ISD::SETCC || - Cond.getOpcode() == X86ISD::SETCC_CARRY) { + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); @@ -8697,6 +8578,39 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = Cmp; addTest = false; } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; } if (addTest) { @@ -8778,11 +8692,27 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); SDValue CC; + bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { - SDValue NewCond = LowerSETCC(Cond, DAG); - if (NewCond.getNode()) - Cond = NewCond; + // Check for setcc([su]{add,sub,mul}o == 0). + if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + isa<ConstantSDNode>(Cond.getOperand(1)) && + cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + Cond.getOperand(0).getResNo() == 1 && + (Cond.getOperand(0).getOpcode() == ISD::SADDO || + Cond.getOperand(0).getOpcode() == ISD::UADDO || + Cond.getOperand(0).getOpcode() == ISD::SSUBO || + Cond.getOperand(0).getOpcode() == ISD::USUBO || + Cond.getOperand(0).getOpcode() == ISD::SMULO || + Cond.getOperand(0).getOpcode() == ISD::UMULO)) { + Inverted = true; + Cond = Cond.getOperand(0); + } else { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } } #if 0 // FIXME: LowerXALUO doesn't handle these!! 
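
The overflow switch above is the usual flag mapping: unsigned overflow comes out in CF (COND_B) and signed overflow in OF (COND_O). For UADDO the scalar equivalent is (illustrative helper):

#include <stdint.h>

static int uaddo32(uint32_t a, uint32_t b, uint32_t *sum) {
  *sum = a + b;
  return *sum < a;   /* carry out, i.e. the jb/setb (COND_B) condition */
}
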
@@ -8803,8 +8733,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. - if (Cond.getOpcode() == X86ISD::SETCC || - Cond.getOpcode() == X86ISD::SETCC_CARRY) { + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); @@ -8825,6 +8756,43 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { break; } } + } + CondOpcode = Cond.getOpcode(); + if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (Inverted) + X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, MVT::i8); + addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { @@ -8888,6 +8856,66 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { CC = DAG.getConstant(CCode, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + // For FCMP_OEQ, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. 
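
Concretely, the two-branch form being built here is roughly the following (AT&T syntax; a sketch of the intent, not verbatim output):

/*
 *   ; if (a == b) ... (FCMP_OEQ), after the successors are reversed:
 *   ucomisd %xmm1, %xmm0
 *   jne     falseBB          ; COND_NE: ordered and not equal
 *   jp      falseBB          ; COND_P:  unordered, i.e. a NaN operand
 *   ...fall through into the true block...
 *
 *   ; FCMP_UNE is the same shape with the second branch inverted:
 *   ucomisd %xmm1, %xmm0
 *   jne     trueBB           ; COND_NE
 *   jnp     falseBB          ; COND_NP: ordered and equal
 */
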
+ if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_P, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + // For FCMP_UNE, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_UNE. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_NP, MVT::i8); + Cond = Cmp; + addTest = false; + Dest = FalseBB; + } + } } } @@ -8926,7 +8954,7 @@ SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || - EnableSegmentedStacks) && + getTargetMachine().Options.EnableSegmentedStacks) && "This should be used only on Windows targets or when segmented stacks " "are being used"); assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); @@ -8940,7 +8968,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, bool Is64Bit = Subtarget->is64Bit(); EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; - if (EnableSegmentedStacks) { + if (getTargetMachine().Options.EnableSegmentedStacks) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -9076,10 +9104,10 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. - assert(!UseSoftFloat && + assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && - Subtarget->hasXMM()); + Subtarget->hasSSE1()); } // Insert VAARG_64 node into the DAG @@ -9106,7 +9134,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain, VAARG, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); } SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { @@ -9125,6 +9153,43 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } +// getTargetVShiftNOde - Handle vector element shifts where the shift amount +// may or may not be a constant. Takes immediate version of shift as input. 
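
The helper defined next widens the 32-bit amount because the SSE shift instructions consume a 64-bit count from the low quadword of an XMM register; that is what the {ShAmt, 0, undef, undef} BUILD_VECTOR provides. Intrinsic equivalent (sketch):

#include <emmintrin.h>

static __m128i sll_epi32_var(__m128i v, int amt) {
  /* movd zero-extends, so bits 32..127 of the count register are 0 */
  return _mm_sll_epi32(v, _mm_cvtsi32_si128(amt));
}
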
+static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue SrcOp, SDValue ShAmt, + SelectionDAG &DAG) { + assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); + + if (isa<ConstantSDNode>(ShAmt)) { + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: + case X86ISD::VSRLI: + case X86ISD::VSRAI: + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); + } + } + + // Change opcode to non-immediate version + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; + case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; + case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; + } + + // Need to build a vector containing shift amount + // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 + SDValue ShOps[4]; + ShOps[0] = ShAmt; + ShOps[1] = DAG.getConstant(0, MVT::i32); + ShOps[2] = DAG.getUNDEF(MVT::i32); + ShOps[3] = DAG.getUNDEF(MVT::i32); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4); + ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); +} + SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -9159,7 +9224,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const unsigned Opc = 0; ISD::CondCode CC = ISD::SETCC_INVALID; switch (IntNo) { - default: break; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_comieq_ss: case Intrinsic::x86_sse2_comieq_sd: Opc = X86ISD::COMI; @@ -9231,7 +9296,201 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const DAG.getConstant(X86CC, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + // XOP comparison intrinsics + case Intrinsic::x86_xop_vpcomltb: + case Intrinsic::x86_xop_vpcomltw: + case Intrinsic::x86_xop_vpcomltd: + case Intrinsic::x86_xop_vpcomltq: + case Intrinsic::x86_xop_vpcomltub: + case Intrinsic::x86_xop_vpcomltuw: + case Intrinsic::x86_xop_vpcomltud: + case Intrinsic::x86_xop_vpcomltuq: + case Intrinsic::x86_xop_vpcomleb: + case Intrinsic::x86_xop_vpcomlew: + case Intrinsic::x86_xop_vpcomled: + case Intrinsic::x86_xop_vpcomleq: + case Intrinsic::x86_xop_vpcomleub: + case Intrinsic::x86_xop_vpcomleuw: + case Intrinsic::x86_xop_vpcomleud: + case Intrinsic::x86_xop_vpcomleuq: + case Intrinsic::x86_xop_vpcomgtb: + case Intrinsic::x86_xop_vpcomgtw: + case Intrinsic::x86_xop_vpcomgtd: + case Intrinsic::x86_xop_vpcomgtq: + case Intrinsic::x86_xop_vpcomgtub: + case Intrinsic::x86_xop_vpcomgtuw: + case Intrinsic::x86_xop_vpcomgtud: + case Intrinsic::x86_xop_vpcomgtuq: + case Intrinsic::x86_xop_vpcomgeb: + case Intrinsic::x86_xop_vpcomgew: + case Intrinsic::x86_xop_vpcomged: + case Intrinsic::x86_xop_vpcomgeq: + case Intrinsic::x86_xop_vpcomgeub: + case Intrinsic::x86_xop_vpcomgeuw: + case Intrinsic::x86_xop_vpcomgeud: + case Intrinsic::x86_xop_vpcomgeuq: + case Intrinsic::x86_xop_vpcomeqb: + case Intrinsic::x86_xop_vpcomeqw: + case Intrinsic::x86_xop_vpcomeqd: + case Intrinsic::x86_xop_vpcomeqq: + case Intrinsic::x86_xop_vpcomequb: + case Intrinsic::x86_xop_vpcomequw: + case Intrinsic::x86_xop_vpcomequd: + case Intrinsic::x86_xop_vpcomequq: + case Intrinsic::x86_xop_vpcomneb: + case Intrinsic::x86_xop_vpcomnew: + case Intrinsic::x86_xop_vpcomned: + case Intrinsic::x86_xop_vpcomneq: + case Intrinsic::x86_xop_vpcomneub: + case 
Intrinsic::x86_xop_vpcomneuw: + case Intrinsic::x86_xop_vpcomneud: + case Intrinsic::x86_xop_vpcomneuq: + case Intrinsic::x86_xop_vpcomfalseb: + case Intrinsic::x86_xop_vpcomfalsew: + case Intrinsic::x86_xop_vpcomfalsed: + case Intrinsic::x86_xop_vpcomfalseq: + case Intrinsic::x86_xop_vpcomfalseub: + case Intrinsic::x86_xop_vpcomfalseuw: + case Intrinsic::x86_xop_vpcomfalseud: + case Intrinsic::x86_xop_vpcomfalseuq: + case Intrinsic::x86_xop_vpcomtrueb: + case Intrinsic::x86_xop_vpcomtruew: + case Intrinsic::x86_xop_vpcomtrued: + case Intrinsic::x86_xop_vpcomtrueq: + case Intrinsic::x86_xop_vpcomtrueub: + case Intrinsic::x86_xop_vpcomtrueuw: + case Intrinsic::x86_xop_vpcomtrueud: + case Intrinsic::x86_xop_vpcomtrueuq: { + unsigned CC = 0; + unsigned Opc = 0; + + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_xop_vpcomltb: + case Intrinsic::x86_xop_vpcomltw: + case Intrinsic::x86_xop_vpcomltd: + case Intrinsic::x86_xop_vpcomltq: + CC = 0; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomltub: + case Intrinsic::x86_xop_vpcomltuw: + case Intrinsic::x86_xop_vpcomltud: + case Intrinsic::x86_xop_vpcomltuq: + CC = 0; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomleb: + case Intrinsic::x86_xop_vpcomlew: + case Intrinsic::x86_xop_vpcomled: + case Intrinsic::x86_xop_vpcomleq: + CC = 1; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomleub: + case Intrinsic::x86_xop_vpcomleuw: + case Intrinsic::x86_xop_vpcomleud: + case Intrinsic::x86_xop_vpcomleuq: + CC = 1; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomgtb: + case Intrinsic::x86_xop_vpcomgtw: + case Intrinsic::x86_xop_vpcomgtd: + case Intrinsic::x86_xop_vpcomgtq: + CC = 2; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomgtub: + case Intrinsic::x86_xop_vpcomgtuw: + case Intrinsic::x86_xop_vpcomgtud: + case Intrinsic::x86_xop_vpcomgtuq: + CC = 2; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomgeb: + case Intrinsic::x86_xop_vpcomgew: + case Intrinsic::x86_xop_vpcomged: + case Intrinsic::x86_xop_vpcomgeq: + CC = 3; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomgeub: + case Intrinsic::x86_xop_vpcomgeuw: + case Intrinsic::x86_xop_vpcomgeud: + case Intrinsic::x86_xop_vpcomgeuq: + CC = 3; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomeqb: + case Intrinsic::x86_xop_vpcomeqw: + case Intrinsic::x86_xop_vpcomeqd: + case Intrinsic::x86_xop_vpcomeqq: + CC = 4; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomequb: + case Intrinsic::x86_xop_vpcomequw: + case Intrinsic::x86_xop_vpcomequd: + case Intrinsic::x86_xop_vpcomequq: + CC = 4; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomneb: + case Intrinsic::x86_xop_vpcomnew: + case Intrinsic::x86_xop_vpcomned: + case Intrinsic::x86_xop_vpcomneq: + CC = 5; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomneub: + case Intrinsic::x86_xop_vpcomneuw: + case Intrinsic::x86_xop_vpcomneud: + case Intrinsic::x86_xop_vpcomneuq: + CC = 5; + Opc = X86ISD::VPCOMU; + break; + case Intrinsic::x86_xop_vpcomfalseb: + case Intrinsic::x86_xop_vpcomfalsew: + case Intrinsic::x86_xop_vpcomfalsed: + case Intrinsic::x86_xop_vpcomfalseq: + CC = 6; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomfalseub: + case Intrinsic::x86_xop_vpcomfalseuw: + case Intrinsic::x86_xop_vpcomfalseud: + case Intrinsic::x86_xop_vpcomfalseuq: + CC = 6; + Opc = X86ISD::VPCOMU; + break; + case 
Intrinsic::x86_xop_vpcomtrueb: + case Intrinsic::x86_xop_vpcomtruew: + case Intrinsic::x86_xop_vpcomtrued: + case Intrinsic::x86_xop_vpcomtrueq: + CC = 7; + Opc = X86ISD::VPCOM; + break; + case Intrinsic::x86_xop_vpcomtrueub: + case Intrinsic::x86_xop_vpcomtrueuw: + case Intrinsic::x86_xop_vpcomtrueud: + case Intrinsic::x86_xop_vpcomtrueuq: + CC = 7; + Opc = X86ISD::VPCOMU; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS, + DAG.getConstant(CC, MVT::i8)); + } + // Arithmetic intrinsics. + case Intrinsic::x86_sse2_pmulu_dq: + case Intrinsic::x86_avx2_pmulu_dq: + return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: case Intrinsic::x86_avx_hadd_ps_256: @@ -9244,6 +9503,62 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx_hsub_pd_256: return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_phadd_w_128: + case Intrinsic::x86_ssse3_phadd_d_128: + case Intrinsic::x86_avx2_phadd_w: + case Intrinsic::x86_avx2_phadd_d: + return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_phsub_w_128: + case Intrinsic::x86_ssse3_phsub_d_128: + case Intrinsic::x86_avx2_phsub_w: + case Intrinsic::x86_avx2_phsub_d: + return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + return DAG.getNode(ISD::SHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + return DAG.getNode(ISD::SRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: + return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_ssse3_psign_b_128: + case Intrinsic::x86_ssse3_psign_w_128: + case Intrinsic::x86_ssse3_psign_d_128: + case Intrinsic::x86_avx2_psign_b: + case Intrinsic::x86_avx2_psign_w: + case Intrinsic::x86_avx2_psign_d: + return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse41_insertps: + return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vperm2f128_ps_256: + case Intrinsic::x86_avx_vperm2f128_pd_256: + case Intrinsic::x86_avx_vperm2f128_si_256: + case Intrinsic::x86_avx2_vperm2i128: + return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::x86_avx_vpermil_ps: + case Intrinsic::x86_avx_vpermil_pd: + case Intrinsic::x86_avx_vpermil_ps_256: + case Intrinsic::x86_avx_vpermil_pd_256: + return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // ptest and testp intrinsics. 
The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. @@ -9310,16 +9625,53 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // Fix vector shift instructions where the last operand is a non-immediate - // i32 value. + // SSE/AVX shift intrinsics + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psra_d: + return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + // Fix vector shift instructions where the last operand is a non-immediate + // i32 value. 
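
For reference, the long XOP block above collapses every vpcom* intrinsic into one VPCOM or VPCOMU node plus a 3-bit predicate immediate; the encoding used by the switch is (enum name illustrative):

/* XOP VPCOM/VPCOMU condition-code immediates; signed and unsigned
   forms share the encoding. */
enum XopVpcomImm {
  VPCOM_LT    = 0,  VPCOM_LE   = 1,
  VPCOM_GT    = 2,  VPCOM_GE   = 3,
  VPCOM_EQ    = 4,  VPCOM_NE   = 5,
  VPCOM_FALSE = 6,  VPCOM_TRUE = 7
};
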
case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: @@ -9333,79 +9685,40 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const return SDValue(); unsigned NewIntNo = 0; - EVT ShAmtVT = MVT::v4i32; switch (IntNo) { - case Intrinsic::x86_sse2_pslli_w: - NewIntNo = Intrinsic::x86_sse2_psll_w; - break; - case Intrinsic::x86_sse2_pslli_d: - NewIntNo = Intrinsic::x86_sse2_psll_d; + case Intrinsic::x86_mmx_pslli_w: + NewIntNo = Intrinsic::x86_mmx_psll_w; break; - case Intrinsic::x86_sse2_pslli_q: - NewIntNo = Intrinsic::x86_sse2_psll_q; + case Intrinsic::x86_mmx_pslli_d: + NewIntNo = Intrinsic::x86_mmx_psll_d; break; - case Intrinsic::x86_sse2_psrli_w: - NewIntNo = Intrinsic::x86_sse2_psrl_w; + case Intrinsic::x86_mmx_pslli_q: + NewIntNo = Intrinsic::x86_mmx_psll_q; break; - case Intrinsic::x86_sse2_psrli_d: - NewIntNo = Intrinsic::x86_sse2_psrl_d; + case Intrinsic::x86_mmx_psrli_w: + NewIntNo = Intrinsic::x86_mmx_psrl_w; break; - case Intrinsic::x86_sse2_psrli_q: - NewIntNo = Intrinsic::x86_sse2_psrl_q; + case Intrinsic::x86_mmx_psrli_d: + NewIntNo = Intrinsic::x86_mmx_psrl_d; break; - case Intrinsic::x86_sse2_psrai_w: - NewIntNo = Intrinsic::x86_sse2_psra_w; + case Intrinsic::x86_mmx_psrli_q: + NewIntNo = Intrinsic::x86_mmx_psrl_q; break; - case Intrinsic::x86_sse2_psrai_d: - NewIntNo = Intrinsic::x86_sse2_psra_d; + case Intrinsic::x86_mmx_psrai_w: + NewIntNo = Intrinsic::x86_mmx_psra_w; break; - default: { - ShAmtVT = MVT::v2i32; - switch (IntNo) { - case Intrinsic::x86_mmx_pslli_w: - NewIntNo = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntNo = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntNo = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntNo = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntNo = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntNo = Intrinsic::x86_mmx_psrl_q; - break; - case Intrinsic::x86_mmx_psrai_w: - NewIntNo = Intrinsic::x86_mmx_psra_w; - break; - case Intrinsic::x86_mmx_psrai_d: - NewIntNo = Intrinsic::x86_mmx_psra_d; - break; - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - } + case Intrinsic::x86_mmx_psrai_d: + NewIntNo = Intrinsic::x86_mmx_psra_d; break; - } + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. } // The vector shift intrinsics with scalars uses 32b shift amounts but // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits // to be zero. - SDValue ShOps[4]; - ShOps[0] = ShAmt; - ShOps[1] = DAG.getConstant(0, MVT::i32); - if (ShAmtVT == MVT::v4i32) { - ShOps[2] = DAG.getUNDEF(MVT::i32); - ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); - } else { - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, + DAG.getConstant(0, MVT::i32)); // FIXME this must be lowered to get rid of the invalid type. - } EVT VT = Op.getValueType(); ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); @@ -9432,13 +9745,13 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); } // Just load the return address. 
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, MachinePointerInfo(), false, false, 0); + RetAddrFI, MachinePointerInfo(), false, false, false, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { @@ -9453,7 +9766,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), - false, false, 0); + false, false, false, 0); return FrameAddr; } @@ -9685,7 +9998,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, - MachinePointerInfo(), false, false, 0); + MachinePointerInfo(), false, false, false, 0); // Transform as necessary SDValue CWD1 = @@ -9745,7 +10058,8 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { return Op; } -SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, + SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -9753,26 +10067,41 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { Op = Op.getOperand(0); if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } - // Issue a bsf (scan bits forward) which also sets EFLAGS. + // Issue a bsr (scan bits in reverse). SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // And xor with NumBits-1. + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + Op = Op.getOperand(0); + + // Issue a bsf (scan bits forward) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { Op, - DAG.getConstant(NumBits, OpVT), + DAG.getConstant(NumBits, VT), DAG.getConstant(X86::COND_E, MVT::i8), Op.getValue(1) }; - Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); - - if (VT == MVT::i8) - Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); - return Op; + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops)); } // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit @@ -9824,49 +10153,49 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. 
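
LowerCTLZ_ZERO_UNDEF above can skip the zero-input CMOV that LowerCTTZ still needs: bsr yields the index of the highest set bit, and index ^ (NumBits-1) equals (NumBits-1) - index, the leading-zero count. Scalar sketch with GCC-style inline assembly (toolchain assumed):

#include <stdint.h>

static unsigned ctlz32_zero_undef(uint32_t x) {
  unsigned idx;
  __asm__("bsrl %1, %0" : "=r"(idx) : "rm"(x));  /* undefined for x == 0 */
  return idx ^ 31;  /* == 31 - idx, since idx <= 31 */
}
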
- if (VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()) return Lower256IntArith(Op, DAG); - assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && + "Only know how to lower V2I64/V4I64 multiply"); + DebugLoc dl = Op.getDebugLoc(); - // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); - // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); - // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); - // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); - // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); + // Ahi = psrlqi(a, 32); + // Bhi = psrlqi(b, 32); // - // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); - // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); + // AloBlo = pmuludq(a, b); + // AloBhi = pmuludq(a, Bhi); + // AhiBlo = pmuludq(Ahi, b); + + // AloBhi = psllqi(AloBhi, 32); + // AhiBlo = psllqi(AhiBlo, 32); // return AloBlo + AloBhi + AhiBlo; SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); - SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - A, DAG.getConstant(32, MVT::i32)); - SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - B, DAG.getConstant(32, MVT::i32)); - SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, B); - SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - A, Bhi); - SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), - Ahi, B); - AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - AloBhi, DAG.getConstant(32, MVT::i32)); - AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - AhiBlo, DAG.getConstant(32, MVT::i32)); + SDValue ShAmt = DAG.getConstant(32, MVT::i32); + + SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt); + SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt); + + // Bit cast to 32-bit vectors for MULUDQ + EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32; + A = DAG.getNode(ISD::BITCAST, dl, MulVT, A); + B = DAG.getNode(ISD::BITCAST, dl, MulVT, B); + Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi); + Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi); + + SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); + SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + + AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt); + AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt); + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); - Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); - return Res; + return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { @@ -9877,12 +10206,183 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { SDValue Amt = Op.getOperand(1); LLVMContext *Context = DAG.getContext(); - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) return SDValue(); + // Optimize shl/srl/sra with constant shift amount. 
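
The rewritten LowerMUL above is the schoolbook decomposition from its comment: with a = ahi*2^32 + alo and likewise b, the low 64 bits of a*b are alo*blo + ((alo*bhi + ahi*blo) << 32); the ahi*bhi term overflows out entirely. SSE2 sketch of the v2i64 case (helper name illustrative):

#include <emmintrin.h>

static __m128i mul_epi64_lo(__m128i a, __m128i b) {
  __m128i ahi     = _mm_srli_epi64(a, 32);            /* psrlqi(a, 32)       */
  __m128i bhi     = _mm_srli_epi64(b, 32);            /* psrlqi(b, 32)       */
  __m128i alo_blo = _mm_mul_epu32(a, b);              /* pmuludq: low dwords */
  __m128i alo_bhi = _mm_mul_epu32(a, bhi);
  __m128i ahi_blo = _mm_mul_epu32(ahi, b);
  __m128i cross   = _mm_add_epi64(alo_bhi, ahi_blo);
  return _mm_add_epi64(alo_blo, _mm_slli_epi64(cross, 32)); /* psllqi */
}
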
+ if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + uint64_t ShiftAmt = C->getZExtValue(); + + if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + (Subtarget->hasAVX2() && + (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) { + if (Op.getOpcode() == ISD::SHL) + return DAG.getNode(X86ISD::VSHLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VSRLI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64) + return DAG.getNode(X86ISD::VSRAI, dl, VT, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + } + + if (VT == MVT::v16i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. + SmallVector<SDValue, 16> V(16, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + } + + if (Subtarget->hasAVX2() && VT == MVT::v32i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL); + // Zero out the rightmost bits. + SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U << ShiftAmt), + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R, + DAG.getConstant(ShiftAmt, MVT::i32)); + SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL); + // Zero out the leftmost bits. 
+ SmallVector<SDValue, 32> V(32, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, + MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32)); + } + if (Op.getOpcode() == ISD::SRA) { + if (ShiftAmt == 7) { + // R s>> 7 === R s< 0 + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // R s>> a === ((R u>> a) ^ m) - m + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt, + MVT::i8)); + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + } + } + } + + // Lower SHL with variable shift amount. + if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { + Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), + DAG.getConstant(23, MVT::i32)); + + const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; + Constant *C = ConstantDataVector::get(*Context, CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, false, 16); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { + assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); + + // a = a << 5; + Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), + DAG.getConstant(5, MVT::i32)); + Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); + + // Turn 'a' into a mask suitable for VSELECT + SDValue VSelM = DAG.getConstant(0x80, VT); + SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + SDValue CM1 = DAG.getConstant(0x0f, VT); + SDValue CM2 = DAG.getConstant(0x3f, VT); + + // r = VSELECT(r, psllw(r & (char16)15, 4), a); + SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(4, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // r = VSELECT(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, CM2); + M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M, + DAG.getConstant(2, MVT::i32), DAG); + M = DAG.getNode(ISD::BITCAST, dl, VT, M); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R); + + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op); + OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM); + + // return VSELECT(r, r+r, a); + R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, + DAG.getNode(ISD::ADD, dl, VT, R, R), R); + return R; + } + // Decompose 256-bit shifts into smaller 128-bit shifts. 
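
Two of the identities above deserve unpacking. "R s>> a === ((R u>> a) ^ m) - m" is sign extension from bit 7-a, where m is the sign bit in its shifted position; and the variable v4i32 SHL manufactures 2^amt by sliding the amount into a float exponent field. Sketches (the second assumes SSE4.1 for pmulld; helper names illustrative):

#include <emmintrin.h>
#include <smmintrin.h>   /* SSE4.1, for _mm_mullo_epi32 below */

/* v16i8 arithmetic shift right by constant n (no psrab exists). */
static __m128i sra_epi8(__m128i x, const int n) {
  __m128i u = _mm_srli_epi16(x, n);                        /* wide shift...  */
  u = _mm_and_si128(u, _mm_set1_epi8((char)(0xFFu >> n))); /* ...per byte    */
  __m128i m = _mm_set1_epi8((char)(0x80u >> n));           /* moved sign bit */
  return _mm_sub_epi8(_mm_xor_si128(u, m), m);             /* sign-extend    */
}

/* v4i32 shift left by a per-element variable amount in [0, 31]. */
static __m128i sll_var_epi32(__m128i v, __m128i amt) {
  __m128i exp = _mm_add_epi32(_mm_slli_epi32(amt, 23),
                              _mm_set1_epi32(0x3f800000)); /* bits of 1.0f */
  __m128i pw2 = _mm_cvttps_epi32(_mm_castsi128_ps(exp));   /* 2^amt        */
  return _mm_mullo_epi32(v, pw2);                          /* v * 2^amt    */
}
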
if (VT.getSizeInBits() == 256) { - int NumElems = VT.getVectorNumElements(); + unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType().getSimpleVT(); EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); @@ -9897,9 +10397,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // Constant shift amount SmallVector<SDValue, 4> Amt1Csts; SmallVector<SDValue, 4> Amt2Csts; - for (int i = 0; i < NumElems/2; ++i) + for (unsigned i = 0; i != NumElems/2; ++i) Amt1Csts.push_back(Amt->getOperand(i)); - for (int i = NumElems/2; i < NumElems; ++i) + for (unsigned i = NumElems/2; i != NumElems; ++i) Amt2Csts.push_back(Amt->getOperand(i)); Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, @@ -9921,120 +10421,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); } - // Optimize shl/srl/sra with constant shift amount. - if (isSplatVector(Amt.getNode())) { - SDValue SclrAmt = Amt->getOperand(0); - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { - uint64_t ShiftAmt = C->getZExtValue(); - - if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - } - } - - // Lower SHL with variable shift amount. 
- if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - Op.getOperand(1), DAG.getConstant(23, MVT::i32)); - - ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); - - std::vector<Constant*> CV(4, CI); - Constant *C = ConstantVector::get(CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); - Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); - Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); - return DAG.getNode(ISD::MUL, dl, VT, Op, R); - } - if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { - // a = a << 5; - Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - Op.getOperand(1), DAG.getConstant(5, MVT::i32)); - - ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); - ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); - - std::vector<Constant*> CVM1(16, CM1); - std::vector<Constant*> CVM2(16, CM2); - Constant *C = ConstantVector::get(CVM1); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - // r = pblendv(r, psllw(r & (char16)15, 4), a); - M = DAG.getNode(ISD::AND, dl, VT, R, M); - M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, - DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - - C = ConstantVector::get(CVM2); - CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, 16); - - // r = pblendv(r, psllw(r & (char16)63, 2), a); - M = DAG.getNode(ISD::AND, dl, VT, R, M); - M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, - DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); - // a += a - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); - - // return pblendv(r, r+r, a); - R = DAG.getNode(ISD::VSELECT, dl, VT, Op, - R, DAG.getNode(ISD::ADD, dl, VT, R, R)); - return R; - } return SDValue(); } @@ -10113,46 +10499,58 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ +SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); - SDNode* Node = Op.getNode(); - EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); - EVT VT = Node->getValueType(0); - if (Subtarget->hasXMMInt() && VT.isVector()) { - unsigned BitsDiff = VT.getScalarType().getSizeInBits() - - ExtraVT.getScalarType().getSizeInBits(); - SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); - - unsigned SHLIntrinsicsID = 0; - unsigned SRAIntrinsicsID = 0; - switch (VT.getSimpleVT().SimpleTy) { - default: - return SDValue(); - case MVT::v4i32: { - SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; - SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; - break; - } - case MVT::v8i16: { - SHLIntrinsicsID = 
Intrinsic::x86_sse2_pslli_w;
-      SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
-      break;
-    }
-  }
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  EVT VT = Op.getValueType();
-  SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                             DAG.getConstant(SHLIntrinsicsID, MVT::i32),
-                             Node->getOperand(0), ShAmt);
+  if (!Subtarget->hasSSE2() || !VT.isVector())
+    return SDValue();
-  // In case of 1 bit sext, no need to shr
-  if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
+  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+                      ExtraVT.getScalarType().getSizeInBits();
+  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                     DAG.getConstant(SRAIntrinsicsID, MVT::i32),
-                     Tmp1, ShAmt);
+  switch (VT.getSimpleVT().SimpleTy) {
+    default: return SDValue();
+    case MVT::v8i32:
+    case MVT::v16i16:
+      if (!Subtarget->hasAVX())
+        return SDValue();
+      if (!Subtarget->hasAVX2()) {
+        // needs to be split
+        int NumElems = VT.getVectorNumElements();
+        SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+        SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+        // Extract the LHS vectors
+        SDValue LHS = Op.getOperand(0);
+        SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+        SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+        MVT EltVT = VT.getVectorElementType().getSimpleVT();
+        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+        EVT ExtraEltVT = ExtraVT.getVectorElementType();
+        int ExtraNumElems = ExtraVT.getVectorNumElements();
+        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+                                   ExtraNumElems/2);
+        SDValue Extra = DAG.getValueType(ExtraVT);
+
+        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
+      }
+      // fall through
+    case MVT::v4i32:
+    case MVT::v8i16: {
+      SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
+                                         Op.getOperand(0), ShAmt, DAG);
+      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+    }
  }
-
-  return SDValue();
 }
@@ -10161,7 +10559,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
  // There isn't any reason to disable it if the target processor supports it.
-  if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) {
+  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, MVT::i32);
    SDValue Ops[] = {
@@ -10215,7 +10613,7 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
  // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
  // no-sse2). There isn't any reason to disable it if the target processor
  // supports it.
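  // Without SSE2 on a 32-bit target, the fallback below is a locked memory
  // operation (conceptually the "lock orl $0, (%esp)" idiom), which also
  // orders memory accesses and so serves as a full barrier.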
- if (Subtarget->hasXMMInt() || Subtarget->is64Bit()) + if (Subtarget->hasSSE2() || Subtarget->is64Bit()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -10246,8 +10644,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { unsigned Reg = 0; unsigned size = 0; switch(T.getSimpleVT().SimpleTy) { - default: - assert(false && "Invalid value type!"); + default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; @@ -10295,7 +10692,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); - assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() && + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && @@ -10365,7 +10762,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { - default: assert(0 && "Invalid code"); + default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = X86ISD::ADD; break; case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; case ISD::SUBC: Opc = X86ISD::SUB; break; @@ -10432,6 +10829,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SRA: @@ -10506,8 +10904,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DebugLoc dl = N->getDebugLoc(); switch (N->getOpcode()) { default: - assert(false && "Do not know how to custom type legalize this operation!"); - return; + llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::SIGN_EXTEND_INREG: case ISD::ADDC: case ISD::ADDE: @@ -10515,15 +10912,25 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SUBE: // We don't want to expand or promote these. return; - case ISD::FP_TO_SINT: { + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: { + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + + if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) + return; + std::pair<SDValue,SDValue> Vals = - FP_TO_INTHelper(SDValue(N, 0), DAG, true); + FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode() != 0) { EVT VT = N->getValueType(0); // Return a load from the stack slot. 
- Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, - MachinePointerInfo(), false, false, 0)); + if (StackSlot.getNode() != 0) + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0)); + else + Results.push_back(FIST); } return; } @@ -10657,15 +11064,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; - case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; - case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; - case X86ISD::PSIGND: return "X86ISD::PSIGND"; + case X86ISD::PSIGN: return "X86ISD::PSIGN"; + case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; + case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; + case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; + case X86ISD::FHADD: return "X86ISD::FHADD"; + case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; - case X86ISD::FHADD: return "X86ISD::FHADD"; - case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; @@ -10681,18 +11092,17 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; + case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; - case X86ISD::CMPPD: return "X86ISD::CMPPD"; - case X86ISD::CMPPS: return "X86ISD::CMPPS"; - case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; - case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; - case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; - case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; - case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; - case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; - case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; - case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; + case X86ISD::VSRA: return "X86ISD::VSRA"; + case X86ISD::VSHLI: return "X86ISD::VSHLI"; + case X86ISD::VSRLI: return "X86ISD::VSRLI"; + case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::CMPP: return "X86ISD::CMPP"; + case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; + case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; @@ -10705,54 +11115,39 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::ANDN: return "X86ISD::ANDN"; + case X86ISD::BLSI: return "X86ISD::BLSI"; + case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; + case X86ISD::BLSR: return "X86ISD::BLSR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::PALIGN: return "X86ISD::PALIGN"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; - case X86ISD::PSHUFHW_LD: return 
"X86ISD::PSHUFHW_LD"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; - case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; - case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; - case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; + case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; - case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; - case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; - case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; - case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; - case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; - case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; - case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; - case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; - case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; - case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; - case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; - case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; - case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; - case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; - case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; - case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; - case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; - case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; - case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; - case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; + case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; } } @@ -10855,21 +11250,21 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const { // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()); + return false; // FIXME: pshufb, blends, shifts. 
return (VT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, VT) || - isSHUFPMask(M, VT) || + isSHUFPMask(M, VT, Subtarget->hasAVX()) || isPSHUFDMask(M, VT) || isPSHUFHWMask(M, VT) || isPSHUFLWMask(M, VT) || - isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) || - isUNPCKLMask(M, VT) || - isUNPCKHMask(M, VT) || - isUNPCKL_v_undef_Mask(M, VT) || - isUNPCKH_v_undef_Mask(M, VT)); + isPALIGNRMask(M, VT, Subtarget) || + isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || + isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || + isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) || + isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2())); } bool @@ -10882,8 +11277,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, if (NumElts == 4 && VT.getSizeInBits() == 128) { return (isMOVLMask(Mask, VT) || isCommutedMOVLMask(Mask, VT, true) || - isSHUFPMask(Mask, VT) || - isCommutedSHUFPMask(Mask, VT)); + isSHUFPMask(Mask, VT, Subtarget->hasAVX()) || + isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true)); } return false; } @@ -10902,7 +11297,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, unsigned CXchgOpc, unsigned notOpc, unsigned EAXreg, - TargetRegisterClass *RC, + const TargetRegisterClass *RC, bool invSrc) const { // For the atomic bitwise operator, we generate // thisMBB: @@ -11274,7 +11669,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && + assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); DebugLoc dl = MI->getDebugLoc(); @@ -11679,6 +12074,42 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( return EndMBB; } +// The EFLAGS operand of SelectItr might be missing a kill marker +// because there were multiple uses of EFLAGS, and ISel didn't know +// which to mark. Figure out whether SelectItr should have had a +// kill marker, and set it if it should. Returns the correct kill +// marker value. +static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, + MachineBasicBlock* BB, + const TargetRegisterInfo* TRI) { + // Scan forward through BB for a use/def of EFLAGS. + MachineBasicBlock::iterator miI(llvm::next(SelectItr)); + for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(X86::EFLAGS)) + return false; + if (mi.definesRegister(X86::EFLAGS)) + break; // Should have kill-flag - update below. + } + + // If we hit the end of the block, check whether EFLAGS is live into a + // successor. + if (miI == BB->end()) { + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(X86::EFLAGS)) + return false; + } + } + + // We found a def, or hit the end of the basic block and EFLAGS wasn't live + // out. SelectMI should have a kill flag on EFLAGS. + SelectItr->addRegisterKilled(X86::EFLAGS, TRI); + return true; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -11708,7 +12139,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. 
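  // (A sketch of the situation handled here: a block that lowers several
  // selects in a row has several readers of EFLAGS, so ISel may have set no
  // kill flag at all; checkAndUpdateEFLAGSKill above recovers it by scanning
  // forward for the last reader.)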
- if (!MI->killsRegister(X86::EFLAGS)) { + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); + if (!MI->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); } @@ -11753,7 +12186,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineFunction *MF = BB->getParent(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - assert(EnableSegmentedStacks); + assert(getTargetMachine().Options.EnableSegmentedStacks); unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; @@ -11785,6 +12218,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), physSPReg = Is64Bit ? X86::RSP : X86::ESP; @@ -11802,33 +12236,39 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg) + BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) - .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg) - .addReg(tmpSPVReg); + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) + .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) - .addReg(tmpSPVReg); + .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. + const uint32_t *RegMask = + getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI); + .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI) + .addRegMask(RegMask) + .addReg(X86::RAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) - .addExternalSymbol("__morestack_allocate_stack_space"); + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) @@ -11926,6 +12366,11 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); assert(MI->getOperand(3).isGlobal() && "This should be a global"); + // Get a register mask for the lowered call. + // FIXME: The 32-bit calls have non-standard calling conventions. 
Use a + // proper register mask. + const uint32_t *RegMask = + getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -11936,6 +12381,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); + MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) @@ -11946,6 +12392,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) @@ -11956,6 +12403,7 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI->eraseFromParent(); // The pseudo instruction is gone now. @@ -11966,30 +12414,14 @@ MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { - default: assert(0 && "Unexpected instr type to insert"); + default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: - assert(0 && "TAILJMP64 would not be touched here."); + llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: - // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. - // On AMD64, additional defs should be added before register allocation. - if (!Subtarget->isTargetWin64()) { - MI->addRegisterDefined(X86::RSI); - MI->addRegisterDefined(X86::RDI); - MI->addRegisterDefined(X86::XMM6); - MI->addRegisterDefined(X86::XMM7); - MI->addRegisterDefined(X86::XMM8); - MI->addRegisterDefined(X86::XMM9); - MI->addRegisterDefined(X86::XMM10); - MI->addRegisterDefined(X86::XMM11); - MI->addRegisterDefined(X86::XMM12); - MI->addRegisterDefined(X86::XMM13); - MI->addRegisterDefined(X86::XMM14); - MI->addRegisterDefined(X86::XMM15); - } return BB; case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); @@ -12294,11 +12726,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, //===----------------------------------------------------------------------===// void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { + unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || @@ -12307,7 +12739,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. 
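  // For example, once the switch below reaches X86ISD::SETCC, every bit
  // above bit 0 is reported as known zero (for an assumed i8 result,
  // KnownZero would become 0xfe).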
switch (Opc) { default: break; case X86ISD::ADD: @@ -12326,8 +12758,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, break; // Fallthrough case X86ISD::SETCC: - KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - 1); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -12339,18 +12770,20 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, case Intrinsic::x86_sse2_movmsk_pd: case Intrinsic::x86_avx_movmsk_pd_256: case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse2_pmovmskb_128: { + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx2_pmovmskb: { // High bits of movmskp{s|d}, pmovmskb are known zero. switch (IntId) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; + case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; } - KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - NumLoBits); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } @@ -12418,7 +12851,8 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget* Subtarget) { DebugLoc dl = N->getDebugLoc(); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); SDValue V1 = SVOp->getOperand(0); @@ -12454,9 +12888,23 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) return SDValue(); + // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { + SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); + SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2, + Ld->getMemoryVT(), + Ld->getPointerInfo(), + Ld->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + return DAG.getNode(ISD::BITCAST, dl, VT, ResNode); + } + // Emit a zeroed vector and insert the desired subvector on its // first half. - SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), DAG.getConstant(0, MVT::i32), DAG, dl); return DCI.CombineTo(N, InsV); @@ -12501,7 +12949,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, // Combine 256-bit vector shuffles. This is only profitable when in AVX mode if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE) - return PerformShuffleCombine256(N, DAG, DCI); + return PerformShuffleCombine256(N, DAG, DCI, Subtarget); // Only handle 128 wide vector from here on. 
  if (VT.getSizeInBits() != 128)
@@ -12517,11 +12965,185 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
 }
+
+/// PerformTruncateCombine - Converts a truncate operation into a
+/// sequence of vector shuffle operations.
+/// This is possible when we truncate a 256-bit vector to a 128-bit vector.
+
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX()) return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                               DAG.getIntPtrConstant(2));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // PSHUFD
+    int ShufMask1[] = {0, 2, 0, 0};
+
+    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
+                                ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
+                                ShufMask1);
+
+    // MOVLHPS
+    int ShufMask2[] = {0, 1, 4, 5};
+
+    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
+  }
+  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                               DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
+
+    // PSHUFB
+    int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+                       -1, -1, -1, -1, -1, -1, -1, -1};
+
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
+                                DAG.getUNDEF(MVT::v16i8),
+                                ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // MOVLHPS
+    int ShufMask2[] = {0, 1, 4, 5};
+
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
+  }
+
+  return SDValue();
+}
+
+/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
+/// specific shuffle of a load can be folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// these shuffles have been custom lowered, so we need to handle them here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+                                                TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue InVec = N->getOperand(0);
+  SDValue EltNo = N->getOperand(1);
+
+  if (!isa<ConstantSDNode>(EltNo))
+    return SDValue();
+
+  EVT VT = InVec.getValueType();
+
+  bool HasShuffleIntoBitcast = false;
+  if (InVec.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!InVec.hasOneUse())
+      return SDValue();
+    EVT BCVT = InVec.getOperand(0).getValueType();
+    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+      return SDValue();
+    InVec = InVec.getOperand(0);
+    HasShuffleIntoBitcast = true;
+  }
+
+  if (!isTargetShuffle(InVec.getOpcode()))
+    return SDValue();
+
+  // Don't duplicate a load with other uses.
+  if (!InVec.hasOneUse())
+    return SDValue();
+
+  SmallVector<int, 16> ShuffleMask;
+  bool UnaryShuffle;
+  if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle))
+    return SDValue();
+
+  // Select the input vector, guarding against out of range extract vector.
+  unsigned NumElems = VT.getVectorNumElements();
+  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
                                          : InVec.getOperand(1);
+
+  // If inputs to shuffle are the same for both ops, then allow 2 uses
+  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+
+  if (LdNode.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+      return SDValue();
+
+    AllowedUses = 1; // only allow 1 load use if we have a bitcast
+    LdNode = LdNode.getOperand(0);
+  }
+
+  if (!ISD::isNormalLoad(LdNode.getNode()))
+    return SDValue();
+
+  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+    return SDValue();
+
+  if (HasShuffleIntoBitcast) {
+    // If there's a bitcast before the shuffle, check if the load type and
+    // alignment is valid.
+    unsigned Align = LN0->getAlignment();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NewAlign = TLI.getTargetData()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
+    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+      return SDValue();
+  }
+
+  // All checks match so transform back to vector_shuffle so that DAG combiner
+  // can finish the job
+  DebugLoc dl = N->getDebugLoc();
+
+  // Create shuffle node taking into account the case that its a unary shuffle
+  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
+  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+                                 InVec.getOperand(0), Shuffle,
+                                 &ShuffleMask[0]);
+  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+                     EltNo);
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// to a simple store and scalar loads to extract the elements.
 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
-                                                const TargetLowering &TLI) {
+                                                TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
+  if (NewOp.getNode())
+    return NewOp;
+
   SDValue InputVector = N->getOperand(0);
   // Only operate on vectors of 4 elements, where the alternative shuffling
@@ -12582,6 +13204,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     unsigned EltSize =
       InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
 
     SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
@@ -12590,7 +13213,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     // Load the scalar.
     SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                      ScalarAddr, MachinePointerInfo(),
-                                     false, false, 0);
+                                     false, false, false, 0);
 
     // Replace the extract with the load.
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); @@ -12603,7 +13226,10 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT /// nodes. static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + + DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. @@ -12617,7 +13243,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && - (Subtarget->hasXMMInt() || + (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -12632,7 +13258,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -12642,7 +13268,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; @@ -12660,7 +13286,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; @@ -12670,7 +13296,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); @@ -12696,7 +13322,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -12706,7 +13332,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. 
- if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; @@ -12731,7 +13357,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!UnsafeFPMath && + if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; @@ -12848,6 +13474,57 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // Canonicalize max and min: + // (x > y) ? x : y -> (x >= y) ? x : y + // (x < y) ? x : y -> (x <= y) ? x : y + // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates + // the need for an extra compare + // against zero. e.g. + // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 + // subl %esi, %edi + // testl %edi, %edi + // movl $0, %eax + // cmovgl %edi, %eax + // => + // xorl %eax, %eax + // subl %esi, $edi + // cmovsl %eax, %edi + if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && + DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + switch (CC) { + default: break; + case ISD::SETLT: + case ISD::SETGT: { + ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; + Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(), + Cond.getOperand(0), Cond.getOperand(1), NewCC); + return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); + } + } + } + + // If we know that this node is legal then we know that it is going to be + // matched by one of the SSE/AVX BLEND instructions. These instructions only + // depend on the highest bit in each word. Try to use SimplifyDemandedBits + // to simplify previous instructions. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && + !DCI.isBeforeLegalize() && + TLI.isOperationLegal(ISD::VSELECT, VT)) { + unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); + } + return SDValue(); } @@ -13042,7 +13719,8 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. - if (N1C && N0.getOpcode() == ISD::AND && + if (VT.isInteger() && !VT.isVector() && + N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY || @@ -13058,26 +13736,46 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { } } + + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl. 
+ // (shl V, 1) -> add V,V + if (isSplatVector(N1.getNode())) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0)); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1C && (1 == N1C->getZExtValue())) { + return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0); + } + } + return SDValue(); } /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts /// when possible. static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (!VT.isVector() && VT.isInteger() && - N->getOpcode() == ISD::SHL) - return PerformSHLCombine(N, DAG); + if (N->getOpcode() == ISD::SHL) { + SDValue V = PerformSHLCombine(N, DAG); + if (V.getNode()) return V; + } // On X86 with SSE2 support, we can transform this to a vector shift if // all elements are shifted by the same amount. We can't do this in legalize // because the a constant vector is typically transformed to a constant pool // so we have no knowledge of the shift amount. - if (!Subtarget->hasXMMInt()) + if (!Subtarget->hasSSE2()) return SDValue(); - if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + (!Subtarget->hasAVX2() || + (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) return SDValue(); SDValue ShAmtOp = N->getOperand(1); @@ -13093,6 +13791,11 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, BaseShAmt = Arg; break; } + // Handle the case where the build_vector is all undef + // FIXME: Should DAG allow this? + if (i == NumElts) + return SDValue(); + for (; i != NumElts; ++i) { SDValue Arg = ShAmtOp.getOperand(i); if (Arg.getOpcode() == ISD::UNDEF) continue; @@ -13119,9 +13822,16 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, BaseShAmt = InVec.getOperand(1); } } - if (BaseShAmt.getNode() == 0) + if (BaseShAmt.getNode() == 0) { + // Don't create instructions with illegal types after legalize + // types has run. 
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(EltVT) && + !DCI.isBeforeLegalize()) + return SDValue(); + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, DAG.getIntPtrConstant(0)); + } } else return SDValue(); @@ -13136,47 +13846,38 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift opcode!"); - break; case ISD::SHL: - if (VT == MVT::v2i64) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG); + } case ISD::SRA: - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v4i32: + case MVT::v8i16: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG); + } case ISD::SRL: - if (VT == MVT::v2i64) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v4i32) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), - ValOp, BaseShAmt); - if (VT == MVT::v8i16) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), - ValOp, BaseShAmt); - break; + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i64: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + case MVT::v16i16: + return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG); + } } - return SDValue(); } @@ -13190,7 +13891,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. 
- if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); @@ -13300,7 +14001,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); - // Create ANDN instructions + // Create ANDN, BLSI, and BLSR instructions + // BLSI is X & (-X) + // BLSR is X & (X-1) if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -13313,6 +14016,26 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); + // Check LHS for neg + if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 && + isZero(N0.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N1); + + // Check RHS for neg + if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 && + isZero(N1.getOperand(0))) + return DAG.getNode(X86ISD::BLSI, DL, VT, N0); + + // Check LHS for X-1 + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N1); + + // Check RHS for X-1 + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSR, DL, VT, N0); + return SDValue(); } @@ -13353,98 +14076,87 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return R; EVT VT = N->getValueType(0); - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) - return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // look for psign/blend - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { - if (VT == MVT::v2i64) { - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) - std::swap(N0, N1); - // or (and (m, x), (pandn m, y)) - if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { - SDValue Mask = N1.getOperand(0); - SDValue X = N1.getOperand(1); - SDValue Y; - if (N0.getOperand(0) == Mask) - Y = N0.getOperand(1); - if (N0.getOperand(1) == Mask) - Y = N0.getOperand(0); - - // Check to see if the mask appeared in both the AND and ANDNP and - if (!Y.getNode()) - return SDValue(); - - // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. - if (Mask.getOpcode() != ISD::BITCAST || - X.getOpcode() != ISD::BITCAST || - Y.getOpcode() != ISD::BITCAST) - return SDValue(); - - // Look through mask bitcast. - Mask = Mask.getOperand(0); - EVT MaskVT = Mask.getValueType(); - - // Validate that the Mask operand is a vector sra node. The sra node - // will be an intrinsic. - if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) - return SDValue(); - - // FIXME: what to do for bytes, since there is a psignb/pblendvb, but - // there is no psrai.b - switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - break; - default: return SDValue(); - } - - // Check that the SRA is all signbits. 
- SDValue SraC = Mask.getOperand(2); - unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); - unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); - if ((SraAmt + 1) != EltBits) - return SDValue(); + if (VT == MVT::v2i64 || VT == MVT::v4i64) { + if (!Subtarget->hasSSSE3() || + (VT == MVT::v4i64 && !Subtarget->hasAVX2())) + return SDValue(); - DebugLoc DL = N->getDebugLoc(); + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + // or (and (m, y), (pandn m, x)) + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); + + // Check to see if the mask appeared in both the AND and ANDNP and + if (!Y.getNode()) + return SDValue(); - // Now we know we at least have a plendvb with the mask val. See if - // we can form a psignb/w/d. - // psign = x.type == y.type == mask.type && y = sub(0, x); + // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. + // Look through mask bitcast. + if (Mask.getOpcode() == ISD::BITCAST) + Mask = Mask.getOperand(0); + if (X.getOpcode() == ISD::BITCAST) X = X.getOperand(0); + if (Y.getOpcode() == ISD::BITCAST) Y = Y.getOperand(0); - if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && - ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && - X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ - unsigned Opc = 0; - switch (EltBits) { - case 8: Opc = X86ISD::PSIGNB; break; - case 16: Opc = X86ISD::PSIGNW; break; - case 32: Opc = X86ISD::PSIGND; break; - default: break; - } - if (Opc) { - SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); - } - } - // PBLENDVB only available on SSE 4.1 - if (!(Subtarget->hasSSE41() || Subtarget->hasAVX())) - return SDValue(); - - X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); - Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); - Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y); - return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); + + EVT MaskVT = Mask.getValueType(); + + // Validate that the Mask operand is a vector sra node. + // FIXME: what to do for bytes, since there is a psignb/pblendvb, but + // there is no psrai.b + if (Mask.getOpcode() != X86ISD::VSRAI) + return SDValue(); + + // Check that the SRA is all signbits. + SDValue SraC = Mask.getOperand(1); + unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + if ((SraAmt + 1) != EltBits) + return SDValue(); + + DebugLoc DL = N->getDebugLoc(); + + // Now we know we at least have a plendvb with the mask val. See if + // we can form a psignb/w/d. + // psign = x.type == y.type == mask.type && y = sub(0, x); + if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && + ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && + X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Unsupported VT for PSIGN"); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); + return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } + // PBLENDVB only available on SSE 4.1 + if (!Subtarget->hasSSE41()) + return SDValue(); + + EVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; + + X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X); + Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y); + Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } } + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); @@ -13500,6 +14212,36 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + EVT VT = N->getValueType(0); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions"); + + // Create BLSMSK instructions by finding X ^ (X-1) + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 && + isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1); + + if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 && + isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0); + + return SDValue(); +} + /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -13515,7 +14257,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // shuffle. We need SSE4 for the shuffles. // TODO: It is possible to support ZExt by zeroing the undef values // during the shuffle phase or after the shuffle. - if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { + if (RegVT.isVector() && RegVT.isInteger() && + Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); @@ -13553,7 +14296,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->getAlignment()); + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); // Insert the word loaded into a vector. SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, @@ -13561,7 +14305,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector); + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, + ScalarInVector); unsigned SizeRatio = RegSz/MemSz; // Redistribute the loaded elements into the different locations. @@ -13593,7 +14338,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatination of two XMM registers, perform two stores. + // If we are saving a concatenation of two XMM registers, perform two stores. 
// This is better in Sandy Bridge cause one 256-bit mem op is done via two // 128-bit ones. If in the future the cost becomes only one memory access the // first version would be better. @@ -13703,8 +14448,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const Function *F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); - bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps - && Subtarget->hasXMMInt(); + bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps + && Subtarget->hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && isa<LoadSDNode>(St->getValue()) && @@ -13722,7 +14467,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, Ld = cast<LoadSDNode>(St->getChain()); else if (St->getValue().hasOneUse() && ChainVal->getOpcode() == ISD::TokenFactor) { - for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { + for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { if (ChainVal->getOperand(i).getNode() == LdVal) { TokenFactorIndex = i; Ld = cast<LoadSDNode>(St->getValue()); @@ -13749,7 +14494,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), - Ld->isNonTemporal(), Ld->getAlignment()); + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); SDValue NewChain = NewLd.getValue(1); if (TokenFactorIndex != -1) { Ops.push_back(NewChain); @@ -13770,10 +14516,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), - Ld->getAlignment()); + Ld->isInvariant(), Ld->getAlignment()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), MinAlign(Ld->getAlignment(), 4)); SDValue NewChain = LoLd.getValue(1); @@ -13817,7 +14564,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, /// set to A, RHS to B, and the routine returns 'true'. /// Note that the binary operation should have the property that if one of the /// operands is UNDEF then the result is UNDEF. -static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // Look for the following pattern: if // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > @@ -13833,7 +14580,18 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { return false; EVT VT = LHS.getValueType(); - unsigned N = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. 
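// [Editor's note -- illustration only, not part of the vendor source.]
// The per-lane behaviour assumed here, taking a 256-bit v8f32 HADD as the
// example, is:
//   dst[0]=LHS[0]+LHS[1]  dst[1]=LHS[2]+LHS[3]  dst[2]=RHS[0]+RHS[1]  dst[3]=RHS[2]+RHS[3]
//   dst[4]=LHS[4]+LHS[5]  dst[5]=LHS[6]+LHS[7]  dst[6]=RHS[4]+RHS[5]  dst[7]=RHS[6]+RHS[7]
// i.e. the upper 128-bit lane repeats the 128-bit pattern on elements 4..7,
// which is why the mask check that follows is done lane by lane.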
+ unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask @@ -13842,34 +14600,36 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // NOTE: in what follows a default initialized SDValue represents an UNDEF of // type VT. SDValue A, B; - SmallVector<int, 8> LMask(N); + SmallVector<int, 16> LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) A = LHS.getOperand(0); if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) B = LHS.getOperand(1); - cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), LMask.begin()); } else { if (LHS.getOpcode() != ISD::UNDEF) A = LHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; - SmallVector<int, 8> RMask(N); + SmallVector<int, 16> RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) C = RHS.getOperand(0); if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) D = RHS.getOperand(1); - cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), RMask.begin()); } else { if (RHS.getOpcode() != ISD::UNDEF) C = RHS; - for (unsigned i = 0; i != N; ++i) + for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } @@ -13884,30 +14644,28 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { // If A and B occur in reverse order in RHS, then "swap" them (which means // rewriting the mask). if (A != C) - for (unsigned i = 0; i != N; ++i) { - unsigned Idx = RMask[i]; - if (Idx < N) - RMask[i] += N; - else if (Idx < 2*N) - RMask[i] -= N; - } + CommuteVectorShuffleMask(RMask, NumElts); // At this point LHS and RHS are equivalent to // LHS = VECTOR_SHUFFLE A, B, LMask // RHS = VECTOR_SHUFFLE A, B, RMask // Check that the masks correspond to performing a horizontal operation. - for (unsigned i = 0; i != N; ++i) { - unsigned LIdx = LMask[i], RIdx = RMask[i]; + for (unsigned i = 0; i != NumElts; ++i) { + int LIdx = LMask[i], RIdx = RMask[i]; // Ignore any UNDEF components. - if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N)) - || (!B.getNode() && (LIdx >= N || RIdx >= N))) + if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. 
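// [Editor's worked example -- illustration only, not in the vendor source.]
// Assuming VT = v8i32 (NumElts = 8, NumLaneElts = 4, HalfLaneElts = 2), the
// index check below expects for i = 2: Src = (2/2)%2 = 1,
// LaneStart = (2/4)*4 = 0, Index = 2*(2%2) + 8*1 + 0 = 8, so LMask[2] must be
// 8 and RMask[2] must be 9 (or swapped, when the op is commutative) --
// elements 0 and 1 of B feed result element 2, matching the per-lane HADD
// pattern sketched above.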
- if (!(LIdx == 2*i && RIdx == 2*i + 1) && - !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i)) + unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs + unsigned LaneStart = (i/NumLaneElts) * NumLaneElts; + int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } @@ -13924,8 +14682,8 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal adds from adds of shuffles. - if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && - (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, true)) return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -13939,8 +14697,8 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, SDValue RHS = N->getOperand(1); // Try to synthesize horizontal subs from subs of shuffles. - if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && - (VT == MVT::v4f32 || VT == MVT::v2f64) && + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, false)) return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); return SDValue(); @@ -14006,7 +14764,58 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (!Subtarget->hasAVX()) + return SDValue(); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + EVT OpVT = Op.getValueType(); + DebugLoc dl = N->getDebugLoc(); + + if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) || + (VT == MVT::v8i32 && OpVT == MVT::v8i16)) { + + unsigned NumElems = OpVT.getVectorNumElements(); + SmallVector<int,8> ShufMask1(NumElems, -1); + for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i; + + SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), + ShufMask1.data()); + + SmallVector<int,8> ShufMask2(NumElems, -1); + for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2; + + SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT), + ShufMask2.data()); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); + OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); + } + return SDValue(); +} + +static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. 
This transformation is necessary because
@@ -14014,6 +14823,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
DebugLoc dl = N->getDebugLoc();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
if (N0.getOpcode() == ISD::AND &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
@@ -14028,6 +14839,37 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
N00.getOperand(0), N00.getOperand(1)),
DAG.getConstant(1, VT));
}
+ // Optimize vectors in AVX mode:
+ //
+ // v8i16 -> v8i32
+ // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
+ // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
+ // Concat upper and lower parts.
+ //
+ // v4i32 -> v4i64
+ // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
+ // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
+ // Concat upper and lower parts.
+ //
+ if (Subtarget->hasAVX()) {
+
+ if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
+ ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
+
+ SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
+ SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG);
+ SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG);
+
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+ }
+ }
+
return SDValue();
}
@@ -14136,7 +14978,24 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}
-static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
+/// PerformAddCombine - Do target-specific dag combines on integer adds.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+ isHorizontalBinOp(Op0, Op1, true))
+ return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+
+ return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -14158,6 +15017,13 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
}
}
+ // Try to synthesize horizontal subs from subs of shuffles.
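// [Editor's illustration -- not part of the vendor change.] For v4i32, the
// shuffles matched here are equivalent to
//   Op0 = { a0, a2, b0, b2 } and Op1 = { a1, a3, b1, b3 },
// so Op0 - Op1 = { a0-a1, a2-a3, b0-b1, b2-b3 }, which is exactly what
// PHSUBD computes; the combine below rewrites the sub as X86ISD::HSUB.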
+ EVT VT = N->getValueType(0); + if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); + return OptimizeConditionalInDecrement(N, DAG); } @@ -14167,19 +15033,20 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: - case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); + case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); - case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); - case ISD::SUB: return PerformSubCombine(N, DAG); + case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); + case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: case ISD::SRA: - case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); + case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); + case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); @@ -14190,27 +15057,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); - case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); + case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, Subtarget); + case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); + case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); - case X86ISD::SHUFPS: // Handle all target specific shuffles - case X86ISD::SHUFPD: + case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: - case X86ISD::PUNPCKHBW: - case X86ISD::PUNPCKHWD: - case X86ISD::PUNPCKHDQ: - case X86ISD::PUNPCKHQDQ: - case X86ISD::UNPCKHPS: - case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - case X86ISD::PUNPCKLBW: - case X86ISD::PUNPCKLWD: - case X86ISD::PUNPCKLDQ: - case X86ISD::PUNPCKLQDQ: - case X86ISD::UNPCKLPS: - case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: + case X86ISD::UNPCKH: + case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: @@ -14218,11 +15072,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILPS: - case X86ISD::VPERMILPSY: - case X86ISD::VPERMILPD: - case X86ISD::VPERMILPDY: - case X86ISD::VPERM2F128: + case X86ISD::VPERMILP: + case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); } @@ -14330,11 +15181,38 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support 
//===----------------------------------------------------------------------===// +namespace { + // Helper to match a string separated by whitespace. + bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) { + s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace. + + for (unsigned i = 0, e = args.size(); i != e; ++i) { + StringRef piece(*args[i]); + if (!s.startswith(piece)) // Check if the piece matches. + return false; + + s = s.substr(piece.size()); + StringRef::size_type pos = s.find_first_not_of(" \t"); + if (pos == 0) // We matched a prefix. + return false; + + s = s.substr(pos); + } + + return s.empty(); + } + const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={}; +} + bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector<StringRef, 4> AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); @@ -14342,35 +15220,27 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { switch (AsmPieces.size()) { default: return false; case 1: - AsmStr = AsmPieces[0]; - AsmPieces.clear(); - SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. - // FIXME: this should verify that we are targeting a 486 or better. If not, - // we will turn this bswap into something that will be lowered to logical ops - // instead of emitting the bswap asm. For now, we don't support 486 or lower - // so don't worry about this. + // we will turn this bswap into something that will be lowered to logical + // ops instead of emitting the bswap asm. For now, we don't support 486 or + // lower so don't worry about this. // bswap $0 - if (AsmPieces.size() == 2 && - (AsmPieces[0] == "bswap" || - AsmPieces[0] == "bswapq" || - AsmPieces[0] == "bswapl") && - (AsmPieces[1] == "$0" || - AsmPieces[1] == "${0:q}")) { + if (matchAsm(AsmPieces[0], "bswap", "$0") || + matchAsm(AsmPieces[0], "bswapl", "$0") || + matchAsm(AsmPieces[0], "bswapq", "$0") || + matchAsm(AsmPieces[0], "bswap", "${0:q}") || + matchAsm(AsmPieces[0], "bswapl", "${0:q}") || + matchAsm(AsmPieces[0], "bswapq", "${0:q}")) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. 
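// [Editor's illustration -- not part of the vendor change.] At the C level
// the accepted pattern is, e.g.:
//   unsigned f(unsigned x) { __asm__("bswap %0" : "=r"(x) : "0"(x)); return x; }
// which reaches this code as the IR asm string "bswap $0" with constraint
// string "=r,0" (plus clobbers); matchAsm allows any whitespace between the
// mnemonic and the operand, so "bswap   $0" also matches.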
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; return IntrinsicLowering::LowerToByteSwap(CI); } + // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && - AsmPieces.size() == 3 && - (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && - AsmPieces[1] == "$$8," && - AsmPieces[2] == "${0:w}" && - IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") || + matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); @@ -14379,46 +15249,26 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && AsmPieces[2] == "~{flags}" && - AsmPieces[3] == "~{fpsr}") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } + AsmPieces[3] == "~{fpsr}") + return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && - IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { - SmallVector<StringRef, 4> Words; - SplitString(AsmPieces[0], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && - Words[2] == "${0:w}") { - Words.clear(); - SplitString(AsmPieces[1], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && - Words[2] == "$0") { - Words.clear(); - SplitString(AsmPieces[2], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && - Words[2] == "${0:w}") { - AsmPieces.clear(); - const std::string &ConstraintsStr = IA->getConstraintString(); - SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); - if (AsmPieces.size() == 4 && - AsmPieces[0] == "~{cc}" && - AsmPieces[1] == "~{dirflag}" && - AsmPieces[2] == "~{flags}" && - AsmPieces[3] == "~{fpsr}") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } - } - } - } + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") && + matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") && + matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) { + AsmPieces.clear(); + const std::string &ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + std::sort(AsmPieces.begin(), AsmPieces.end()); + if (AsmPieces.size() == 4 && + AsmPieces[0] == "~{cc}" && + AsmPieces[1] == "~{dirflag}" && + AsmPieces[2] == "~{flags}" && + AsmPieces[3] == "~{fpsr}") + return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { @@ -14427,23 +15277,10 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 - SmallVector<StringRef, 4> Words; - SplitString(AsmPieces[0], Words, " \t"); - if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { - Words.clear(); - SplitString(AsmPieces[1], Words, " \t"); - if (Words.size() == 2 && 
Words[0] == "bswap" && Words[1] == "%edx") { - Words.clear(); - SplitString(AsmPieces[2], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && - Words[2] == "%edx") { - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - return IntrinsicLowering::LowerToByteSwap(CI); - } - } - } + if (matchAsm(AsmPieces[0], "bswap", "%eax") && + matchAsm(AsmPieces[1], "bswap", "%edx") && + matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx")) + return IntrinsicLowering::LowerToByteSwap(CI); } } break; @@ -14538,7 +15375,8 @@ TargetLowering::ConstraintWeight break; case 'x': case 'Y': - if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) + if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || + ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX())) weight = CW_Register; break; case 'I': @@ -14608,9 +15446,9 @@ LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { - if (Subtarget->hasXMMInt()) + if (Subtarget->hasSSE2()) return "Y"; - if (Subtarget->hasXMM()) + if (Subtarget->hasSSE1()) return "x"; } @@ -14816,10 +15654,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (!Subtarget->hasMMX()) break; return std::make_pair(0U, X86::VR64RegisterClass); case 'Y': // SSE_REGS if SSE2 allowed - if (!Subtarget->hasXMMInt()) break; + if (!Subtarget->hasSSE2()) break; // FALL THROUGH. - case 'x': // SSE_REGS if SSE1 allowed - if (!Subtarget->hasXMM()) break; + case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed + if (!Subtarget->hasSSE1()) break; switch (VT.getSimpleVT().SimpleTy) { default: break; @@ -14838,6 +15676,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, case MVT::v4f32: case MVT::v2f64: return std::make_pair(0U, X86::VR128RegisterClass); + // AVX types. + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + case MVT::v4i64: + case MVT::v8f32: + case MVT::v4f64: + return std::make_pair(0U, X86::VR256RegisterClass); + } break; } |