Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 519 |
1 file changed, 373 insertions, 146 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 5715d82..c9924f2 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -78,7 +78,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
   return 8;
 }
 
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
   if (Vector) {
     if (ST->hasAVX512())
       return 512;
@@ -95,6 +95,10 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
   return 32;
 }
 
+unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
+  return getRegisterBitWidth(true);
+}
+
 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // If the loop will not be vectorized, don't interleave the loop.
   // Let regular unroll to unroll the loop, which saves the overflow
@@ -114,7 +118,7 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
 }
 
 int X86TTIImpl::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty,
+    unsigned Opcode, Type *Ty,
     TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
     TTI::OperandValueProperties Opd1PropInfo,
     TTI::OperandValueProperties Opd2PropInfo,
@@ -138,10 +142,15 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::FDIV, MVT::v2f64, 69 }, // divpd
     { ISD::FADD, MVT::v2f64,  2 }, // addpd
     { ISD::FSUB, MVT::v2f64,  2 }, // subpd
-    // v2i64/v4i64 mul is custom lowered as a series of long
-    // multiplies(3), shifts(3) and adds(2).
-    // slm muldq version throughput is 2
-    { ISD::MUL,  MVT::v2i64, 11 },
+    // v2i64/v4i64 mul is custom lowered as a series of long:
+    // multiplies(3), shifts(3) and adds(2)
+    // slm muldq version throughput is 2 and addq throughput 4
+    // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
+    // 3X4 (addq throughput) = 17
+    { ISD::MUL,  MVT::v2i64, 17 },
+    // slm addq\subq throughput is 4
+    { ISD::ADD,  MVT::v2i64,  4 },
+    { ISD::SUB,  MVT::v2i64,  4 },
   };
 
   if (ST->isSLM()) {
@@ -207,6 +216,10 @@ int X86TTIImpl::getArithmeticInstrCost(
   }
 
   static const CostTblEntry AVX512UniformConstCostTable[] = {
+    { ISD::SRA,  MVT::v2i64,   1 },
+    { ISD::SRA,  MVT::v4i64,   1 },
+    { ISD::SRA,  MVT::v8i64,   1 },
+
     { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
     { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
   };
@@ -239,35 +252,38 @@ int X86TTIImpl::getArithmeticInstrCost(
   }
 
   static const CostTblEntry SSE2UniformConstCostTable[] = {
-    { ISD::SHL,  MVT::v16i8,   2 }, // psllw + pand.
-    { ISD::SRL,  MVT::v16i8,   2 }, // psrlw + pand.
-    { ISD::SRA,  MVT::v16i8,   4 }, // psrlw, pand, pxor, psubb.
-
-    { ISD::SHL,  MVT::v32i8,   4 }, // 2*(psllw + pand).
-    { ISD::SRL,  MVT::v32i8,   4 }, // 2*(psrlw + pand).
-    { ISD::SRA,  MVT::v32i8,   8 }, // 2*(psrlw, pand, pxor, psubb).
-
-    { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
-    { ISD::SDIV, MVT::v8i16,   6 }, // pmulhw sequence
-    { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
-    { ISD::UDIV, MVT::v8i16,   6 }, // pmulhuw sequence
-    { ISD::SDIV, MVT::v8i32,  38 }, // pmuludq sequence
-    { ISD::SDIV, MVT::v4i32,  19 }, // pmuludq sequence
-    { ISD::UDIV, MVT::v8i32,  30 }, // pmuludq sequence
-    { ISD::UDIV, MVT::v4i32,  15 }, // pmuludq sequence
+    { ISD::SHL,  MVT::v16i8,     2 }, // psllw + pand.
+    { ISD::SRL,  MVT::v16i8,     2 }, // psrlw + pand.
+    { ISD::SRA,  MVT::v16i8,     4 }, // psrlw, pand, pxor, psubb.
+
+    { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
+    { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
+    { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+
+    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+    { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
+    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+    { ISD::UDIV, MVT::v8i16,     6 }, // pmulhuw sequence
+    { ISD::SDIV, MVT::v8i32,  38+2 }, // 2*pmuludq sequence + split.
+    { ISD::SDIV, MVT::v4i32,    19 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v8i32,  30+2 }, // 2*pmuludq sequence + split.
+    { ISD::UDIV, MVT::v4i32,    15 }, // pmuludq sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
       ST->hasSSE2()) {
     // pmuldq sequence.
     if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
-      return LT.first * 30;
+      return LT.first * 32;
     if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
       return LT.first * 15;
-    if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
-                                            LT.second))
-      return LT.first * Entry->Cost;
+    // XOP has faster vXi8 shifts.
+    if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
+        !ST->hasXOP())
+      if (const auto *Entry =
+              CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+        return LT.first * Entry->Cost;
   }
 
   static const CostTblEntry AVX2UniformCostTable[] = {
@@ -319,6 +335,14 @@ int X86TTIImpl::getArithmeticInstrCost(
       return LT.first * Entry->Cost;
 
   static const CostTblEntry AVX512BWCostTable[] = {
+    { ISD::SHL,   MVT::v8i16,   1 }, // vpsllvw
+    { ISD::SRL,   MVT::v8i16,   1 }, // vpsrlvw
+    { ISD::SRA,   MVT::v8i16,   1 }, // vpsravw
+
+    { ISD::SHL,   MVT::v16i16,  1 }, // vpsllvw
+    { ISD::SRL,   MVT::v16i16,  1 }, // vpsrlvw
+    { ISD::SRA,   MVT::v16i16,  1 }, // vpsravw
+
     { ISD::SHL,   MVT::v32i16,  1 }, // vpsllvw
     { ISD::SRL,   MVT::v32i16,  1 }, // vpsrlvw
     { ISD::SRA,   MVT::v32i16,  1 }, // vpsravw
@@ -347,8 +371,12 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SHL,   MVT::v16i32,  1 },
     { ISD::SRL,   MVT::v16i32,  1 },
     { ISD::SRA,   MVT::v16i32,  1 },
+
     { ISD::SHL,   MVT::v8i64,   1 },
     { ISD::SRL,   MVT::v8i64,   1 },
+
+    { ISD::SRA,   MVT::v2i64,   1 },
+    { ISD::SRA,   MVT::v4i64,   1 },
     { ISD::SRA,   MVT::v8i64,   1 },
 
     { ISD::MUL,   MVT::v32i8,  13 }, // extend/pmullw/trunc sequence.
@@ -410,18 +438,18 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRL,  MVT::v2i64,    2 },
     { ISD::SRA,  MVT::v2i64,    2 },
     // 256bit shifts require splitting if AVX2 didn't catch them above.
-    { ISD::SHL,  MVT::v32i8,    2 },
-    { ISD::SRL,  MVT::v32i8,    4 },
-    { ISD::SRA,  MVT::v32i8,    4 },
-    { ISD::SHL,  MVT::v16i16,   2 },
-    { ISD::SRL,  MVT::v16i16,   4 },
-    { ISD::SRA,  MVT::v16i16,   4 },
-    { ISD::SHL,  MVT::v8i32,    2 },
-    { ISD::SRL,  MVT::v8i32,    4 },
-    { ISD::SRA,  MVT::v8i32,    4 },
-    { ISD::SHL,  MVT::v4i64,    2 },
-    { ISD::SRL,  MVT::v4i64,    4 },
-    { ISD::SRA,  MVT::v4i64,    4 },
+    { ISD::SHL,  MVT::v32i8,  2+2 },
+    { ISD::SRL,  MVT::v32i8,  4+2 },
+    { ISD::SRA,  MVT::v32i8,  4+2 },
+    { ISD::SHL,  MVT::v16i16, 2+2 },
+    { ISD::SRL,  MVT::v16i16, 4+2 },
+    { ISD::SRA,  MVT::v16i16, 4+2 },
+    { ISD::SHL,  MVT::v8i32,  2+2 },
+    { ISD::SRL,  MVT::v8i32,  4+2 },
+    { ISD::SRA,  MVT::v8i32,  4+2 },
+    { ISD::SHL,  MVT::v4i64,  2+2 },
+    { ISD::SRL,  MVT::v4i64,  4+2 },
+    { ISD::SRA,  MVT::v4i64,  4+2 },
   };
 
   // Look for XOP lowering tricks.
@@ -431,23 +459,28 @@ int X86TTIImpl::getArithmeticInstrCost(
 
   static const CostTblEntry SSE2UniformShiftCostTable[] = {
     // Uniform splats are cheaper for the following instructions.
-    { ISD::SHL,  MVT::v16i16, 2 }, // psllw.
-    { ISD::SHL,  MVT::v8i32,  2 }, // pslld
-    { ISD::SHL,  MVT::v4i64,  2 }, // psllq.
-
-    { ISD::SRL,  MVT::v16i16, 2 }, // psrlw.
-    { ISD::SRL,  MVT::v8i32,  2 }, // psrld.
-    { ISD::SRL,  MVT::v4i64,  2 }, // psrlq.
-
-    { ISD::SRA,  MVT::v16i16, 2 }, // psraw.
-    { ISD::SRA,  MVT::v8i32,  2 }, // psrad.
-    { ISD::SRA,  MVT::v2i64,  4 }, // 2 x psrad + shuffle.
-    { ISD::SRA,  MVT::v4i64,  8 }, // 2 x psrad + shuffle.
+    { ISD::SHL,  MVT::v16i16, 2+2 }, // 2*psllw + split.
+    { ISD::SHL,  MVT::v8i32,  2+2 }, // 2*pslld + split.
+    { ISD::SHL,  MVT::v4i64,  2+2 }, // 2*psllq + split.
+
+    { ISD::SRL,  MVT::v16i16, 2+2 }, // 2*psrlw + split.
+    { ISD::SRL,  MVT::v8i32,  2+2 }, // 2*psrld + split.
+    { ISD::SRL,  MVT::v4i64,  2+2 }, // 2*psrlq + split.
+
+    { ISD::SRA,  MVT::v16i16, 2+2 }, // 2*psraw + split.
+    { ISD::SRA,  MVT::v8i32,  2+2 }, // 2*psrad + split.
+    { ISD::SRA,  MVT::v2i64,    4 }, // 2*psrad + shuffle.
+    { ISD::SRA,  MVT::v4i64,  8+2 }, // 2*(2*psrad + shuffle) + split.
   };
 
   if (ST->hasSSE2() &&
       ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
        (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+
+    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
+    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
+      return LT.first * 4; // 2*psrad + shuffle.
+
     if (const auto *Entry =
             CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
       return LT.first * Entry->Cost;
@@ -561,28 +594,28 @@ int X86TTIImpl::getArithmeticInstrCost(
       return LT.first * Entry->Cost;
 
   static const CostTblEntry SSE41CostTable[] = {
-    { ISD::SHL,  MVT::v16i8,    11 }, // pblendvb sequence.
-    { ISD::SHL,  MVT::v32i8,  2*11 }, // pblendvb sequence.
-    { ISD::SHL,  MVT::v8i16,    14 }, // pblendvb sequence.
-    { ISD::SHL,  MVT::v16i16, 2*14 }, // pblendvb sequence.
-    { ISD::SHL,  MVT::v4i32,     4 }, // pslld/paddd/cvttps2dq/pmulld
-    { ISD::SHL,  MVT::v8i32,   2*4 }, // pslld/paddd/cvttps2dq/pmulld
-
-    { ISD::SRL,  MVT::v16i8,    12 }, // pblendvb sequence.
-    { ISD::SRL,  MVT::v32i8,  2*12 }, // pblendvb sequence.
-    { ISD::SRL,  MVT::v8i16,    14 }, // pblendvb sequence.
-    { ISD::SRL,  MVT::v16i16, 2*14 }, // pblendvb sequence.
-    { ISD::SRL,  MVT::v4i32,    11 }, // Shift each lane + blend.
-    { ISD::SRL,  MVT::v8i32,  2*11 }, // Shift each lane + blend.
-
-    { ISD::SRA,  MVT::v16i8,    24 }, // pblendvb sequence.
-    { ISD::SRA,  MVT::v32i8,  2*24 }, // pblendvb sequence.
-    { ISD::SRA,  MVT::v8i16,    14 }, // pblendvb sequence.
-    { ISD::SRA,  MVT::v16i16, 2*14 }, // pblendvb sequence.
-    { ISD::SRA,  MVT::v4i32,    12 }, // Shift each lane + blend.
-    { ISD::SRA,  MVT::v8i32,  2*12 }, // Shift each lane + blend.
-
-    { ISD::MUL,  MVT::v4i32,     1 }  // pmulld
+    { ISD::SHL,  MVT::v16i8,      11 }, // pblendvb sequence.
+    { ISD::SHL,  MVT::v32i8,  2*11+2 }, // pblendvb sequence + split.
+    { ISD::SHL,  MVT::v8i16,      14 }, // pblendvb sequence.
+    { ISD::SHL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+    { ISD::SHL,  MVT::v4i32,       4 }, // pslld/paddd/cvttps2dq/pmulld
+    { ISD::SHL,  MVT::v8i32,   2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
+
+    { ISD::SRL,  MVT::v16i8,      12 }, // pblendvb sequence.
+    { ISD::SRL,  MVT::v32i8,  2*12+2 }, // pblendvb sequence + split.
+    { ISD::SRL,  MVT::v8i16,      14 }, // pblendvb sequence.
+    { ISD::SRL,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+    { ISD::SRL,  MVT::v4i32,      11 }, // Shift each lane + blend.
+    { ISD::SRL,  MVT::v8i32,  2*11+2 }, // Shift each lane + blend + split.
+
+    { ISD::SRA,  MVT::v16i8,      24 }, // pblendvb sequence.
+    { ISD::SRA,  MVT::v32i8,  2*24+2 }, // pblendvb sequence + split.
+    { ISD::SRA,  MVT::v8i16,      14 }, // pblendvb sequence.
+    { ISD::SRA,  MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+    { ISD::SRA,  MVT::v4i32,      12 }, // Shift each lane + blend.
+    { ISD::SRA,  MVT::v8i32,  2*12+2 }, // Shift each lane + blend + split.
+
+    { ISD::MUL,  MVT::v4i32,       1 }  // pmulld
   };
 
   if (ST->hasSSE41())
@@ -592,34 +625,33 @@ int X86TTIImpl::getArithmeticInstrCost(
   static const CostTblEntry SSE2CostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
     // custom.
-    { ISD::SHL,  MVT::v16i8,    26 }, // cmpgtb sequence.
-    { ISD::SHL,  MVT::v8i16,    32 }, // cmpgtb sequence.
-    { ISD::SHL,  MVT::v4i32,   2*5 }, // We optimized this using mul.
-    { ISD::SHL,  MVT::v8i32, 2*2*5 }, // We optimized this using mul.
-    { ISD::SHL,  MVT::v2i64,     4 }, // splat+shuffle sequence.
-    { ISD::SHL,  MVT::v4i64,   2*4 }, // splat+shuffle sequence.
-
-    { ISD::SRL,  MVT::v16i8,    26 }, // cmpgtb sequence.
-    { ISD::SRL,  MVT::v8i16,    32 }, // cmpgtb sequence.
-    { ISD::SRL,  MVT::v4i32,    16 }, // Shift each lane + blend.
-    { ISD::SRL,  MVT::v2i64,     4 }, // splat+shuffle sequence.
-    { ISD::SRL,  MVT::v4i64,   2*4 }, // splat+shuffle sequence.
-
-    { ISD::SRA,  MVT::v16i8,    54 }, // unpacked cmpgtb sequence.
-    { ISD::SRA,  MVT::v8i16,    32 }, // cmpgtb sequence.
-    { ISD::SRA,  MVT::v4i32,    16 }, // Shift each lane + blend.
-    { ISD::SRA,  MVT::v2i64,    12 }, // srl/xor/sub sequence.
-    { ISD::SRA,  MVT::v4i64,  2*12 }, // srl/xor/sub sequence.
-
-    { ISD::MUL,  MVT::v16i8,    12 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL,  MVT::v8i16,     1 }, // pmullw
-    { ISD::MUL,  MVT::v4i32,     6 }, // 3*pmuludq/4*shuffle
-    { ISD::MUL,  MVT::v2i64,     8 }, // 3*pmuludq/3*shift/2*add
-
-    { ISD::FDIV, MVT::f32,      23 }, // Pentium IV from http://www.agner.org/
-    { ISD::FDIV, MVT::v4f32,    39 }, // Pentium IV from http://www.agner.org/
-    { ISD::FDIV, MVT::f64,      38 }, // Pentium IV from http://www.agner.org/
-    { ISD::FDIV, MVT::v2f64,    69 }, // Pentium IV from http://www.agner.org/
+    { ISD::SHL,  MVT::v16i8,      26 }, // cmpgtb sequence.
+    { ISD::SHL,  MVT::v8i16,      32 }, // cmpgtb sequence.
+    { ISD::SHL,  MVT::v4i32,     2*5 }, // We optimized this using mul.
+    { ISD::SHL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
+    { ISD::SHL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.
+
+    { ISD::SRL,  MVT::v16i8,      26 }, // cmpgtb sequence.
+    { ISD::SRL,  MVT::v8i16,      32 }, // cmpgtb sequence.
+    { ISD::SRL,  MVT::v4i32,      16 }, // Shift each lane + blend.
+    { ISD::SRL,  MVT::v2i64,       4 }, // splat+shuffle sequence.
+    { ISD::SRL,  MVT::v4i64,   2*4+2 }, // splat+shuffle sequence + split.
+
+    { ISD::SRA,  MVT::v16i8,      54 }, // unpacked cmpgtb sequence.
+    { ISD::SRA,  MVT::v8i16,      32 }, // cmpgtb sequence.
+    { ISD::SRA,  MVT::v4i32,      16 }, // Shift each lane + blend.
+    { ISD::SRA,  MVT::v2i64,      12 }, // srl/xor/sub sequence.
+    { ISD::SRA,  MVT::v4i64,  2*12+2 }, // srl/xor/sub sequence+split.
+
+    { ISD::MUL,  MVT::v16i8,      12 }, // extend/pmullw/trunc sequence.
+    { ISD::MUL,  MVT::v8i16,       1 }, // pmullw
+    { ISD::MUL,  MVT::v4i32,       6 }, // 3*pmuludq/4*shuffle
+    { ISD::MUL,  MVT::v2i64,       8 }, // 3*pmuludq/3*shift/2*add
+
+    { ISD::FDIV, MVT::f32,        23 }, // Pentium IV from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f32,      39 }, // Pentium IV from http://www.agner.org/
+    { ISD::FDIV, MVT::f64,        38 }, // Pentium IV from http://www.agner.org/
+    { ISD::FDIV, MVT::v2f64,      69 }, // Pentium IV from http://www.agner.org/
 
     // It is not a good idea to vectorize division. We have to scalarize it and
     // in the process we will often end up having to spilling regular
@@ -804,7 +836,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse,   MVT::v32i8,  2 }, // vperm2i128 + pshufb
 
     { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
-    { TTI::SK_Alternate, MVT::v32i8,  1 }  // vpblendvb
+    { TTI::SK_Alternate, MVT::v32i8,  1 }, // vpblendvb
+
+    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
+    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
+    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
+                                                  // + vpblendvb
+    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }  // vperm2i128 + 2 * vpshufb
+                                                  // + vpblendvb
   };
 
   if (ST->hasAVX2())
@@ -861,7 +900,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse,   MVT::v16i8,  1 }, // pshufb
 
     { TTI::SK_Alternate, MVT::v8i16,  3 }, // pshufb + pshufb + por
-    { TTI::SK_Alternate, MVT::v16i8,  3 }  // pshufb + pshufb + por
+    { TTI::SK_Alternate, MVT::v16i8,  3 }, // pshufb + pshufb + por
+
+    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
+    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }  // pshufb
   };
 
   if (ST->hasSSSE3())
@@ -886,7 +928,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Alternate, MVT::v2f64,  1 }, // movsd
     { TTI::SK_Alternate, MVT::v4i32,  2 }, // 2*shufps
     { TTI::SK_Alternate, MVT::v8i16,  3 }, // pand + pandn + por
-    { TTI::SK_Alternate, MVT::v16i8,  3 }  // pand + pandn + por
+    { TTI::SK_Alternate, MVT::v16i8,  3 }, // pand + pandn + por
+
+    { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
+    { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }  // pshufd
   };
 
   if (ST->hasSSE2())
@@ -906,7 +951,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                 const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -1272,7 +1318,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                   const Instruction *I) {
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 
@@ -1338,17 +1385,62 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
 
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
+unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
+
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                      unsigned ScalarizationCostPassed) {
   // Costs should match the codegen from:
   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
   // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
   // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
   // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+  static const CostTblEntry AVX512CDCostTbl[] = {
+    { ISD::CTLZ,       MVT::v8i64,   1 },
+    { ISD::CTLZ,       MVT::v16i32,  1 },
+    { ISD::CTLZ,       MVT::v32i16,  8 },
+    { ISD::CTLZ,       MVT::v64i8,  20 },
+    { ISD::CTLZ,       MVT::v4i64,   1 },
+    { ISD::CTLZ,       MVT::v8i32,   1 },
+    { ISD::CTLZ,       MVT::v16i16,  4 },
+    { ISD::CTLZ,       MVT::v32i8,  10 },
+    { ISD::CTLZ,       MVT::v2i64,   1 },
+    { ISD::CTLZ,       MVT::v4i32,   1 },
+    { ISD::CTLZ,       MVT::v8i16,   4 },
+    { ISD::CTLZ,       MVT::v16i8,   4 },
+  };
+  static const CostTblEntry AVX512BWCostTbl[] = {
+    { ISD::BITREVERSE, MVT::v8i64,   5 },
+    { ISD::BITREVERSE, MVT::v16i32,  5 },
+    { ISD::BITREVERSE, MVT::v32i16,  5 },
+    { ISD::BITREVERSE, MVT::v64i8,   5 },
+    { ISD::CTLZ,       MVT::v8i64,  23 },
+    { ISD::CTLZ,       MVT::v16i32, 22 },
+    { ISD::CTLZ,       MVT::v32i16, 18 },
+    { ISD::CTLZ,       MVT::v64i8,  17 },
+    { ISD::CTPOP,      MVT::v8i64,   7 },
+    { ISD::CTPOP,      MVT::v16i32, 11 },
+    { ISD::CTPOP,      MVT::v32i16,  9 },
+    { ISD::CTPOP,      MVT::v64i8,   6 },
+    { ISD::CTTZ,       MVT::v8i64,  10 },
+    { ISD::CTTZ,       MVT::v16i32, 14 },
+    { ISD::CTTZ,       MVT::v32i16, 12 },
+    { ISD::CTTZ,       MVT::v64i8,   9 },
+  };
+  static const CostTblEntry AVX512CostTbl[] = {
+    { ISD::BITREVERSE, MVT::v8i64,  36 },
+    { ISD::BITREVERSE, MVT::v16i32, 24 },
+    { ISD::CTLZ,       MVT::v8i64,  29 },
+    { ISD::CTLZ,       MVT::v16i32, 35 },
+    { ISD::CTPOP,      MVT::v8i64,  16 },
+    { ISD::CTPOP,      MVT::v16i32, 24 },
+    { ISD::CTTZ,       MVT::v8i64,  20 },
+    { ISD::CTTZ,       MVT::v16i32, 28 },
+  };
   static const CostTblEntry XOPCostTbl[] = {
     { ISD::BITREVERSE, MVT::v4i64,   4 },
     { ISD::BITREVERSE, MVT::v8i32,   4 },
@@ -1391,25 +1483,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT,      MVT::v4f64,  28 }, // Haswell from http://www.agner.org/
   };
   static const CostTblEntry AVX1CostTbl[] = {
-    { ISD::BITREVERSE, MVT::v4i64,  10 },
-    { ISD::BITREVERSE, MVT::v8i32,  10 },
-    { ISD::BITREVERSE, MVT::v16i16, 10 },
-    { ISD::BITREVERSE, MVT::v32i8,  10 },
+    { ISD::BITREVERSE, MVT::v4i64,  12 }, // 2 x 128-bit Op + extract/insert
+    { ISD::BITREVERSE, MVT::v8i32,  12 }, // 2 x 128-bit Op + extract/insert
+    { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
+    { ISD::BITREVERSE, MVT::v32i8,  12 }, // 2 x 128-bit Op + extract/insert
     { ISD::BSWAP,      MVT::v4i64,   4 },
     { ISD::BSWAP,      MVT::v8i32,   4 },
     { ISD::BSWAP,      MVT::v16i16,  4 },
-    { ISD::CTLZ,       MVT::v4i64,  46 },
-    { ISD::CTLZ,       MVT::v8i32,  36 },
-    { ISD::CTLZ,       MVT::v16i16, 28 },
-    { ISD::CTLZ,       MVT::v32i8,  18 },
-    { ISD::CTPOP,      MVT::v4i64,  14 },
-    { ISD::CTPOP,      MVT::v8i32,  22 },
-    { ISD::CTPOP,      MVT::v16i16, 18 },
-    { ISD::CTPOP,      MVT::v32i8,  12 },
-    { ISD::CTTZ,       MVT::v4i64,  20 },
-    { ISD::CTTZ,       MVT::v8i32,  28 },
-    { ISD::CTTZ,       MVT::v16i16, 24 },
-    { ISD::CTTZ,       MVT::v32i8,  18 },
+    { ISD::CTLZ,       MVT::v4i64,  48 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ,       MVT::v8i32,  38 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ,       MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTLZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTPOP,      MVT::v4i64,  16 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTPOP,      MVT::v8i32,  24 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTPOP,      MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTPOP,      MVT::v32i8,  14 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTTZ,       MVT::v4i64,  22 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTTZ,       MVT::v8i32,  30 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTTZ,       MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
+    { ISD::CTTZ,       MVT::v32i8,  20 }, // 2 x 128-bit Op + extract/insert
     { ISD::FSQRT,      MVT::f32,    14 }, // SNB from http://www.agner.org/
     { ISD::FSQRT,      MVT::v4f32,  14 }, // SNB from http://www.agner.org/
     { ISD::FSQRT,      MVT::v8f32,  28 }, // SNB from http://www.agner.org/
@@ -1418,8 +1510,8 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
   };
   static const CostTblEntry SSE42CostTbl[] = {
-    { ISD::FSQRT, MVT::f32,   18 }, // Nehalem from http://www.agner.org/
-    { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
+    { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
   };
   static const CostTblEntry SSSE3CostTbl[] = {
     { ISD::BITREVERSE, MVT::v2i64,   5 },
@@ -1443,6 +1535,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::CTTZ,       MVT::v16i8,   9 }
   };
   static const CostTblEntry SSE2CostTbl[] = {
+    { ISD::BITREVERSE, MVT::v2i64,  29 },
+    { ISD::BITREVERSE, MVT::v4i32,  27 },
+    { ISD::BITREVERSE, MVT::v8i16,  27 },
+    { ISD::BITREVERSE, MVT::v16i8,  20 },
     { ISD::BSWAP,      MVT::v2i64,   7 },
     { ISD::BSWAP,      MVT::v4i32,   7 },
     { ISD::BSWAP,      MVT::v8i16,   7 },
@@ -1462,8 +1558,16 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
   };
   static const CostTblEntry SSE1CostTbl[] = {
-    { ISD::FSQRT, MVT::f32,   28 }, // Pentium III from http://www.agner.org/
-    { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+    { ISD::FSQRT,      MVT::f32,    28 }, // Pentium III from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
+  };
+  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+    { ISD::BITREVERSE, MVT::i64,    14 }
+  };
+  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+    { ISD::BITREVERSE, MVT::i32,    14 },
+    { ISD::BITREVERSE, MVT::i16,    14 },
+    { ISD::BITREVERSE, MVT::i8,     11 }
   };
 
   unsigned ISD = ISD::DELETED_NODE;
@@ -1495,6 +1599,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   MVT MTy = LT.second;
 
   // Attempt to lookup cost.
+  if (ST->hasCDI())
+    if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasBWI())
+    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->hasAVX512())
+    if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
   if (ST->hasXOP())
     if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
@@ -1523,12 +1639,19 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
 
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+  if (ST->is64Bit())
+    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+    return LT.first * Entry->Cost;
+
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<Value *> Args, FastMathFlags FMF) {
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+                                      ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
 }
 
 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -1562,22 +1685,8 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
 }
 
-int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
-  assert (Ty->isVectorTy() && "Can only scalarize vectors");
-  int Cost = 0;
-
-  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-    if (Insert)
-      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
-    if (Extract)
-      Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-  }
-
-  return Cost;
-}
-
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace) {
+                                unsigned AddressSpace, const Instruction *I) {
   // Handle non-power-of-two vectors such as <3 x float>
   if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
     unsigned NumElem = VTy->getVectorNumElements();
@@ -2128,11 +2237,125 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
   return (CallerBits & CalleeBits) == CalleeBits;
 }
 
+bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+  // TODO: We can increase these based on available vector ops.
+  MaxLoadSize = ST->is64Bit() ? 8 : 4;
+  return true;
+}
+
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
   // TODO: We expect this to be beneficial regardless of arch,
   // but there are currently some unexplained performance artifacts on Atom.
   // As a temporary solution, disable on Atom.
-  return !(ST->isAtom() || ST->isSLM());
+  return !(ST->isAtom());
+}
+
+// Get estimation for interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved-access as given in \p Factor) the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+                                               unsigned Factor,
+                                               ArrayRef<unsigned> Indices,
+                                               unsigned Alignment,
+                                               unsigned AddressSpace) {
+
+  // We currently Support only fully-interleaved groups, with no gaps.
+  // TODO: Support also strided loads (interleaved-groups with gaps).
+  if (Indices.size() && Indices.size() != Factor)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace);
+
+  // VecTy for interleave memop is <VF*Factor x Elt>.
+  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+  // VecTy = <12 x i32>.
+  MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+  // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+  // the VF=2, while v2i128 is an unsupported MVT vector type
+  // (see MachineValueType.h::getVectorVT()).
+  if (!LegalVT.isVector())
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace);
+
+  unsigned VF = VecTy->getVectorNumElements() / Factor;
+  Type *ScalarTy = VecTy->getVectorElementType();
+
+  // Calculate the number of memory operations (NumOfMemOps), required
+  // for load/store the VecTy.
+  unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+  unsigned LegalVTSize = LegalVT.getStoreSize();
+  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+  // Get the cost of one memory operation.
+  Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+                                        LegalVT.getVectorNumElements());
+  unsigned MemOpCost =
+      getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+  VectorType *VT = VectorType::get(ScalarTy, VF);
+  EVT ETy = TLI->getValueType(DL, VT);
+  if (!ETy.isSimple())
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace);
+
+  // TODO: Complete for other data-types and strides.
+  // Each combination of Stride, ElementTy and VF results in a different
+  // sequence; The cost tables are therefore accessed with:
+  // Factor (stride) and VectorType=VFxElemType.
+  // The Cost accounts only for the shuffle sequence;
+  // The cost of the loads/stores is accounted for separately.
+  //
+  static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+    { 3, MVT::v2i8,  10 }, //(load 6i8 and)   deinterleave into 3 x 2i8
+    { 3, MVT::v4i8,   4 }, //(load 12i8 and)  deinterleave into 3 x 4i8
+    { 3, MVT::v8i8,   9 }, //(load 24i8 and)  deinterleave into 3 x 8i8
+    { 3, MVT::v16i8, 18 }, //(load 48i8 and)  deinterleave into 3 x 16i8
+    { 3, MVT::v32i8, 42 }, //(load 96i8 and)  deinterleave into 3 x 32i8
+
+    { 4, MVT::v2i8,  12 }, //(load 8i8 and)   deinterleave into 4 x 2i8
+    { 4, MVT::v4i8,   4 }, //(load 16i8 and)  deinterleave into 4 x 4i8
+    { 4, MVT::v8i8,  20 }, //(load 32i8 and)  deinterleave into 4 x 8i8
+    { 4, MVT::v16i8, 39 }, //(load 64i8 and)  deinterleave into 4 x 16i8
+    { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
+  };
+
+  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+    { 3, MVT::v2i8,   7 }, //interleave 3 x 2i8  into 6i8 (and store)
+    { 3, MVT::v4i8,   8 }, //interleave 3 x 4i8  into 12i8 (and store)
+    { 3, MVT::v8i8,  11 }, //interleave 3 x 8i8  into 24i8 (and store)
+    { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
+    { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
+
+    { 4, MVT::v2i8,  12 }, //interleave 4 x 2i8  into 8i8 (and store)
+    { 4, MVT::v4i8,   9 }, //interleave 4 x 4i8  into 16i8 (and store)
+    { 4, MVT::v8i8,  16 }, //interleave 4 x 8i8  into 32i8 (and store)
+    { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
+    { 4, MVT::v32i8, 40 }  //interleave 4 x 32i8 into 128i8 (and store)
+  };
+
+  if (Opcode == Instruction::Load) {
+    if (const auto *Entry =
+            CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
+      return NumOfMemOps * MemOpCost + Entry->Cost;
+  } else {
+    assert(Opcode == Instruction::Store &&
+           "Expected Store Instruction at this point");
+    if (const auto *Entry =
+            CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
+      return NumOfMemOps * MemOpCost + Entry->Cost;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
 }
 
 // Get estimation for interleaved load/store operations and strided load.
@@ -2243,6 +2466,10 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace);
+  if (ST->hasAVX2())
+    return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
+                                          Alignment, AddressSpace);
+
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
 }
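Note on the new SLM MUL cost (hunk @@ -138,10 +142,15). The table entry of 17 follows directly from the throughputs quoted in the added comment: 3 long multiplies at throughput 2, 3 shifts at throughput 1, and 2 adds at throughput 4. A minimal arithmetic sketch, not part of the patch, assuming exactly those throughputs:

#include <cstdio>

// Illustrative only: derives the SLM v2i64 ISD::MUL table entry from the
// throughputs stated in the diff comment (muldq = 2, shift = 1, addq = 4).
int main() {
  int Cost = 3 * 2   // multiplies(3) at muldq throughput 2
           + 3 * 1   // shifts(3) at shift throughput 1
           + 2 * 4;  // adds(2) at addq throughput 4
  std::printf("SLM v2i64 MUL cost = %d\n", Cost); // prints 17
  return 0;
}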
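Note on the recurring "+2" in the shift tables. Many 256-bit rows change from a doubled 128-bit cost to that cost plus 2 (for example SHL v32i8 goes from 4 to 4+2 in SSE2UniformConstCostTable), mirroring the "2 x 128-bit Op + extract/insert" comments elsewhere in the patch: a 256-bit vector that is not legal on the target is split into two 128-bit halves, and the split and merge add roughly an extract/insert pair. A small sketch of that bookkeeping, assuming the per-half cost from the table:

#include <cstdio>

// Illustrative only: how a "2*X+2" table entry is composed for a 256-bit type
// that must be split on a pre-AVX2 target.
int main() {
  int Cost128 = 2;        // e.g. psllw + pand for one 128-bit half
  int SplitOverhead = 2;  // extract the upper half + insert the result
  int Cost256 = 2 * Cost128 + SplitOverhead;
  std::printf("v32i8 uniform-constant SHL cost = %d\n", Cost256); // prints 6
  return 0;
}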
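Note on the new AVX2 interleaved-access path. getInterleavedMemoryOpCostAVX2 charges the memory traffic (NumOfMemOps legal-width loads or stores) plus a table-driven shuffle cost keyed by the stride and the per-member vector type. A minimal walk-through, not part of the patch, for a fully-interleaved <48 x i8> load with Factor = 3; the MemOpCost value is an assumption for illustration, the rest comes from the code and the AVX2InterleavedLoadTbl entry { 3, v16i8, 18 }:

#include <cstdio>

// Illustrative only: mirrors the arithmetic in getInterleavedMemoryOpCostAVX2
// for an interleaved load of <48 x i8> with Factor = 3 on an AVX2 target.
int main() {
  unsigned Factor      = 3;
  unsigned NumElts     = 48;                 // <48 x i8>
  unsigned VF          = NumElts / Factor;   // 16 elements per member vector
  unsigned VecTySize   = 48;                 // store size of <48 x i8> in bytes
  unsigned LegalVTSize = 32;                 // legal type is v32i8 (32 bytes)
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; // = 2
  unsigned MemOpCost   = 1;                  // assumed getMemoryOpCost(Load, v32i8)
  unsigned ShuffleCost = 18;                 // table entry { 3, v16i8, 18 }
  std::printf("VF = %u, total cost = %u\n", VF,
              NumOfMemOps * MemOpCost + ShuffleCost); // 2*1 + 18 = 20
  return 0;
}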