Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp  519
1 file changed, 373 insertions, 146 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 5715d82..c9924f2 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -78,7 +78,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
return 8;
}
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasAVX512())
return 512;
@@ -95,6 +95,10 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
return 32;
}
+unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
+ return getRegisterBitWidth(true);
+}
+
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
// If the loop will not be vectorized, don't interleave the loop.
// Let the regular unroll pass unroll the loop, which saves the overflow
@@ -114,7 +118,7 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
+ unsigned Opcode, Type *Ty,
TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo,
@@ -138,10 +142,15 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::v2f64, 69 }, // divpd
{ ISD::FADD, MVT::v2f64, 2 }, // addpd
{ ISD::FSUB, MVT::v2f64, 2 }, // subpd
- // v2i64/v4i64 mul is custom lowered as a series of long
- // multiplies(3), shifts(3) and adds(2).
- // slm muldq version throughput is 2
- { ISD::MUL, MVT::v2i64, 11 },
+ // v2i64/v4i64 mul is custom lowered as a series of long
+ // multiplies(3), shifts(3) and adds(2).
+ // slm muldq version throughput is 2 and addq throughput is 4,
+ // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
+ // 2X4 (addq throughput) = 17
+ { ISD::MUL, MVT::v2i64, 17 },
+ // slm addq/subq throughput is 4
+ { ISD::ADD, MVT::v2i64, 4 },
+ { ISD::SUB, MVT::v2i64, 4 },
};
if (ST->isSLM()) {
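As a quick sanity check of the arithmetic behind the new SLM v2i64 MUL entry (an illustrative snippet, not part of the patch; the throughput figures are the ones quoted in the comment above):

    // Three long multiplies at reciprocal throughput 2, three shifts at 1,
    // and two adds at 4 give the table value of 17.
    static_assert(3 * 2 + 3 * 1 + 2 * 4 == 17,
                  "SLM v2i64 MUL: muldq + shift + addq throughput sum");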
@@ -207,6 +216,10 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry AVX512UniformConstCostTable[] = {
+ { ISD::SRA, MVT::v2i64, 1 },
+ { ISD::SRA, MVT::v4i64, 1 },
+ { ISD::SRA, MVT::v8i64, 1 },
+
{ ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
};
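For orientation, these tables are reached when a client such as the loop or SLP vectorizer prices an instruction through TargetTransformInfo. A minimal caller-side sketch, assuming TTI is a TargetTransformInfo for an X86 subtarget and VecTy is the vector type being costed (both names are illustrative):

    // A splat-constant second operand is reported as OK_UniformConstantValue,
    // which is what routes the query into the *UniformConstCostTable lookups
    // in this function.
    int Cost = TTI.getArithmeticInstrCost(
        Instruction::AShr, VecTy,
        TargetTransformInfo::OK_AnyValue,
        TargetTransformInfo::OK_UniformConstantValue);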
@@ -239,35 +252,38 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry SSE2UniformConstCostTable[] = {
- { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
- { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
- { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
-
- { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
- { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
- { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
-
- { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
- { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
- { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
- { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
- { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
- { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
- { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
- { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
+ { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
+ { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+
+ { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
- return LT.first * 30;
+ return LT.first * 32;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
- if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
+ // XOP has faster vXi8 shifts.
+ if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
+ !ST->hasXOP())
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
}
static const CostTblEntry AVX2UniformCostTable[] = {
@@ -319,6 +335,14 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
+
{ ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw
@@ -347,8 +371,12 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SHL, MVT::v16i32, 1 },
{ ISD::SRL, MVT::v16i32, 1 },
{ ISD::SRA, MVT::v16i32, 1 },
+
{ ISD::SHL, MVT::v8i64, 1 },
{ ISD::SRL, MVT::v8i64, 1 },
+
+ { ISD::SRA, MVT::v2i64, 1 },
+ { ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
@@ -410,18 +438,18 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v2i64, 2 },
{ ISD::SRA, MVT::v2i64, 2 },
// 256bit shifts require splitting if AVX2 didn't catch them above.
- { ISD::SHL, MVT::v32i8, 2 },
- { ISD::SRL, MVT::v32i8, 4 },
- { ISD::SRA, MVT::v32i8, 4 },
- { ISD::SHL, MVT::v16i16, 2 },
- { ISD::SRL, MVT::v16i16, 4 },
- { ISD::SRA, MVT::v16i16, 4 },
- { ISD::SHL, MVT::v8i32, 2 },
- { ISD::SRL, MVT::v8i32, 4 },
- { ISD::SRA, MVT::v8i32, 4 },
- { ISD::SHL, MVT::v4i64, 2 },
- { ISD::SRL, MVT::v4i64, 4 },
- { ISD::SRA, MVT::v4i64, 4 },
+ { ISD::SHL, MVT::v32i8, 2+2 },
+ { ISD::SRL, MVT::v32i8, 4+2 },
+ { ISD::SRA, MVT::v32i8, 4+2 },
+ { ISD::SHL, MVT::v16i16, 2+2 },
+ { ISD::SRL, MVT::v16i16, 4+2 },
+ { ISD::SRA, MVT::v16i16, 4+2 },
+ { ISD::SHL, MVT::v8i32, 2+2 },
+ { ISD::SRL, MVT::v8i32, 4+2 },
+ { ISD::SRA, MVT::v8i32, 4+2 },
+ { ISD::SHL, MVT::v4i64, 2+2 },
+ { ISD::SRL, MVT::v4i64, 4+2 },
+ { ISD::SRA, MVT::v4i64, 4+2 },
};
// Look for XOP lowering tricks.
@@ -431,23 +459,28 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
- { ISD::SHL, MVT::v16i16, 2 }, // psllw.
- { ISD::SHL, MVT::v8i32, 2 }, // pslld
- { ISD::SHL, MVT::v4i64, 2 }, // psllq.
-
- { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
- { ISD::SRL, MVT::v8i32, 2 }, // psrld.
- { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
-
- { ISD::SRA, MVT::v16i16, 2 }, // psraw.
- { ISD::SRA, MVT::v8i32, 2 }, // psrad.
- { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
- { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
+ { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
+ { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
+ { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
+
+ { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
+ { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
+ { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
+
+ { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
+ { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
};
if (ST->hasSSE2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
+
+ // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
+ if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
+ return LT.first * 4; // 2*psrad + shuffle.
+
if (const auto *Entry =
CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
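The revised 256-bit entries all follow the same convention: a type that must be split is charged two copies of the 128-bit sequence plus a flat 2 for extracting and re-inserting the upper half. A small check of that reading against the SRA rows above (illustrative only):

    // v2i64 SRA costs 4 (2*psrad + shuffle); v4i64 is two such sequences
    // plus the fixed split/merge overhead of 2, i.e. the 8+2 in the table.
    static_assert(2 * 4 + 2 == 10, "v4i64 SRA = 2 x v2i64 SRA + split");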
@@ -561,28 +594,28 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE41CostTable[] = {
- { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
-
- { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend.
-
- { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
-
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
+
+ { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
+
+ { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+
+ { ISD::MUL, MVT::v4i32, 1 } // pmulld
};
if (ST->hasSSE41())
@@ -592,34 +625,33 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
- { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
-
- { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v8i16, 1 }, // pmullw
- { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
- { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
-
- { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+
+ { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
+ { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spill regular
@@ -804,7 +836,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
{ TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
- { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
+ { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
+
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
+ // + vpblendvb
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 } // vperm2i128 + 2 * vpshufb
+ // + vpblendvb
};
if (ST->hasAVX2())
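A hedged sketch of how the new single-source permute entries are reached from the public interface, assuming TTI and Ctx are an existing TargetTransformInfo and LLVMContext (illustrative names, not part of the patch):

    // Permuting the lanes of one <8 x i32> source; with the AVX2 entries
    // above this resolves to cost 1, matching a single vpermd.
    Type *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 8);
    int Cost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                  VecTy, /*Index=*/0, /*SubTp=*/nullptr);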
@@ -861,7 +900,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
- { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por
+
+ { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb
};
if (ST->hasSSSE3())
@@ -886,7 +928,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
{ TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
- { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
+
+ { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd
};
if (ST->hasSSE2())
@@ -906,7 +951,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -1272,7 +1318,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -1338,17 +1385,62 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
+unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
+
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) {
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
// CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
// CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
// CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+ static const CostTblEntry AVX512CDCostTbl[] = {
+ { ISD::CTLZ, MVT::v8i64, 1 },
+ { ISD::CTLZ, MVT::v16i32, 1 },
+ { ISD::CTLZ, MVT::v32i16, 8 },
+ { ISD::CTLZ, MVT::v64i8, 20 },
+ { ISD::CTLZ, MVT::v4i64, 1 },
+ { ISD::CTLZ, MVT::v8i32, 1 },
+ { ISD::CTLZ, MVT::v16i16, 4 },
+ { ISD::CTLZ, MVT::v32i8, 10 },
+ { ISD::CTLZ, MVT::v2i64, 1 },
+ { ISD::CTLZ, MVT::v4i32, 1 },
+ { ISD::CTLZ, MVT::v8i16, 4 },
+ { ISD::CTLZ, MVT::v16i8, 4 },
+ };
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::BITREVERSE, MVT::v8i64, 5 },
+ { ISD::BITREVERSE, MVT::v16i32, 5 },
+ { ISD::BITREVERSE, MVT::v32i16, 5 },
+ { ISD::BITREVERSE, MVT::v64i8, 5 },
+ { ISD::CTLZ, MVT::v8i64, 23 },
+ { ISD::CTLZ, MVT::v16i32, 22 },
+ { ISD::CTLZ, MVT::v32i16, 18 },
+ { ISD::CTLZ, MVT::v64i8, 17 },
+ { ISD::CTPOP, MVT::v8i64, 7 },
+ { ISD::CTPOP, MVT::v16i32, 11 },
+ { ISD::CTPOP, MVT::v32i16, 9 },
+ { ISD::CTPOP, MVT::v64i8, 6 },
+ { ISD::CTTZ, MVT::v8i64, 10 },
+ { ISD::CTTZ, MVT::v16i32, 14 },
+ { ISD::CTTZ, MVT::v32i16, 12 },
+ { ISD::CTTZ, MVT::v64i8, 9 },
+ };
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v8i64, 36 },
+ { ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::CTLZ, MVT::v8i64, 29 },
+ { ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTPOP, MVT::v8i64, 16 },
+ { ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTTZ, MVT::v8i64, 20 },
+ { ISD::CTTZ, MVT::v16i32, 28 },
+ };
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
{ ISD::BITREVERSE, MVT::v8i32, 4 },
@@ -1391,25 +1483,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
};
static const CostTblEntry AVX1CostTbl[] = {
- { ISD::BITREVERSE, MVT::v4i64, 10 },
- { ISD::BITREVERSE, MVT::v8i32, 10 },
- { ISD::BITREVERSE, MVT::v16i16, 10 },
- { ISD::BITREVERSE, MVT::v32i8, 10 },
+ { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
{ ISD::BSWAP, MVT::v4i64, 4 },
{ ISD::BSWAP, MVT::v8i32, 4 },
{ ISD::BSWAP, MVT::v16i16, 4 },
- { ISD::CTLZ, MVT::v4i64, 46 },
- { ISD::CTLZ, MVT::v8i32, 36 },
- { ISD::CTLZ, MVT::v16i16, 28 },
- { ISD::CTLZ, MVT::v32i8, 18 },
- { ISD::CTPOP, MVT::v4i64, 14 },
- { ISD::CTPOP, MVT::v8i32, 22 },
- { ISD::CTPOP, MVT::v16i16, 18 },
- { ISD::CTPOP, MVT::v32i8, 12 },
- { ISD::CTTZ, MVT::v4i64, 20 },
- { ISD::CTTZ, MVT::v8i32, 28 },
- { ISD::CTTZ, MVT::v16i16, 24 },
- { ISD::CTTZ, MVT::v32i8, 18 },
+ { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -1418,8 +1510,8 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
};
static const CostTblEntry SSE42CostTbl[] = {
- { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
- { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSSE3CostTbl[] = {
{ ISD::BITREVERSE, MVT::v2i64, 5 },
@@ -1443,6 +1535,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTTZ, MVT::v16i8, 9 }
};
static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v2i64, 29 },
+ { ISD::BITREVERSE, MVT::v4i32, 27 },
+ { ISD::BITREVERSE, MVT::v8i16, 27 },
+ { ISD::BITREVERSE, MVT::v16i8, 20 },
{ ISD::BSWAP, MVT::v2i64, 7 },
{ ISD::BSWAP, MVT::v4i32, 7 },
{ ISD::BSWAP, MVT::v8i16, 7 },
@@ -1462,8 +1558,16 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSE1CostTbl[] = {
- { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
- { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+ { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+ };
+ static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::BITREVERSE, MVT::i64, 14 }
+ };
+ static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::BITREVERSE, MVT::i32, 14 },
+ { ISD::BITREVERSE, MVT::i16, 14 },
+ { ISD::BITREVERSE, MVT::i8, 11 }
};
unsigned ISD = ISD::DELETED_NODE;
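To make the lookup order added below concrete, here is a walk-through for costing a ctpop on v8i64, using only the table entries shown above (costs are scaled by the legalization factor LT.first, as in the code):

    // hasCDI():    AVX512CDCostTbl  has no CTPOP row               -> fall through
    // hasBWI():    AVX512BWCostTbl  { ISD::CTPOP, MVT::v8i64, 7 }  -> cost 7
    // hasAVX512(): AVX512CostTbl    { ISD::CTPOP, MVT::v8i64, 16 } -> cost 16
    //              (reached only when BWI is unavailable)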
@@ -1495,6 +1599,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->hasCDI())
+ if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -1523,12 +1639,19 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
}
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF) {
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
}
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -1562,22 +1685,8 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
-int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
- assert (Ty->isVectorTy() && "Can only scalarize vectors");
- int Cost = 0;
-
- for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
- if (Insert)
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
- if (Extract)
- Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
- }
-
- return Cost;
-}
-
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
// Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getVectorNumElements();
@@ -2128,11 +2237,125 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
+bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+ // TODO: We can increase these based on available vector ops.
+ MaxLoadSize = ST->is64Bit() ? 8 : 4;
+ return true;
+}
+
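A sketch of what the new expandMemCmp hook permits (a C-level illustration of the idea, not the expansion the target-independent pass actually emits): with MaxLoadSize of 8 on a 64-bit target, an equality-only 16-byte memcmp can be lowered to two 8-byte load-and-compare pairs instead of a library call. Here a and b are assumed to be plain byte buffers:

    #include <cstdint>
    #include <cstring>

    bool Equal16(const unsigned char *a, const unsigned char *b) {
      uint64_t A0, A1, B0, B1;
      std::memcpy(&A0, a, 8);     std::memcpy(&B0, b, 8);
      std::memcpy(&A1, a + 8, 8); std::memcpy(&B1, b + 8, 8);
      return (A0 == B0) & (A1 == B1);
    }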
bool X86TTIImpl::enableInterleavedAccessVectorization() {
// TODO: We expect this to be beneficial regardless of arch,
// but there are currently some unexplained performance artifacts on Atom.
// As a temporary solution, disable on Atom.
- return !(ST->isAtom() || ST->isSLM());
+ return !(ST->isAtom());
+}
+
+// Get estimation for interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved-access as given in \p Factor) the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // We currently support only fully-interleaved groups, with no gaps.
+ // TODO: Support also strided loads (interleaved-groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+ // the VF=2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ Type *ScalarTy = VecTy->getVectorElementType();
+
+ // Calculate the number of memory operations (NumOfMemOps), required
+ // for load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ VectorType *VT = VectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // TODO: Complete for other data-types and strides.
+ // Each combination of Stride, ElementTy and VF results in a different
+ // sequence; the cost tables are therefore accessed with:
+ // Factor (stride) and VectorType=VFxElemType.
+ // The cost accounts only for the shuffle sequence;
+ // the cost of the loads/stores is accounted for separately.
+ //
+ static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
+ { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
+ { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
+ { 3, MVT::v16i8, 18 }, //(load 48i8 and) deinterleave into 3 x 16i8
+ { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8
+
+ { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
+ { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
+ { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
+ { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
+ { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ };
+
+ static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
+ { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
+ { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
+ { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
+ { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
+
+ { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
+ { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
+ { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store)
+ { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
+ { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store)
+ };
+
+ if (Opcode == Instruction::Load) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ } else {
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
}
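A worked example of the cost composition above, assuming an aligned 256-bit load on AVX2 has a MemOpCost of 1 (the value getMemoryOpCost typically returns for it): a fully interleaved load with Factor = 4, VF = 32 and i8 elements has VecTy = <128 x i8> and LegalVT = v32i8, so NumOfMemOps = (128 + 32 - 1) / 32 = 4, the row { 4, MVT::v32i8, 80 } supplies the shuffle cost, and the function returns 4 * 1 + 80 = 84.

    // Illustrative recomputation of the numbers used above.
    static_assert((128 + 32 - 1) / 32 == 4, "NumOfMemOps for <128 x i8> via v32i8");
    static_assert(4 * 1 + 80 == 84, "NumOfMemOps * MemOpCost + table entry");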
// Get estimation for interleaved load/store operations and strided load.
@@ -2243,6 +2466,10 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
+ if (ST->hasAVX2())
+ return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}