diff options
Diffstat (limited to 'contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp')
-rw-r--r-- | contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 319 |
1 files changed, 237 insertions, 82 deletions
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index f3268d2..8b930bd 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -981,6 +981,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, bool MadeChange = false; APInt UndefElts2(VWidth, 0); + APInt UndefElts3(VWidth, 0); Value *TmpV; switch (I->getOpcode()) { default: break; @@ -1020,8 +1021,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } case Instruction::ShuffleVector: { ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I); - uint64_t LHSVWidth = - cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements(); + unsigned LHSVWidth = + Shuffle->getOperand(0)->getType()->getVectorNumElements(); APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0); for (unsigned i = 0; i < VWidth; i++) { if (DemandedElts[i]) { @@ -1037,17 +1038,21 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } } - APInt UndefElts4(LHSVWidth, 0); + APInt LHSUndefElts(LHSVWidth, 0); TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded, - UndefElts4, Depth + 1); + LHSUndefElts, Depth + 1); if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } - APInt UndefElts3(LHSVWidth, 0); + APInt RHSUndefElts(LHSVWidth, 0); TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded, - UndefElts3, Depth + 1); + RHSUndefElts, Depth + 1); if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } bool NewUndefElts = false; + unsigned LHSIdx = -1u, LHSValIdx = -1u; + unsigned RHSIdx = -1u, RHSValIdx = -1u; + bool LHSUniform = true; + bool RHSUniform = true; for (unsigned i = 0; i < VWidth; i++) { unsigned MaskVal = Shuffle->getMaskValue(i); if (MaskVal == -1u) { @@ -1056,18 +1061,59 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, NewUndefElts = true; UndefElts.setBit(i); } else if (MaskVal < LHSVWidth) { - if (UndefElts4[MaskVal]) { + if (LHSUndefElts[MaskVal]) { NewUndefElts = true; UndefElts.setBit(i); + } else { + LHSIdx = LHSIdx == -1u ? i : LHSVWidth; + LHSValIdx = LHSValIdx == -1u ? MaskVal : LHSVWidth; + LHSUniform = LHSUniform && (MaskVal == i); } } else { - if (UndefElts3[MaskVal - LHSVWidth]) { + if (RHSUndefElts[MaskVal - LHSVWidth]) { NewUndefElts = true; UndefElts.setBit(i); + } else { + RHSIdx = RHSIdx == -1u ? i : LHSVWidth; + RHSValIdx = RHSValIdx == -1u ? MaskVal - LHSVWidth : LHSVWidth; + RHSUniform = RHSUniform && (MaskVal - LHSVWidth == i); } } } + // Try to transform shuffle with constant vector and single element from + // this constant vector to single insertelement instruction. + // shufflevector V, C, <v1, v2, .., ci, .., vm> -> + // insertelement V, C[ci], ci-n + if (LHSVWidth == Shuffle->getType()->getNumElements()) { + Value *Op = nullptr; + Constant *Value = nullptr; + unsigned Idx = -1u; + + // Find constant vector with the single element in shuffle (LHS or RHS). + if (LHSIdx < LHSVWidth && RHSUniform) { + if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) { + Op = Shuffle->getOperand(1); + Value = CV->getOperand(LHSValIdx); + Idx = LHSIdx; + } + } + if (RHSIdx < LHSVWidth && LHSUniform) { + if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) { + Op = Shuffle->getOperand(0); + Value = CV->getOperand(RHSValIdx); + Idx = RHSIdx; + } + } + // Found constant vector with single element - convert to insertelement. + if (Op && Value) { + Instruction *New = InsertElementInst::Create( + Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx), + Shuffle->getName()); + InsertNewInstWith(New, *Shuffle); + return New; + } + } if (NewUndefElts) { // Add additional discovered undefs. SmallVector<Constant*, 16> Elts; @@ -1209,114 +1255,223 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, switch (II->getIntrinsicID()) { default: break; + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: + // The instructions for these intrinsics are speced to zero upper bits not + // pass them through like other scalar intrinsics. So we shouldn't just + // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. + // Instead we should return a zero vector. + if (!DemandedElts[0]) { + Worklist.Add(II); + return ConstantAggregateZero::get(II->getType()); + } + + // Only the lower element is used. + DemandedElts = 1; + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, + UndefElts, Depth + 1); + if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } + + // Only the lower element is undefined. The high elements are zero. + UndefElts = UndefElts[0]; + break; + // Unary scalar-as-vector operations that work column-wise. case Intrinsic::x86_sse_rcp_ss: case Intrinsic::x86_sse_rsqrt_ss: case Intrinsic::x86_sse_sqrt_ss: case Intrinsic::x86_sse2_sqrt_sd: - case Intrinsic::x86_xop_vfrcz_ss: - case Intrinsic::x86_xop_vfrcz_sd: TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, UndefElts, Depth + 1); if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } // If lowest element of a scalar op isn't used then use Arg0. - if (DemandedElts.getLoBits(1) != 1) + if (!DemandedElts[0]) { + Worklist.Add(II); return II->getArgOperand(0); + } // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions // checks). break; - // Binary scalar-as-vector operations that work column-wise. A dest element - // is a function of the corresponding input elements from the two inputs. - case Intrinsic::x86_sse_add_ss: - case Intrinsic::x86_sse_sub_ss: - case Intrinsic::x86_sse_mul_ss: - case Intrinsic::x86_sse_div_ss: + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0. The low element is a function of both + // operands. case Intrinsic::x86_sse_min_ss: case Intrinsic::x86_sse_max_ss: case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse2_add_sd: - case Intrinsic::x86_sse2_sub_sd: - case Intrinsic::x86_sse2_mul_sd: - case Intrinsic::x86_sse2_div_sd: case Intrinsic::x86_sse2_min_sd: case Intrinsic::x86_sse2_max_sd: - case Intrinsic::x86_sse2_cmp_sd: - case Intrinsic::x86_sse41_round_ss: - case Intrinsic::x86_sse41_round_sd: + case Intrinsic::x86_sse2_cmp_sd: { TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, UndefElts, Depth + 1); if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + Worklist.Add(II); + return II->getArgOperand(0); + } + + // Only lower element is used for operand 1. + DemandedElts = 1; TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, UndefElts2, Depth + 1); if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } - // If only the low elt is demanded and this is a scalarizable intrinsic, - // scalarize it now. - if (DemandedElts == 1) { - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::x86_sse_add_ss: - case Intrinsic::x86_sse_sub_ss: - case Intrinsic::x86_sse_mul_ss: - case Intrinsic::x86_sse_div_ss: - case Intrinsic::x86_sse2_add_sd: - case Intrinsic::x86_sse2_sub_sd: - case Intrinsic::x86_sse2_mul_sd: - case Intrinsic::x86_sse2_div_sd: - // TODO: Lower MIN/MAX/etc. - Value *LHS = II->getArgOperand(0); - Value *RHS = II->getArgOperand(1); - // Extract the element as scalars. - LHS = InsertNewInstWith(ExtractElementInst::Create(LHS, - ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); - RHS = InsertNewInstWith(ExtractElementInst::Create(RHS, - ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); - - switch (II->getIntrinsicID()) { - default: llvm_unreachable("Case stmts out of sync!"); - case Intrinsic::x86_sse_add_ss: - case Intrinsic::x86_sse2_add_sd: - TmpV = InsertNewInstWith(BinaryOperator::CreateFAdd(LHS, RHS, - II->getName()), *II); - break; - case Intrinsic::x86_sse_sub_ss: - case Intrinsic::x86_sse2_sub_sd: - TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS, - II->getName()), *II); - break; - case Intrinsic::x86_sse_mul_ss: - case Intrinsic::x86_sse2_mul_sd: - TmpV = InsertNewInstWith(BinaryOperator::CreateFMul(LHS, RHS, - II->getName()), *II); - break; - case Intrinsic::x86_sse_div_ss: - case Intrinsic::x86_sse2_div_sd: - TmpV = InsertNewInstWith(BinaryOperator::CreateFDiv(LHS, RHS, - II->getName()), *II); - break; - } - - Instruction *New = - InsertElementInst::Create( - UndefValue::get(II->getType()), TmpV, - ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U, false), - II->getName()); - InsertNewInstWith(New, *II); - return New; - } + // Lower element is undefined if both lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0]) + UndefElts.clearBit(0); + + break; + } + + // Binary scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element comes from operand 1. + case Intrinsic::x86_sse41_round_ss: + case Intrinsic::x86_sse41_round_sd: { + // Don't use the low element of operand 0. + APInt DemandedElts2 = DemandedElts; + DemandedElts2.clearBit(0); + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts2, + UndefElts, Depth + 1); + if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } + + // If lowest element of a scalar op isn't used then use Arg0. + if (!DemandedElts[0]) { + Worklist.Add(II); + return II->getArgOperand(0); } + // Only lower element is used for operand 1. + DemandedElts = 1; + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, + UndefElts2, Depth + 1); + if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } + + // Take the high undef elements from operand 0 and take the lower element + // from operand 1. + UndefElts.clearBit(0); + UndefElts |= UndefElts2[0]; + break; + } + + // Three input scalar-as-vector operations that work column-wise. The high + // elements come from operand 0 and the low element is a function of all + // three inputs. + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_max_ss_round: + case Intrinsic::x86_avx512_mask_min_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + case Intrinsic::x86_avx512_mask_max_sd_round: + case Intrinsic::x86_avx512_mask_min_sd_round: + case Intrinsic::x86_fma_vfmadd_ss: + case Intrinsic::x86_fma_vfmsub_ss: + case Intrinsic::x86_fma_vfnmadd_ss: + case Intrinsic::x86_fma_vfnmsub_ss: + case Intrinsic::x86_fma_vfmadd_sd: + case Intrinsic::x86_fma_vfmsub_sd: + case Intrinsic::x86_fma_vfnmadd_sd: + case Intrinsic::x86_fma_vfnmsub_sd: + case Intrinsic::x86_avx512_mask_vfmadd_ss: + case Intrinsic::x86_avx512_mask_vfmadd_sd: + case Intrinsic::x86_avx512_maskz_vfmadd_ss: + case Intrinsic::x86_avx512_maskz_vfmadd_sd: + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, + UndefElts, Depth + 1); + if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } + // If lowest element of a scalar op isn't used then use Arg0. - if (DemandedElts.getLoBits(1) != 1) + if (!DemandedElts[0]) { + Worklist.Add(II); return II->getArgOperand(0); + } + + // Only lower element is used for operand 1 and 2. + DemandedElts = 1; + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, + UndefElts2, Depth + 1); + if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts, + UndefElts3, Depth + 1); + if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; } + + // Lower element is undefined if all three lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0] || !UndefElts3[0]) + UndefElts.clearBit(0); - // Output elements are undefined if both are undefined. Consider things - // like undef&0. The result is known zero, not undef. - UndefElts &= UndefElts2; break; + case Intrinsic::x86_avx512_mask3_vfmadd_ss: + case Intrinsic::x86_avx512_mask3_vfmadd_sd: + case Intrinsic::x86_avx512_mask3_vfmsub_ss: + case Intrinsic::x86_avx512_mask3_vfmsub_sd: + case Intrinsic::x86_avx512_mask3_vfnmsub_ss: + case Intrinsic::x86_avx512_mask3_vfnmsub_sd: + // These intrinsics get the passthru bits from operand 2. + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(2), DemandedElts, + UndefElts, Depth + 1); + if (TmpV) { II->setArgOperand(2, TmpV); MadeChange = true; } + + // If lowest element of a scalar op isn't used then use Arg2. + if (!DemandedElts[0]) { + Worklist.Add(II); + return II->getArgOperand(2); + } + + // Only lower element is used for operand 0 and 1. + DemandedElts = 1; + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, + UndefElts2, Depth + 1); + if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, + UndefElts3, Depth + 1); + if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } + + // Lower element is undefined if all three lower elements are undefined. + // Consider things like undef&0. The result is known zero, not undef. + if (!UndefElts2[0] || !UndefElts3[0]) + UndefElts.clearBit(0); + + break; + + case Intrinsic::x86_sse2_pmulu_dq: + case Intrinsic::x86_sse41_pmuldq: + case Intrinsic::x86_avx2_pmul_dq: + case Intrinsic::x86_avx2_pmulu_dq: + case Intrinsic::x86_avx512_pmul_dq_512: + case Intrinsic::x86_avx512_pmulu_dq_512: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned InnerVWidth = Op0->getType()->getVectorNumElements(); + assert((VWidth * 2) == InnerVWidth && "Unexpected input size"); + + APInt InnerDemandedElts(InnerVWidth, 0); + for (unsigned i = 0; i != VWidth; ++i) + if (DemandedElts[i]) + InnerDemandedElts.setBit(i * 2); + + UndefElts2 = APInt(InnerVWidth, 0); + TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2, + Depth + 1); + if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } + + UndefElts3 = APInt(InnerVWidth, 0); + TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3, + Depth + 1); + if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } + + break; + } + // SSE4A instructions leave the upper 64-bits of the 128-bit result // in an undefined state. case Intrinsic::x86_sse4a_extrq: |