Diffstat (limited to 'contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp')
-rw-r--r-- | contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 810
1 file changed, 634 insertions, 176 deletions
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 8acff91..2ef82ba 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -12,17 +12,47 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" -#include "llvm/IR/Dominators.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Statepoint.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <vector> + using namespace llvm; using namespace PatternMatch; @@ -79,8 +109,8 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) { } Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { - unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, AC, DT); - unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, AC, DT); + unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT); + unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT); unsigned MinAlign = std::min(DstAlign, SrcAlign); unsigned CopyAlign = MI->getAlignment(); @@ -162,10 +192,17 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { L->setAlignment(SrcAlign); if (CopyMD) L->setMetadata(LLVMContext::MD_tbaa, CopyMD); + MDNode *LoopMemParallelMD = + MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access); + if (LoopMemParallelMD) + L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); + StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile()); S->setAlignment(DstAlign); if (CopyMD) S->setMetadata(LLVMContext::MD_tbaa, CopyMD); + if (LoopMemParallelMD) + S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD); // Set the size of the copy to 0, it will be deleted on the next iteration. 
MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType())); @@ -173,7 +210,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { } Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, AC, DT); + unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); if (MI->getAlignment() < Alignment) { MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Alignment, false)); @@ -221,8 +258,7 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, bool ShiftLeft = false; switch (II.getIntrinsicID()) { - default: - return nullptr; + default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_sse2_psra_d: case Intrinsic::x86_sse2_psra_w: case Intrinsic::x86_sse2_psrai_d: @@ -231,6 +267,16 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_avx2_psrai_d: case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: LogicalShift = false; ShiftLeft = false; break; case Intrinsic::x86_sse2_psrl_d: @@ -245,6 +291,12 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: LogicalShift = true; ShiftLeft = false; break; case Intrinsic::x86_sse2_psll_d: @@ -259,6 +311,12 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: LogicalShift = true; ShiftLeft = true; break; } @@ -334,10 +392,16 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, bool ShiftLeft = false; switch (II.getIntrinsicID()) { - default: - return nullptr; + default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_avx2_psrav_d: case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: LogicalShift = false; ShiftLeft = false; break; @@ -345,6 +409,11 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, case Intrinsic::x86_avx2_psrlv_d_256: case Intrinsic::x86_avx2_psrlv_q: case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: LogicalShift = true; ShiftLeft = false; break; @@ 
-352,6 +421,11 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, case Intrinsic::x86_avx2_psllv_d_256: case Intrinsic::x86_avx2_psllv_q: case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: LogicalShift = true; ShiftLeft = true; break; @@ -400,7 +474,7 @@ static Value *simplifyX86varShift(const IntrinsicInst &II, // If all elements out of range or UNDEF, return vector of zeros/undefs. // ArithmeticShift should only hit this if they are all UNDEF. auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; - if (llvm::all_of(ShiftAmts, OutOfRange)) { + if (all_of(ShiftAmts, OutOfRange)) { SmallVector<Constant *, 8> ConstantVec; for (int Idx : ShiftAmts) { if (Idx < 0) { @@ -547,7 +621,7 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, // See if we're dealing with constant values. Constant *C0 = dyn_cast<Constant>(Op0); ConstantInt *CI0 = - C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) : nullptr; // Attempt to constant fold. @@ -630,7 +704,6 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, APInt APLength, APInt APIndex, InstCombiner::BuilderTy &Builder) { - // From AMD documentation: "The bit index and field length are each six bits // in length other bits of the field are ignored." APIndex = APIndex.zextOrTrunc(6); @@ -686,10 +759,10 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, Constant *C0 = dyn_cast<Constant>(Op0); Constant *C1 = dyn_cast<Constant>(Op1); ConstantInt *CI00 = - C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) : nullptr; ConstantInt *CI10 = - C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) : nullptr; // Constant Fold - insert bottom Length bits starting at the Index'th bit. @@ -732,11 +805,11 @@ static Value *simplifyX86pshufb(const IntrinsicInst &II, auto *VecTy = cast<VectorType>(II.getType()); auto *MaskEltTy = Type::getInt32Ty(II.getContext()); unsigned NumElts = VecTy->getNumElements(); - assert((NumElts == 16 || NumElts == 32) && + assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && "Unexpected number of elements in shuffle mask!"); // Construct a shuffle mask from constant integers or UNDEFs. - Constant *Indexes[32] = {NULL}; + Constant *Indexes[64] = {nullptr}; // Each byte in the shuffle control mask forms an index to permute the // corresponding byte in the destination operand. @@ -776,12 +849,15 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II, if (!V) return nullptr; + auto *VecTy = cast<VectorType>(II.getType()); auto *MaskEltTy = Type::getInt32Ty(II.getContext()); - unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); - assert(NumElts == 8 || NumElts == 4 || NumElts == 2); + unsigned NumElts = VecTy->getVectorNumElements(); + bool IsPD = VecTy->getScalarType()->isDoubleTy(); + unsigned NumLaneElts = IsPD ? 2 : 4; + assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); // Construct a shuffle mask from constant integers or UNDEFs. 
- Constant *Indexes[8] = {NULL}; + Constant *Indexes[16] = {nullptr}; // The intrinsics only read one or two bits, clear the rest. for (unsigned I = 0; I < NumElts; ++I) { @@ -799,18 +875,13 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II, // The PD variants uses bit 1 to select per-lane element index, so // shift down to convert to generic shuffle mask index. - if (II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd || - II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) + if (IsPD) Index = Index.lshr(1); // The _256 variants are a bit trickier since the mask bits always index // into the corresponding 128 half. In order to convert to a generic // shuffle, we have to make that explicit. - if ((II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 || - II.getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) && - ((NumElts / 2) <= I)) { - Index += APInt(32, NumElts / 2); - } + Index += APInt(32, (I / NumLaneElts) * NumLaneElts); Indexes[I] = ConstantInt::get(MaskEltTy, Index); } @@ -831,10 +902,11 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II, auto *VecTy = cast<VectorType>(II.getType()); auto *MaskEltTy = Type::getInt32Ty(II.getContext()); unsigned Size = VecTy->getNumElements(); - assert(Size == 8 && "Unexpected shuffle mask size"); + assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && + "Unexpected shuffle mask size"); // Construct a shuffle mask from constant integers or UNDEFs. - Constant *Indexes[8] = {NULL}; + Constant *Indexes[64] = {nullptr}; for (unsigned I = 0; I < Size; ++I) { Constant *COp = V->getAggregateElement(I); @@ -846,8 +918,8 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II, continue; } - APInt Index = cast<ConstantInt>(COp)->getValue(); - Index = Index.zextOrTrunc(32).getLoBits(3); + uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); + Index &= Size - 1; Indexes[I] = ConstantInt::get(MaskEltTy, Index); } @@ -962,6 +1034,36 @@ static Value *simplifyX86vpcom(const IntrinsicInst &II, return nullptr; } +// Emit a select instruction and appropriate bitcasts to help simplify +// masked intrinsics. +static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1, + InstCombiner::BuilderTy &Builder) { + unsigned VWidth = Op0->getType()->getVectorNumElements(); + + // If the mask is all ones we don't need the select. But we need to check + // only the bit thats will be used in case VWidth is less than 8. + if (auto *C = dyn_cast<ConstantInt>(Mask)) + if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue()) + return Op0; + + auto *MaskTy = VectorType::get(Builder.getInt1Ty(), + cast<IntegerType>(Mask->getType())->getBitWidth()); + Mask = Builder.CreateBitCast(Mask, MaskTy); + + // If we have less than 8 elements, then the starting mask was an i8 and + // we need to extract down to the right number of elements. 
+ if (VWidth < 8) { + uint32_t Indices[4]; + for (unsigned i = 0; i != VWidth; ++i) + Indices[i] = i; + Mask = Builder.CreateShuffleVector(Mask, Mask, + makeArrayRef(Indices, VWidth), + "extract"); + } + + return Builder.CreateSelect(Mask, Op0, Op1); +} + static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) { Value *Arg0 = II.getArgOperand(0); Value *Arg1 = II.getArgOperand(1); @@ -1104,6 +1206,50 @@ static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) { return nullptr; } +static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { + assert((II.getIntrinsicID() == Intrinsic::cttz || + II.getIntrinsicID() == Intrinsic::ctlz) && + "Expected cttz or ctlz intrinsic"); + Value *Op0 = II.getArgOperand(0); + // FIXME: Try to simplify vectors of integers. + auto *IT = dyn_cast<IntegerType>(Op0->getType()); + if (!IT) + return nullptr; + + unsigned BitWidth = IT->getBitWidth(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + IC.computeKnownBits(Op0, KnownZero, KnownOne, 0, &II); + + // Create a mask for bits above (ctlz) or below (cttz) the first known one. + bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; + unsigned NumMaskBits = IsTZ ? KnownOne.countTrailingZeros() + : KnownOne.countLeadingZeros(); + APInt Mask = IsTZ ? APInt::getLowBitsSet(BitWidth, NumMaskBits) + : APInt::getHighBitsSet(BitWidth, NumMaskBits); + + // If all bits above (ctlz) or below (cttz) the first known one are known + // zero, this value is constant. + // FIXME: This should be in InstSimplify because we're replacing an + // instruction with a constant. + if ((Mask & KnownZero) == Mask) { + auto *C = ConstantInt::get(IT, APInt(BitWidth, NumMaskBits)); + return IC.replaceInstUsesWith(II, C); + } + + // If the input to cttz/ctlz is known to be non-zero, + // then change the 'ZeroIsUndef' parameter to 'true' + // because we know the zero behavior can't affect the result. + if (KnownOne != 0 || isKnownNonZero(Op0, IC.getDataLayout())) { + if (!match(II.getArgOperand(1), m_One())) { + II.setOperand(1, IC.Builder->getTrue()); + return &II; + } + } + + return nullptr; +} + // TODO: If the x86 backend knew how to convert a bool vector mask back to an // XMM register mask efficiently, we could transform all x86 masked intrinsics // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. @@ -1243,16 +1389,15 @@ Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) { Instruction *InstCombiner::visitCallInst(CallInst &CI) { auto Args = CI.arg_operands(); if (Value *V = SimplifyCall(CI.getCalledValue(), Args.begin(), Args.end(), DL, - TLI, DT, AC)) + &TLI, &DT, &AC)) return replaceInstUsesWith(CI, V); - if (isFreeCall(&CI, TLI)) + if (isFreeCall(&CI, &TLI)) return visitFree(CI); // If the caller function is nounwind, mark the call as nounwind, even if the // callee isn't. 
- if (CI.getParent()->getParent()->doesNotThrow() && - !CI.doesNotThrow()) { + if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) { CI.setDoesNotThrow(); return &CI; } @@ -1323,26 +1468,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); }; - auto SimplifyDemandedVectorEltsHigh = [this](Value *Op, unsigned Width, - unsigned DemandedWidth) { - APInt UndefElts(Width, 0); - APInt DemandedElts = APInt::getHighBitsSet(Width, DemandedWidth); - return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); - }; switch (II->getIntrinsicID()) { default: break; - case Intrinsic::objectsize: { - uint64_t Size; - if (getObjectSize(II->getArgOperand(0), Size, DL, TLI)) { - APInt APSize(II->getType()->getIntegerBitWidth(), Size); - // Equality check to be sure that `Size` can fit in a value of type - // `II->getType()` - if (APSize == Size) - return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), APSize)); - } + case Intrinsic::objectsize: + if (ConstantInt *N = + lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false)) + return replaceInstUsesWith(CI, N); return nullptr; - } + case Intrinsic::bswap: { Value *IIOperand = II->getArgOperand(0); Value *X = nullptr; @@ -1397,41 +1531,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { II->getArgOperand(0)); } break; - case Intrinsic::cttz: { - // If all bits below the first known one are known zero, - // this value is constant. - IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType()); - // FIXME: Try to simplify vectors of integers. - if (!IT) break; - uint32_t BitWidth = IT->getBitWidth(); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne, 0, II); - unsigned TrailingZeros = KnownOne.countTrailingZeros(); - APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros)); - if ((Mask & KnownZero) == Mask) - return replaceInstUsesWith(CI, ConstantInt::get(IT, - APInt(BitWidth, TrailingZeros))); - - } - break; - case Intrinsic::ctlz: { - // If all bits above the first known one are known zero, - // this value is constant. - IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType()); - // FIXME: Try to simplify vectors of integers. - if (!IT) break; - uint32_t BitWidth = IT->getBitWidth(); - APInt KnownZero(BitWidth, 0); - APInt KnownOne(BitWidth, 0); - computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne, 0, II); - unsigned LeadingZeros = KnownOne.countLeadingZeros(); - APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros)); - if ((Mask & KnownZero) == Mask) - return replaceInstUsesWith(CI, ConstantInt::get(IT, - APInt(BitWidth, LeadingZeros))); - } + case Intrinsic::cttz: + case Intrinsic::ctlz: + if (auto *I = foldCttzCtlz(*II, *this)) + return I; break; case Intrinsic::uadd_with_overflow: @@ -1446,7 +1550,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { II->setArgOperand(1, LHS); return II; } - // fall through + LLVM_FALLTHROUGH; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: { @@ -1477,11 +1581,77 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, V); break; } + case Intrinsic::fma: + case Intrinsic::fmuladd: { + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + + // Canonicalize constants into the RHS. 
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) { + II->setArgOperand(0, Src1); + II->setArgOperand(1, Src0); + std::swap(Src0, Src1); + } + + Value *LHS = nullptr; + Value *RHS = nullptr; + + // fma fneg(x), fneg(y), z -> fma x, y, z + if (match(Src0, m_FNeg(m_Value(LHS))) && + match(Src1, m_FNeg(m_Value(RHS)))) { + II->setArgOperand(0, LHS); + II->setArgOperand(1, RHS); + return II; + } + + // fma fabs(x), fabs(x), z -> fma x, x, z + if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) && + match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) { + II->setArgOperand(0, LHS); + II->setArgOperand(1, RHS); + return II; + } + + // fma x, 1, z -> fadd x, z + if (match(Src1, m_FPOne())) { + Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2)); + RI->copyFastMathFlags(II); + return RI; + } + + break; + } + case Intrinsic::fabs: { + Value *Cond; + Constant *LHS, *RHS; + if (match(II->getArgOperand(0), + m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) { + CallInst *Call0 = Builder->CreateCall(II->getCalledFunction(), {LHS}); + CallInst *Call1 = Builder->CreateCall(II->getCalledFunction(), {RHS}); + return SelectInst::Create(Cond, Call0, Call1); + } + + break; + } + case Intrinsic::cos: + case Intrinsic::amdgcn_cos: { + Value *SrcSrc; + Value *Src = II->getArgOperand(0); + if (match(Src, m_FNeg(m_Value(SrcSrc))) || + match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) { + // cos(-x) -> cos(x) + // cos(fabs(x)) -> cos(x) + II->setArgOperand(0, SrcSrc); + return II; + } + + break; + } case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: // Turn PPC lvx -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >= - 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, + &DT) >= 16) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); return new LoadInst(Ptr); @@ -1497,8 +1667,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: // Turn stvx -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, AC, DT) >= - 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, + &DT) >= 16) { Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); @@ -1514,8 +1684,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } case Intrinsic::ppc_qpx_qvlfs: // Turn PPC QPX qvlfs -> load if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >= - 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC, + &DT) >= 16) { Type *VTy = VectorType::get(Builder->getFloatTy(), II->getType()->getVectorNumElements()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), @@ -1526,8 +1696,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; case Intrinsic::ppc_qpx_qvlfd: // Turn PPC QPX qvlfd -> load if the pointer is known aligned. 
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, AC, DT) >= - 32) { + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC, + &DT) >= 32) { Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); return new LoadInst(Ptr); @@ -1535,8 +1705,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; case Intrinsic::ppc_qpx_qvstfs: // Turn PPC QPX qvstfs -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, AC, DT) >= - 16) { + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC, + &DT) >= 16) { Type *VTy = VectorType::get(Builder->getFloatTy(), II->getArgOperand(0)->getType()->getVectorNumElements()); Value *TOp = Builder->CreateFPTrunc(II->getArgOperand(0), VTy); @@ -1547,8 +1717,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; case Intrinsic::ppc_qpx_qvstfd: // Turn PPC QPX qvstfd -> store if the pointer is known aligned. - if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, AC, DT) >= - 32) { + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC, + &DT) >= 32) { Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); @@ -1607,7 +1777,23 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_cvtsd2si: case Intrinsic::x86_sse2_cvtsd2si64: case Intrinsic::x86_sse2_cvttsd2si: - case Intrinsic::x86_sse2_cvttsd2si64: { + case Intrinsic::x86_sse2_cvttsd2si64: + case Intrinsic::x86_avx512_vcvtss2si32: + case Intrinsic::x86_avx512_vcvtss2si64: + case Intrinsic::x86_avx512_vcvtss2usi32: + case Intrinsic::x86_avx512_vcvtss2usi64: + case Intrinsic::x86_avx512_vcvtsd2si32: + case Intrinsic::x86_avx512_vcvtsd2si64: + case Intrinsic::x86_avx512_vcvtsd2usi32: + case Intrinsic::x86_avx512_vcvtsd2usi64: + case Intrinsic::x86_avx512_cvttss2si: + case Intrinsic::x86_avx512_cvttss2si64: + case Intrinsic::x86_avx512_cvttss2usi: + case Intrinsic::x86_avx512_cvttss2usi64: + case Intrinsic::x86_avx512_cvttsd2si: + case Intrinsic::x86_avx512_cvttsd2si64: + case Intrinsic::x86_avx512_cvttsd2usi: + case Intrinsic::x86_avx512_cvttsd2usi64: { // These intrinsics only demand the 0th element of their input vectors. If // we can simplify the input based on that, do so now. Value *Arg = II->getArgOperand(0); @@ -1654,7 +1840,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_ucomigt_sd: case Intrinsic::x86_sse2_ucomile_sd: case Intrinsic::x86_sse2_ucomilt_sd: - case Intrinsic::x86_sse2_ucomineq_sd: { + case Intrinsic::x86_sse2_ucomineq_sd: + case Intrinsic::x86_avx512_vcomi_ss: + case Intrinsic::x86_avx512_vcomi_sd: + case Intrinsic::x86_avx512_mask_cmp_ss: + case Intrinsic::x86_avx512_mask_cmp_sd: { // These intrinsics only demand the 0th element of their input vectors. If // we can simplify the input based on that, do so now. 
bool MadeChange = false; @@ -1674,50 +1864,155 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } - case Intrinsic::x86_sse_add_ss: - case Intrinsic::x86_sse_sub_ss: - case Intrinsic::x86_sse_mul_ss: - case Intrinsic::x86_sse_div_ss: + case Intrinsic::x86_avx512_mask_add_ps_512: + case Intrinsic::x86_avx512_mask_div_ps_512: + case Intrinsic::x86_avx512_mask_mul_ps_512: + case Intrinsic::x86_avx512_mask_sub_ps_512: + case Intrinsic::x86_avx512_mask_add_pd_512: + case Intrinsic::x86_avx512_mask_div_pd_512: + case Intrinsic::x86_avx512_mask_mul_pd_512: + case Intrinsic::x86_avx512_mask_sub_pd_512: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { + if (R->getValue() == 4) { + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + + Value *V; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_mask_add_ps_512: + case Intrinsic::x86_avx512_mask_add_pd_512: + V = Builder->CreateFAdd(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_mask_sub_ps_512: + case Intrinsic::x86_avx512_mask_sub_pd_512: + V = Builder->CreateFSub(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_mask_mul_ps_512: + case Intrinsic::x86_avx512_mask_mul_pd_512: + V = Builder->CreateFMul(Arg0, Arg1); + break; + case Intrinsic::x86_avx512_mask_div_ps_512: + case Intrinsic::x86_avx512_mask_div_pd_512: + V = Builder->CreateFDiv(Arg0, Arg1); + break; + } + + // Create a select for the masking. + V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2), + *Builder); + return replaceInstUsesWith(*II, V); + } + } + break; + + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular + // IR operations. + if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) { + if (R->getValue() == 4) { + // Extract the element as scalars. + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + Value *LHS = Builder->CreateExtractElement(Arg0, (uint64_t)0); + Value *RHS = Builder->CreateExtractElement(Arg1, (uint64_t)0); + + Value *V; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_avx512_mask_add_ss_round: + case Intrinsic::x86_avx512_mask_add_sd_round: + V = Builder->CreateFAdd(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_sub_ss_round: + case Intrinsic::x86_avx512_mask_sub_sd_round: + V = Builder->CreateFSub(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_mul_ss_round: + case Intrinsic::x86_avx512_mask_mul_sd_round: + V = Builder->CreateFMul(LHS, RHS); + break; + case Intrinsic::x86_avx512_mask_div_ss_round: + case Intrinsic::x86_avx512_mask_div_sd_round: + V = Builder->CreateFDiv(LHS, RHS); + break; + } + + // Handle the masking aspect of the intrinsic. + Value *Mask = II->getArgOperand(3); + auto *C = dyn_cast<ConstantInt>(Mask); + // We don't need a select if we know the mask bit is a 1. + if (!C || !C->getValue()[0]) { + // Cast the mask to an i1 vector and then extract the lowest element. 
+ auto *MaskTy = VectorType::get(Builder->getInt1Ty(), + cast<IntegerType>(Mask->getType())->getBitWidth()); + Mask = Builder->CreateBitCast(Mask, MaskTy); + Mask = Builder->CreateExtractElement(Mask, (uint64_t)0); + // Extract the lowest element from the passthru operand. + Value *Passthru = Builder->CreateExtractElement(II->getArgOperand(2), + (uint64_t)0); + V = Builder->CreateSelect(Mask, V, Passthru); + } + + // Insert the result back into the original argument 0. + V = Builder->CreateInsertElement(Arg0, V, (uint64_t)0); + + return replaceInstUsesWith(*II, V); + } + } + LLVM_FALLTHROUGH; + + // X86 scalar intrinsics simplified with SimplifyDemandedVectorElts. + case Intrinsic::x86_avx512_mask_max_ss_round: + case Intrinsic::x86_avx512_mask_min_ss_round: + case Intrinsic::x86_avx512_mask_max_sd_round: + case Intrinsic::x86_avx512_mask_min_sd_round: + case Intrinsic::x86_avx512_mask_vfmadd_ss: + case Intrinsic::x86_avx512_mask_vfmadd_sd: + case Intrinsic::x86_avx512_maskz_vfmadd_ss: + case Intrinsic::x86_avx512_maskz_vfmadd_sd: + case Intrinsic::x86_avx512_mask3_vfmadd_ss: + case Intrinsic::x86_avx512_mask3_vfmadd_sd: + case Intrinsic::x86_avx512_mask3_vfmsub_ss: + case Intrinsic::x86_avx512_mask3_vfmsub_sd: + case Intrinsic::x86_avx512_mask3_vfnmsub_ss: + case Intrinsic::x86_avx512_mask3_vfnmsub_sd: + case Intrinsic::x86_fma_vfmadd_ss: + case Intrinsic::x86_fma_vfmsub_ss: + case Intrinsic::x86_fma_vfnmadd_ss: + case Intrinsic::x86_fma_vfnmsub_ss: + case Intrinsic::x86_fma_vfmadd_sd: + case Intrinsic::x86_fma_vfmsub_sd: + case Intrinsic::x86_fma_vfnmadd_sd: + case Intrinsic::x86_fma_vfnmsub_sd: + case Intrinsic::x86_sse_cmp_ss: case Intrinsic::x86_sse_min_ss: case Intrinsic::x86_sse_max_ss: - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse2_add_sd: - case Intrinsic::x86_sse2_sub_sd: - case Intrinsic::x86_sse2_mul_sd: - case Intrinsic::x86_sse2_div_sd: + case Intrinsic::x86_sse2_cmp_sd: case Intrinsic::x86_sse2_min_sd: case Intrinsic::x86_sse2_max_sd: - case Intrinsic::x86_sse2_cmp_sd: { - // These intrinsics only demand the lowest element of the second input - // vector. - Value *Arg1 = II->getArgOperand(1); - unsigned VWidth = Arg1->getType()->getVectorNumElements(); - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { - II->setArgOperand(1, V); - return II; - } - break; - } - case Intrinsic::x86_sse41_round_ss: - case Intrinsic::x86_sse41_round_sd: { - // These intrinsics demand the upper elements of the first input vector and - // the lowest element of the second input vector. - bool MadeChange = false; - Value *Arg0 = II->getArgOperand(0); - Value *Arg1 = II->getArgOperand(1); - unsigned VWidth = Arg0->getType()->getVectorNumElements(); - if (Value *V = SimplifyDemandedVectorEltsHigh(Arg0, VWidth, VWidth - 1)) { - II->setArgOperand(0, V); - MadeChange = true; - } - if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { - II->setArgOperand(1, V); - MadeChange = true; - } - if (MadeChange) - return II; - break; + case Intrinsic::x86_sse41_round_sd: + case Intrinsic::x86_xop_vfrcz_ss: + case Intrinsic::x86_xop_vfrcz_sd: { + unsigned VWidth = II->getType()->getVectorNumElements(); + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) { + if (V != II) + return replaceInstUsesWith(*II, V); + return II; + } + break; } // Constant fold ashr( <A x Bi>, Ci ). 
@@ -1727,18 +2022,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_psrai_w: case Intrinsic::x86_avx2_psrai_d: case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_avx512_psrai_q_128: + case Intrinsic::x86_avx512_psrai_q_256: + case Intrinsic::x86_avx512_psrai_d_512: + case Intrinsic::x86_avx512_psrai_q_512: + case Intrinsic::x86_avx512_psrai_w_512: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: case Intrinsic::x86_avx2_psrli_w: + case Intrinsic::x86_avx512_psrli_d_512: + case Intrinsic::x86_avx512_psrli_q_512: + case Intrinsic::x86_avx512_psrli_w_512: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_sse2_pslli_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: + case Intrinsic::x86_avx512_pslli_d_512: + case Intrinsic::x86_avx512_pslli_q_512: + case Intrinsic::x86_avx512_pslli_w_512: if (Value *V = simplifyX86immShift(*II, *Builder)) return replaceInstUsesWith(*II, V); break; @@ -1747,18 +2053,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_psra_w: case Intrinsic::x86_avx2_psra_d: case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_sse2_psrl_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: case Intrinsic::x86_sse2_psll_d: case Intrinsic::x86_sse2_psll_q: case Intrinsic::x86_sse2_psll_w: case Intrinsic::x86_avx2_psll_d: case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: { + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: { if (Value *V = simplifyX86immShift(*II, *Builder)) return replaceInstUsesWith(*II, V); @@ -1780,16 +2097,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx2_psllv_d_256: case Intrinsic::x86_avx2_psllv_q: case Intrinsic::x86_avx2_psllv_q_256: + case Intrinsic::x86_avx512_psllv_d_512: + case Intrinsic::x86_avx512_psllv_q_512: + case Intrinsic::x86_avx512_psllv_w_128: + case Intrinsic::x86_avx512_psllv_w_256: + case Intrinsic::x86_avx512_psllv_w_512: case Intrinsic::x86_avx2_psrav_d: case Intrinsic::x86_avx2_psrav_d_256: + case Intrinsic::x86_avx512_psrav_q_128: + case Intrinsic::x86_avx512_psrav_q_256: + case Intrinsic::x86_avx512_psrav_d_512: + case Intrinsic::x86_avx512_psrav_q_512: + case Intrinsic::x86_avx512_psrav_w_128: + case Intrinsic::x86_avx512_psrav_w_256: + case Intrinsic::x86_avx512_psrav_w_512: case Intrinsic::x86_avx2_psrlv_d: case Intrinsic::x86_avx2_psrlv_d_256: case Intrinsic::x86_avx2_psrlv_q: case Intrinsic::x86_avx2_psrlv_q_256: + case Intrinsic::x86_avx512_psrlv_d_512: + case Intrinsic::x86_avx512_psrlv_q_512: + case Intrinsic::x86_avx512_psrlv_w_128: + case Intrinsic::x86_avx512_psrlv_w_256: + case Intrinsic::x86_avx512_psrlv_w_512: if (Value *V = simplifyX86varShift(*II, *Builder)) return replaceInstUsesWith(*II, V); break; + case Intrinsic::x86_sse2_pmulu_dq: 
+ case Intrinsic::x86_sse41_pmuldq: + case Intrinsic::x86_avx2_pmul_dq: + case Intrinsic::x86_avx2_pmulu_dq: + case Intrinsic::x86_avx512_pmul_dq_512: + case Intrinsic::x86_avx512_pmulu_dq_512: { + unsigned VWidth = II->getType()->getVectorNumElements(); + APInt UndefElts(VWidth, 0); + APInt DemandedElts = APInt::getAllOnesValue(VWidth); + if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) { + if (V != II) + return replaceInstUsesWith(*II, V); + return II; + } + break; + } + case Intrinsic::x86_sse41_insertps: if (Value *V = simplifyX86insertps(*II, *Builder)) return replaceInstUsesWith(*II, V); @@ -1807,10 +2158,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // See if we're dealing with constant values. Constant *C1 = dyn_cast<Constant>(Op1); ConstantInt *CILength = - C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) : nullptr; ConstantInt *CIIndex = - C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) : nullptr; // Attempt to simplify to a constant, shuffle vector or EXTRQI call. @@ -1870,7 +2221,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // See if we're dealing with constant values. Constant *C1 = dyn_cast<Constant>(Op1); ConstantInt *CI11 = - C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) : nullptr; // Attempt to simplify to a constant, shuffle vector or INSERTQI call. @@ -1964,14 +2315,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_ssse3_pshuf_b_128: case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: if (Value *V = simplifyX86pshufb(*II, *Builder)) return replaceInstUsesWith(*II, V); break; case Intrinsic::x86_avx_vpermilvar_ps: case Intrinsic::x86_avx_vpermilvar_ps_256: + case Intrinsic::x86_avx512_vpermilvar_ps_512: case Intrinsic::x86_avx_vpermilvar_pd: case Intrinsic::x86_avx_vpermilvar_pd_256: + case Intrinsic::x86_avx512_vpermilvar_pd_512: if (Value *V = simplifyX86vpermilvar(*II, *Builder)) return replaceInstUsesWith(*II, V); break; @@ -1982,6 +2336,28 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, V); break; + case Intrinsic::x86_avx512_mask_permvar_df_256: + case Intrinsic::x86_avx512_mask_permvar_df_512: + case Intrinsic::x86_avx512_mask_permvar_di_256: + case Intrinsic::x86_avx512_mask_permvar_di_512: + case Intrinsic::x86_avx512_mask_permvar_hi_128: + case Intrinsic::x86_avx512_mask_permvar_hi_256: + case Intrinsic::x86_avx512_mask_permvar_hi_512: + case Intrinsic::x86_avx512_mask_permvar_qi_128: + case Intrinsic::x86_avx512_mask_permvar_qi_256: + case Intrinsic::x86_avx512_mask_permvar_qi_512: + case Intrinsic::x86_avx512_mask_permvar_sf_256: + case Intrinsic::x86_avx512_mask_permvar_sf_512: + case Intrinsic::x86_avx512_mask_permvar_si_256: + case Intrinsic::x86_avx512_mask_permvar_si_512: + if (Value *V = simplifyX86vpermv(*II, *Builder)) { + // We simplified the permuting, now create a select for the masking. 
+ V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2), + *Builder); + return replaceInstUsesWith(*II, V); + } + break; + case Intrinsic::x86_avx_vperm2f128_pd_256: case Intrinsic::x86_avx_vperm2f128_ps_256: case Intrinsic::x86_avx_vperm2f128_si_256: @@ -2104,7 +2480,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: { - unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, AC, DT); + unsigned MemAlign = + getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); unsigned AlignArg = II->getNumArgOperands() - 1; ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { @@ -2194,6 +2571,85 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::amdgcn_class: { + enum { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; + + const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL | + N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY; + + Value *Src0 = II->getArgOperand(0); + Value *Src1 = II->getArgOperand(1); + const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); + if (!CMask) { + if (isa<UndefValue>(Src0)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + if (isa<UndefValue>(Src1)) + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); + break; + } + + uint32_t Mask = CMask->getZExtValue(); + + // If all tests are made, it doesn't matter what the value is. + if ((Mask & FullMask) == FullMask) + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true)); + + if ((Mask & FullMask) == 0) + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false)); + + if (Mask == (S_NAN | Q_NAN)) { + // Equivalent of isnan. Replace with standard fcmp. 
+ Value *FCmp = Builder->CreateFCmpUNO(Src0, Src0); + FCmp->takeName(II); + return replaceInstUsesWith(*II, FCmp); + } + + const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); + if (!CVal) { + if (isa<UndefValue>(Src0)) + return replaceInstUsesWith(*II, UndefValue::get(II->getType())); + + // Clamp mask to used bits + if ((Mask & FullMask) != Mask) { + CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(), + { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) } + ); + + NewCall->takeName(II); + return replaceInstUsesWith(*II, NewCall); + } + + break; + } + + const APFloat &Val = CVal->getValueAPF(); + + bool Result = + ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) || + ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) || + ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) || + ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) || + ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) || + ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) || + ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) || + ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) || + ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) || + ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative()); + + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result)); + } case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. @@ -2243,6 +2699,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } case Intrinsic::lifetime_start: + // Asan needs to poison memory to detect invalid access which is possible + // even for empty lifetime range. + if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress)) + break; + if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start, Intrinsic::lifetime_end, *this)) return nullptr; @@ -2274,24 +2735,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // assume( (load addr) != null ) -> add 'nonnull' metadata to load // (if assume is valid at the load) - if (ICmpInst* ICmp = dyn_cast<ICmpInst>(IIOperand)) { - Value *LHS = ICmp->getOperand(0); - Value *RHS = ICmp->getOperand(1); - if (ICmpInst::ICMP_NE == ICmp->getPredicate() && - isa<LoadInst>(LHS) && - isa<Constant>(RHS) && - RHS->getType()->isPointerTy() && - cast<Constant>(RHS)->isNullValue()) { - LoadInst* LI = cast<LoadInst>(LHS); - if (isValidAssumeForContext(II, LI, DT)) { - MDNode *MD = MDNode::get(II->getContext(), None); - LI->setMetadata(LLVMContext::MD_nonnull, MD); - return eraseInstFromFunction(*II); - } - } + CmpInst::Predicate Pred; + Instruction *LHS; + if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) && + Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load && + LHS->getType()->isPointerTy() && + isValidAssumeForContext(II, LHS, &DT)) { + MDNode *MD = MDNode::get(II->getContext(), None); + LHS->setMetadata(LLVMContext::MD_nonnull, MD); + return eraseInstFromFunction(*II); + // TODO: apply nonnull return attributes to calls and invokes // TODO: apply range metadata for range check patterns? } + // If there is a dominating assume with the same condition as this one, // then this one is redundant, and should be removed. 
APInt KnownZero(1, 0), KnownOne(1, 0); @@ -2299,6 +2756,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (KnownOne.isAllOnesValue()) return eraseInstFromFunction(*II); + // Update the cache of affected values for this assumption (we might be + // here because we just simplified the condition). + AC.updateAffectedValues(II); break; } case Intrinsic::experimental_gc_relocate: { @@ -2329,7 +2789,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, ConstantPointerNull::get(PT)); // isKnownNonNull -> nonnull attribute - if (isKnownNonNullAt(DerivedPtr, II, DT)) + if (isKnownNonNullAt(DerivedPtr, II, &DT)) II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); } @@ -2389,7 +2849,7 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { auto InstCombineRAUW = [this](Instruction *From, Value *With) { replaceInstUsesWith(*From, With); }; - LibCallSimplifier Simplifier(DL, TLI, InstCombineRAUW); + LibCallSimplifier Simplifier(DL, &TLI, InstCombineRAUW); if (Value *With = Simplifier.optimizeCall(CI)) { ++NumSimplified; return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); @@ -2477,8 +2937,7 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) { /// Improvements for call and invoke instructions. Instruction *InstCombiner::visitCallSite(CallSite CS) { - - if (isAllocLikeFn(CS.getInstruction(), TLI)) + if (isAllocLikeFn(CS.getInstruction(), &TLI)) return visitAllocSite(*CS.getInstruction()); bool Changed = false; @@ -2492,7 +2951,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { for (Value *V : CS.args()) { if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && - isKnownNonNullAt(V, CS.getInstruction(), DT)) + isKnownNonNullAt(V, CS.getInstruction(), &DT)) Indices.push_back(ArgNo + 1); ArgNo++; } @@ -2613,14 +3072,14 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { /// If the callee is a constexpr cast of a function, attempt to move the cast to /// the arguments of the call/invoke. bool InstCombiner::transformConstExprCastCall(CallSite CS) { - Function *Callee = - dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); + auto *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); if (!Callee) return false; - // The prototype of thunks are a lie, don't try to directly call such - // functions. + + // The prototype of a thunk is a lie. Don't directly call such a function. if (Callee->hasFnAttribute("thunk")) return false; + Instruction *Caller = CS.getInstruction(); const AttributeSet &CallerPAL = CS.getAttributes(); @@ -2842,8 +3301,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { CallInst *CI = cast<CallInst>(Caller); NC = Builder->CreateCall(Callee, Args, OpBundles); NC->takeName(CI); - if (CI->isTailCall()) - cast<CallInst>(NC)->setTailCall(); + cast<CallInst>(NC)->setTailCallKind(CI->getTailCallKind()); cast<CallInst>(NC)->setCallingConv(CI->getCallingConv()); cast<CallInst>(NC)->setAttributes(NewCallerPAL); } @@ -2966,7 +3424,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, ++Idx; ++I; - } while (1); + } while (true); } // Add any function attributes. @@ -3001,7 +3459,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, ++Idx; ++I; - } while (1); + } while (true); } // Replace the trampoline call with a direct call. 
Let the generic @@ -3027,10 +3485,10 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, cast<InvokeInst>(NewCaller)->setAttributes(NewPAL); } else { NewCaller = CallInst::Create(NewCallee, NewArgs, OpBundles); - if (cast<CallInst>(Caller)->isTailCall()) - cast<CallInst>(NewCaller)->setTailCall(); - cast<CallInst>(NewCaller)-> - setCallingConv(cast<CallInst>(Caller)->getCallingConv()); + cast<CallInst>(NewCaller)->setTailCallKind( + cast<CallInst>(Caller)->getTailCallKind()); + cast<CallInst>(NewCaller)->setCallingConv( + cast<CallInst>(Caller)->getCallingConv()); cast<CallInst>(NewCaller)->setAttributes(NewPAL); } |
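
The bulk of the patch extends InstCombine's intrinsic handling to the AVX-512 masked forms, funnelling the write-mask through the new emitX86MaskSelect helper, which bitcasts the integer k-mask to a vector of i1 and emits a select between the computed result and the passthru operand. As a rough host-side model of what that select computes per lane (a sketch only; maskSelect, mask, op0 and op1 are illustrative names, not part of the patch):

#include <array>
#include <cstddef>
#include <cstdint>

// Per-lane semantics of the select emitted by emitX86MaskSelect: lane i of
// the result comes from op0 when bit i of the k-mask is set, otherwise from
// op1 (the passthru).  All names here are illustrative.
template <typename T, std::size_t N>
std::array<T, N> maskSelect(std::uint32_t mask, const std::array<T, N> &op0,
                            const std::array<T, N> &op1) {
  std::array<T, N> result{};
  for (std::size_t i = 0; i < N; ++i)
    result[i] = ((mask >> i) & 1u) ? op0[i] : op1[i];
  return result;
}

When the vector has fewer than eight elements the patch additionally shuffles the bitcast i1 mask down to the demanded lanes before the select, which this fixed-width model glosses over.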
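The cttz/ctlz handling is likewise consolidated into a single foldCttzCtlz helper that folds the call to a constant whenever the operand's known bits pin the answer. A minimal standalone sketch of that reasoning for cttz on 32-bit values, using plain integers in place of APInt and computeKnownBits (constantCttz and its parameters are illustrative, not the patch's code):

#include <cstdint>
#include <optional>

// If every bit below the lowest bit known to be one is also known to be
// zero, the trailing-zero count is fully determined.  knownOne/knownZero
// stand in for the results of a known-bits analysis.
std::optional<unsigned> constantCttz(std::uint32_t knownOne,
                                     std::uint32_t knownZero) {
  // Trailing zeros of knownOne (32 if no bit is known to be one).
  unsigned numLowBits = 0;
  while (numLowBits < 32 && !((knownOne >> numLowBits) & 1u))
    ++numLowBits;

  // Mask covering every bit below the first known-one bit.
  std::uint32_t lowMask =
      numLowBits >= 32 ? 0xffffffffu : ((1u << numLowBits) - 1u);

  if ((lowMask & knownZero) == lowMask)
    return numLowBits;   // all lower bits known zero -> cttz is a constant
  return std::nullopt;   // not enough known bits to fold
}

The helper in the patch additionally flips the is_zero_undef argument to true when the operand is known non-zero, since the zero behaviour can no longer affect the result.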
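The new llvm.amdgcn.class folding evaluates a constant operand against the ten-bit class mask defined by the enum in the patch. A hedged host-side analogue using <cmath> illustrates the intended meaning of those bits; std::fpclassify cannot separate signaling from quiet NaNs, so the two NaN bits are treated together here, and classifyMatches is an illustrative name:

#include <cmath>
#include <cstdint>

// Bit layout mirrors the enum introduced in the patch.
enum : std::uint32_t {
  S_NAN = 1u << 0, Q_NAN = 1u << 1,
  N_INFINITY = 1u << 2, N_NORMAL = 1u << 3, N_SUBNORMAL = 1u << 4,
  N_ZERO = 1u << 5, P_ZERO = 1u << 6, P_SUBNORMAL = 1u << 7,
  P_NORMAL = 1u << 8, P_INFINITY = 1u << 9
};

// Host-side approximation of llvm.amdgcn.class on a known constant value.
bool classifyMatches(double val, std::uint32_t mask) {
  bool neg = std::signbit(val);
  switch (std::fpclassify(val)) {
  case FP_NAN:       return (mask & (S_NAN | Q_NAN)) != 0;
  case FP_INFINITE:  return (mask & (neg ? N_INFINITY : P_INFINITY)) != 0;
  case FP_NORMAL:    return (mask & (neg ? N_NORMAL : P_NORMAL)) != 0;
  case FP_SUBNORMAL: return (mask & (neg ? N_SUBNORMAL : P_SUBNORMAL)) != 0;
  case FP_ZERO:      return (mask & (neg ? N_ZERO : P_ZERO)) != 0;
  }
  return false;
}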