diff options
Diffstat (limited to 'contrib/llvm/lib/Transforms/InstCombine')
14 files changed, 2048 insertions, 904 deletions
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 2d2c109f..6f49399 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1,4 +1,4 @@ -//===- InstCombineAddSub.cpp ----------------------------------------------===// +//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/PatternMatch.h" + using namespace llvm; using namespace PatternMatch; @@ -67,17 +68,17 @@ namespace { private: bool insaneIntVal(int V) { return V > 4 || V < -4; } - APFloat *getFpValPtr(void) + APFloat *getFpValPtr() { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); } - const APFloat *getFpValPtr(void) const + const APFloat *getFpValPtr() const { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); } - const APFloat &getFpVal(void) const { + const APFloat &getFpVal() const { assert(IsFp && BufHasFpVal && "Incorret state"); return *getFpValPtr(); } - APFloat &getFpVal(void) { + APFloat &getFpVal() { assert(IsFp && BufHasFpVal && "Incorret state"); return *getFpValPtr(); } @@ -92,8 +93,8 @@ namespace { // TODO: We should get rid of this function when APFloat can be constructed // from an *SIGNED* integer. APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val); - private: + private: bool IsFp; // True iff FpValBuf contains an instance of APFloat. @@ -114,10 +115,10 @@ namespace { /// class FAddend { public: - FAddend() { Val = nullptr; } + FAddend() : Val(nullptr) {} - Value *getSymVal (void) const { return Val; } - const FAddendCoef &getCoef(void) const { return Coeff; } + Value *getSymVal() const { return Val; } + const FAddendCoef &getCoef() const { return Coeff; } bool isConstant() const { return Val == nullptr; } bool isZero() const { return Coeff.isZero(); } @@ -182,7 +183,6 @@ namespace { InstCombiner::BuilderTy *Builder; Instruction *Instr; - private: // Debugging stuff are clustered here. #ifndef NDEBUG unsigned CreateInstrNum; @@ -193,7 +193,8 @@ namespace { void incCreateInstNum() {} #endif }; -} + +} // anonymous namespace //===----------------------------------------------------------------------===// // @@ -602,7 +603,6 @@ Value *FAddCombine::simplify(Instruction *I) { } Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { - unsigned AddendNum = Addends.size(); assert(AddendNum <= 4 && "Too many addends"); @@ -886,7 +886,7 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero, return Op0ZeroPosition >= Op1OnePosition; } -/// WillNotOverflowSignedAdd - Return true if we can prove that: +/// Return true if we can prove that: /// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS)) /// This basically requires proving that the add in the original type would not /// overflow to change the sign bit or have a carry out. @@ -1118,8 +1118,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C, // transform them into (X + (signbit ^ C)) if (XorRHS->getValue().isSignBit()) - return BinaryOperator::CreateAdd(XorLHS, - ConstantExpr::getXor(XorRHS, CI)); + return BinaryOperator::CreateAdd(XorLHS, + ConstantExpr::getXor(XorRHS, CI)); } } @@ -1421,7 +1421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { return Changed ? &I : nullptr; } - /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. @@ -1589,7 +1588,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { } } - { Value *Y; // X-(X+Y) == -Y X-(Y+X) == -Y @@ -1611,32 +1609,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(A, B); } - // (sub (select (a, c, b)), (select (a, d, b))) -> (select (a, (sub c, d), 0)) - // (sub (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (sub c, d))) - if (auto *SI0 = dyn_cast<SelectInst>(Op0)) { - if (auto *SI1 = dyn_cast<SelectInst>(Op1)) { - if (SI0->getCondition() == SI1->getCondition()) { - if (Value *V = SimplifySubInst( - SI0->getFalseValue(), SI1->getFalseValue(), I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return SelectInst::Create( - SI0->getCondition(), - Builder->CreateSub(SI0->getTrueValue(), SI1->getTrueValue(), "", - /*HasNUW=*/I.hasNoUnsignedWrap(), - /*HasNSW=*/I.hasNoSignedWrap()), - V); - if (Value *V = SimplifySubInst(SI0->getTrueValue(), SI1->getTrueValue(), - I.hasNoSignedWrap(), - I.hasNoUnsignedWrap(), DL, TLI, DT, AC)) - return SelectInst::Create( - SI0->getCondition(), V, - Builder->CreateSub(SI0->getFalseValue(), SI1->getFalseValue(), "", - /*HasNUW=*/I.hasNoUnsignedWrap(), - /*HasNSW=*/I.hasNoSignedWrap())); - } - } - } - if (Op0->hasOneUse()) { Value *Y = nullptr; // ((X | Y) - X) --> (~X & Y) diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 15e0889..95c50d3 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -37,9 +37,9 @@ static inline Value *dyn_castNotVal(Value *V) { return nullptr; } -/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp -/// predicate into a three bit mask. It also returns whether it is an ordered -/// predicate by reference. +/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into +/// a three bit mask. It also returns whether it is an ordered predicate by +/// reference. static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { isOrdered = false; switch (CC) { @@ -64,10 +64,10 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { } } -/// getNewICmpValue - This is the complement of getICmpCode, which turns an -/// opcode and two operands into either a constant true or false, or a brand -/// new ICmp instruction. The sign is passed in to determine which kind -/// of predicate to use in the new icmp instruction. +/// This is the complement of getICmpCode, which turns an opcode and two +/// operands into either a constant true or false, or a brand new ICmp +/// instruction. The sign is passed in to determine which kind of predicate to +/// use in the new icmp instruction. static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, InstCombiner::BuilderTy *Builder) { ICmpInst::Predicate NewPred; @@ -76,9 +76,9 @@ static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, return Builder->CreateICmp(NewPred, LHS, RHS); } -/// getFCmpValue - This is the complement of getFCmpCode, which turns an -/// opcode and two operands into either a FCmp instruction. isordered is passed -/// in to determine which kind of predicate to use in the new fcmp instruction. +/// This is the complement of getFCmpCode, which turns an opcode and two +/// operands into either a FCmp instruction. isordered is passed in to determine +/// which kind of predicate to use in the new fcmp instruction. static Value *getFCmpValue(bool isordered, unsigned code, Value *LHS, Value *RHS, InstCombiner::BuilderTy *Builder) { @@ -150,14 +150,13 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) { else //if (Op == Instruction::Xor) BinOp = Builder->CreateXor(NewLHS, NewRHS); - Module *M = I.getParent()->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); + Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, ITy); return Builder->CreateCall(F, BinOp); } -// OptAndOp - This handles expressions of the form ((val OP C1) & C2). Where -// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is -// guaranteed to be a binary operator. +/// This handles expressions of the form ((val OP C1) & C2). Where +/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is +/// guaranteed to be a binary operator. Instruction *InstCombiner::OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, @@ -341,10 +340,10 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, return Builder->CreateICmpUGT(Add, LowerBound); } -// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with -// any number of 0s on either side. The 1s are allowed to wrap from LSB to -// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is -// not, since all 1s are not contiguous. +/// Returns true iff Val consists of one contiguous run of 1s with any number +/// of 0s on either side. The 1s are allowed to wrap from LSB to MSB, +/// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is +/// not, since all 1s are not contiguous. static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { const APInt& V = Val->getValue(); uint32_t BitWidth = Val->getType()->getBitWidth(); @@ -357,9 +356,8 @@ static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { return true; } -/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask, -/// where isSub determines whether the operator is a sub. If we can fold one of -/// the following xforms: +/// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines +/// whether the operator is a sub. If we can fold one of the following xforms: /// /// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask /// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 @@ -449,8 +447,8 @@ enum MaskedICmpType { FoldMskICmp_BMask_NotMixed = 512 }; -/// return the set of pattern classes (from MaskedICmpType) -/// that (icmp SCC (A & B), C) satisfies +/// Return the set of pattern classes (from MaskedICmpType) +/// that (icmp SCC (A & B), C) satisfies. static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, ICmpInst::Predicate SCC) { @@ -538,8 +536,8 @@ static unsigned conjugateICmpMask(unsigned Mask) { return NewMask; } -/// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z) -/// if possible. The returned predicate is either == or !=. Returns false if +/// Decompose an icmp into the form ((X & Y) pred Z) if possible. +/// The returned predicate is either == or !=. Returns false if /// decomposition fails. static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, Value *&X, Value *&Y, Value *&Z) { @@ -585,10 +583,9 @@ static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred, return true; } -/// foldLogOpOfMaskedICmpsHelper: -/// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// return the set of pattern classes (from MaskedICmpType) -/// that both LHS and RHS satisfy +/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// Return the set of pattern classes (from MaskedICmpType) +/// that both LHS and RHS satisfy. static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, Value*& B, Value*& C, Value*& D, Value*& E, @@ -700,9 +697,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC); return left_type & right_type; } -/// foldLogOpOfMaskedICmps: -/// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) -/// into a single (icmp(A & X) ==/!= Y) + +/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// into a single (icmp(A & X) ==/!= Y). static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, llvm::InstCombiner::BuilderTy *Builder) { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr; @@ -879,7 +876,7 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, return Builder->CreateICmp(NewPred, Input, RangeEnd); } -/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. +/// Fold (icmp)&(icmp) if possible. Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1123,9 +1120,8 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { return nullptr; } -/// FoldAndOfFCmps - Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function. Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_ORD && RHS->getPredicate() == FCmpInst::FCMP_ORD) { @@ -1203,6 +1199,54 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return nullptr; } +/// Match De Morgan's Laws: +/// (~A & ~B) == (~(A | B)) +/// (~A | ~B) == (~(A & B)) +static Instruction *matchDeMorgansLaws(BinaryOperator &I, + InstCombiner::BuilderTy *Builder) { + auto Opcode = I.getOpcode(); + assert((Opcode == Instruction::And || Opcode == Instruction::Or) && + "Trying to match De Morgan's Laws with something other than and/or"); + // Flip the logic operation. + if (Opcode == Instruction::And) + Opcode = Instruction::Or; + else + Opcode = Instruction::And; + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + // TODO: Use pattern matchers instead of dyn_cast. + if (Value *Op0NotVal = dyn_castNotVal(Op0)) + if (Value *Op1NotVal = dyn_castNotVal(Op1)) + if (Op0->hasOneUse() && Op1->hasOneUse()) { + Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, + I.getName() + ".demorgan"); + return BinaryOperator::CreateNot(LogicOp); + } + + // De Morgan's Law in disguise: + // (zext(bool A) ^ 1) & (zext(bool B) ^ 1) -> zext(~(A | B)) + // (zext(bool A) ^ 1) | (zext(bool B) ^ 1) -> zext(~(A & B)) + Value *A = nullptr; + Value *B = nullptr; + ConstantInt *C1 = nullptr; + if (match(Op0, m_OneUse(m_Xor(m_ZExt(m_Value(A)), m_ConstantInt(C1)))) && + match(Op1, m_OneUse(m_Xor(m_ZExt(m_Value(B)), m_Specific(C1))))) { + // TODO: This check could be loosened to handle different type sizes. + // Alternatively, we could fix the definition of m_Not to recognize a not + // operation hidden by a zext? + if (A->getType()->isIntegerTy(1) && B->getType()->isIntegerTy(1) && + C1->isOne()) { + Value *LogicOp = Builder->CreateBinOp(Opcode, A, B, + I.getName() + ".demorgan"); + Value *Not = Builder->CreateNot(LogicOp); + return CastInst::CreateZExtOrBitCast(Not, I.getType()); + } + } + + return nullptr; +} + Instruction *InstCombiner::visitAnd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -1273,6 +1317,10 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I)) return BinaryOperator::CreateAnd(V, AndRHS); + // -x & 1 -> x & 1 + if (AndRHSMask == 1 && match(Op0LHS, m_Zero())) + return BinaryOperator::CreateAnd(Op0RHS, AndRHS); + // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS // has 1's for all bits that the subtraction with A might affect. if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) { @@ -1329,15 +1377,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return NV; } - - // (~A & ~B) == (~(A | B)) - De Morgan's Law - if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal, - I.getName()+".demorgan"); - return BinaryOperator::CreateNot(Or); - } + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; { Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; @@ -1446,14 +1487,15 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return ReplaceInstUsesWith(I, Res); - // fold (and (cast A), (cast B)) -> (cast (and A, B)) - if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) + if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { + Value *Op0COp = Op0C->getOperand(0); + Type *SrcTy = Op0COp->getType(); + // fold (and (cast A), (cast B)) -> (cast (and A, B)) if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) { - Type *SrcTy = Op0C->getOperand(0)->getType(); if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ? SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntOrIntVectorTy()) { - Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); + Value *Op1COp = Op1C->getOperand(0); // Only do this if the casts both really cause code to be generated. if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && @@ -1478,6 +1520,20 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } + // If we are masking off the sign bit of a floating-point value, convert + // this to the canonical fabs intrinsic call and cast back to integer. + // The backend should know how to optimize fabs(). + // TODO: This transform should also apply to vectors. + ConstantInt *CI; + if (isa<BitCastInst>(Op0C) && SrcTy->isFloatingPointTy() && + match(Op1, m_ConstantInt(CI)) && CI->isMaxValue(true)) { + Module *M = I.getModule(); + Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, SrcTy); + Value *Call = Builder->CreateCall(Fabs, Op0COp, "fabs"); + return CastInst::CreateBitOrPointerCast(Call, I.getType()); + } + } + { Value *X = nullptr; bool OpsSwapped = false; @@ -1509,163 +1565,195 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { return Changed ? &I : nullptr; } -/// CollectBSwapParts - Analyze the specified subexpression and see if it is -/// capable of providing pieces of a bswap. The subexpression provides pieces -/// of a bswap if it is proven that each of the non-zero bytes in the output of -/// the expression came from the corresponding "byte swapped" byte in some other -/// value. For example, if the current subexpression is "(shl i32 %X, 24)" then -/// we know that the expression deposits the low byte of %X into the high byte -/// of the bswap result and that all other bytes are zero. This expression is -/// accepted, the high byte of ByteValues is set to X to indicate a correct -/// match. + +/// Analyze the specified subexpression and see if it is capable of providing +/// pieces of a bswap or bitreverse. The subexpression provides a potential +/// piece of a bswap or bitreverse if it can be proven that each non-zero bit in +/// the output of the expression came from a corresponding bit in some other +/// value. This function is recursive, and the end result is a mapping of +/// (value, bitnumber) to bitnumber. It is the caller's responsibility to +/// validate that all `value`s are identical and that the bitnumber to bitnumber +/// mapping is correct for a bswap or bitreverse. +/// +/// For example, if the current subexpression if "(shl i32 %X, 24)" then we know +/// that the expression deposits the low byte of %X into the high byte of the +/// result and that all other bits are zero. This expression is accepted, +/// BitValues[24-31] are set to %X and BitProvenance[24-31] are set to [0-7]. /// /// This function returns true if the match was unsuccessful and false if so. /// On entry to the function the "OverallLeftShift" is a signed integer value -/// indicating the number of bytes that the subexpression is later shifted. For +/// indicating the number of bits that the subexpression is later shifted. For /// example, if the expression is later right shifted by 16 bits, the -/// OverallLeftShift value would be -2 on entry. This is used to specify which -/// byte of ByteValues is actually being set. +/// OverallLeftShift value would be -16 on entry. This is used to specify which +/// bits of BitValues are actually being set. /// -/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding -/// byte is masked to zero by a user. For example, in (X & 255), X will be -/// processed with a bytemask of 1. Because bytemask is 32-bits, this limits -/// this function to working on up to 32-byte (256 bit) values. ByteMask is -/// always in the local (OverallLeftShift) coordinate space. +/// Similarly, BitMask is a bitmask where a bit is clear if its corresponding +/// bit is masked to zero by a user. For example, in (X & 255), X will be +/// processed with a bytemask of 255. BitMask is always in the local +/// (OverallLeftShift) coordinate space. /// -static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, - SmallVectorImpl<Value *> &ByteValues) { +static bool CollectBitParts(Value *V, int OverallLeftShift, APInt BitMask, + SmallVectorImpl<Value *> &BitValues, + SmallVectorImpl<int> &BitProvenance) { if (Instruction *I = dyn_cast<Instruction>(V)) { // If this is an or instruction, it may be an inner node of the bswap. - if (I->getOpcode() == Instruction::Or) { - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues) || - CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask, - ByteValues); - } - - // If this is a logical shift by a constant multiple of 8, recurse with - // OverallLeftShift and ByteMask adjusted. + if (I->getOpcode() == Instruction::Or) + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance) || + CollectBitParts(I->getOperand(1), OverallLeftShift, BitMask, + BitValues, BitProvenance); + + // If this is a logical shift by a constant, recurse with OverallLeftShift + // and BitMask adjusted. if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) { unsigned ShAmt = - cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); - // Ensure the shift amount is defined and of a byte value. - if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size())) + cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); + // Ensure the shift amount is defined. + if (ShAmt > BitValues.size()) return true; - unsigned ByteShift = ShAmt >> 3; + unsigned BitShift = ShAmt; if (I->getOpcode() == Instruction::Shl) { - // X << 2 -> collect(X, +2) - OverallLeftShift += ByteShift; - ByteMask >>= ByteShift; + // X << C -> collect(X, +C) + OverallLeftShift += BitShift; + BitMask = BitMask.lshr(BitShift); } else { - // X >>u 2 -> collect(X, -2) - OverallLeftShift -= ByteShift; - ByteMask <<= ByteShift; - ByteMask &= (~0U >> (32-ByteValues.size())); + // X >>u C -> collect(X, -C) + OverallLeftShift -= BitShift; + BitMask = BitMask.shl(BitShift); } - if (OverallLeftShift >= (int)ByteValues.size()) return true; - if (OverallLeftShift <= -(int)ByteValues.size()) return true; + if (OverallLeftShift >= (int)BitValues.size()) + return true; + if (OverallLeftShift <= -(int)BitValues.size()) + return true; - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues); + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance); } - // If this is a logical 'and' with a mask that clears bytes, clear the - // corresponding bytes in ByteMask. + // If this is a logical 'and' with a mask that clears bits, clear the + // corresponding bits in BitMask. if (I->getOpcode() == Instruction::And && isa<ConstantInt>(I->getOperand(1))) { - // Scan every byte of the and mask, seeing if the byte is either 0 or 255. - unsigned NumBytes = ByteValues.size(); - APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255); + unsigned NumBits = BitValues.size(); + APInt Bit(I->getType()->getPrimitiveSizeInBits(), 1); const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); - for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) { - // If this byte is masked out by a later operation, we don't care what + for (unsigned i = 0; i != NumBits; ++i, Bit <<= 1) { + // If this bit is masked out by a later operation, we don't care what // the and mask is. - if ((ByteMask & (1 << i)) == 0) + if (BitMask[i] == 0) continue; - // If the AndMask is all zeros for this byte, clear the bit. - APInt MaskB = AndMask & Byte; + // If the AndMask is zero for this bit, clear the bit. + APInt MaskB = AndMask & Bit; if (MaskB == 0) { - ByteMask &= ~(1U << i); + BitMask.clearBit(i); continue; } - // If the AndMask is not all ones for this byte, it's not a bytezap. - if (MaskB != Byte) - return true; - - // Otherwise, this byte is kept. + // Otherwise, this bit is kept. } - return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, - ByteValues); + return CollectBitParts(I->getOperand(0), OverallLeftShift, BitMask, + BitValues, BitProvenance); } } // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be - // the input value to the bswap. Some observations: 1) if more than one byte - // is demanded from this input, then it could not be successfully assembled - // into a byteswap. At least one of the two bytes would not be aligned with - // their ultimate destination. - if (!isPowerOf2_32(ByteMask)) return true; - unsigned InputByteNo = countTrailingZeros(ByteMask); - - // 2) The input and ultimate destinations must line up: if byte 3 of an i32 - // is demanded, it needs to go into byte 0 of the result. This means that the - // byte needs to be shifted until it lands in the right byte bucket. The - // shift amount depends on the position: if the byte is coming from the high - // part of the value (e.g. byte 3) then it must be shifted right. If from the - // low part, it must be shifted left. - unsigned DestByteNo = InputByteNo + OverallLeftShift; - if (ByteValues.size()-1-DestByteNo != InputByteNo) + // the input value to the bswap/bitreverse. To be part of a bswap or + // bitreverse we must be demanding a contiguous range of bits from it. + unsigned InputBitLen = BitMask.countPopulation(); + unsigned InputBitNo = BitMask.countTrailingZeros(); + if (BitMask.getBitWidth() - BitMask.countLeadingZeros() - InputBitNo != + InputBitLen) + // Not a contiguous set range of bits! return true; - // If the destination byte value is already defined, the values are or'd - // together, which isn't a bswap (unless it's an or of the same bits). - if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V) + // We know we're moving a contiguous range of bits from the input to the + // output. Record which bits in the output came from which bits in the input. + unsigned DestBitNo = InputBitNo + OverallLeftShift; + for (unsigned I = 0; I < InputBitLen; ++I) + BitProvenance[DestBitNo + I] = InputBitNo + I; + + // If the destination bit value is already defined, the values are or'd + // together, which isn't a bswap/bitreverse (unless it's an or of the same + // bits). + if (BitValues[DestBitNo] && BitValues[DestBitNo] != V) return true; - ByteValues[DestByteNo] = V; + for (unsigned I = 0; I < InputBitLen; ++I) + BitValues[DestBitNo + I] = V; + return false; } -/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom. -/// If so, insert the new bswap intrinsic and return it. -Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { - IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); - if (!ITy || ITy->getBitWidth() % 16 || - // ByteMask only allows up to 32-byte values. - ITy->getBitWidth() > 32*8) - return nullptr; // Can only bswap pairs of bytes. Can't do vectors. +static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To, + unsigned BitWidth) { + if (From % 8 != To % 8) + return false; + // Convert from bit indices to byte indices and check for a byte reversal. + From >>= 3; + To >>= 3; + BitWidth >>= 3; + return From == BitWidth - To - 1; +} - /// ByteValues - For each byte of the result, we keep track of which value - /// defines each byte. - SmallVector<Value*, 8> ByteValues; - ByteValues.resize(ITy->getBitWidth()/8); +static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To, + unsigned BitWidth) { + return From == BitWidth - To - 1; +} +/// Given an OR instruction, check to see if this is a bswap or bitreverse +/// idiom. If so, insert the new intrinsic and return it. +Instruction *InstCombiner::MatchBSwapOrBitReverse(BinaryOperator &I) { + IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); + if (!ITy) + return nullptr; // Can't do vectors. + unsigned BW = ITy->getBitWidth(); + + /// We keep track of which bit (BitProvenance) inside which value (BitValues) + /// defines each bit in the result. + SmallVector<Value *, 8> BitValues(BW, nullptr); + SmallVector<int, 8> BitProvenance(BW, -1); + // Try to find all the pieces corresponding to the bswap. - uint32_t ByteMask = ~0U >> (32-ByteValues.size()); - if (CollectBSwapParts(&I, 0, ByteMask, ByteValues)) + APInt BitMask = APInt::getAllOnesValue(BitValues.size()); + if (CollectBitParts(&I, 0, BitMask, BitValues, BitProvenance)) return nullptr; - // Check to see if all of the bytes come from the same value. - Value *V = ByteValues[0]; - if (!V) return nullptr; // Didn't find a byte? Must be zero. + // Check to see if all of the bits come from the same value. + Value *V = BitValues[0]; + if (!V) return nullptr; // Didn't find a bit? Must be zero. - // Check to make sure that all of the bytes come from the same value. - for (unsigned i = 1, e = ByteValues.size(); i != e; ++i) - if (ByteValues[i] != V) - return nullptr; - Module *M = I.getParent()->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy); + if (!std::all_of(BitValues.begin(), BitValues.end(), + [&](const Value *X) { return X == V; })) + return nullptr; + + // Now, is the bit permutation correct for a bswap or a bitreverse? We can + // only byteswap values with an even number of bytes. + bool OKForBSwap = BW % 16 == 0, OKForBitReverse = true;; + for (unsigned i = 0, e = BitValues.size(); i != e; ++i) { + OKForBSwap &= bitTransformIsCorrectForBSwap(BitProvenance[i], i, BW); + OKForBitReverse &= + bitTransformIsCorrectForBitReverse(BitProvenance[i], i, BW); + } + + Intrinsic::ID Intrin; + if (OKForBSwap) + Intrin = Intrinsic::bswap; + else if (OKForBitReverse) + Intrin = Intrinsic::bitreverse; + else + return nullptr; + + Function *F = Intrinsic::getDeclaration(I.getModule(), Intrin, ITy); return CallInst::Create(F, V); } -/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D). Check -/// If A is (cond?-1:0) and either B or D is ~(cond?-1,0) or (cond?0,-1), then -/// we can simplify this expression to "cond ? C : D or B". +/// We have an expression of the form (A&C)|(B&D). Check if A is (cond?-1:0) +/// and either B or D is ~(cond?-1,0) or (cond?0,-1), then we can simplify this +/// expression to "cond ? C : D or B". static Instruction *MatchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D) { // If A is not a select of -1/0, this cannot match. @@ -1688,7 +1776,7 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B, return nullptr; } -/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible. +/// Fold (icmp)|(icmp) if possible. Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI) { ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); @@ -1905,14 +1993,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, case ICmpInst::ICMP_EQ: if (LHS->getOperand(0) == RHS->getOperand(0)) { // if LHSCst and RHSCst differ only by one bit: - // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1 + // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2 assert(LHSCst->getValue().ule(LHSCst->getValue())); APInt Xor = LHSCst->getValue() ^ RHSCst->getValue(); if (Xor.isPowerOf2()) { - Value *NegCst = Builder->getInt(~Xor); - Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst); - return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst); + Value *Cst = Builder->getInt(Xor); + Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst); + return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst); } } @@ -2020,9 +2108,8 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, return nullptr; } -/// FoldOrOfFCmps - Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of -/// instcombine, this returns a Value which should already be inserted into the -/// function. +/// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns +/// a Value which should already be inserted into the function. Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_UNO && RHS->getPredicate() == FCmpInst::FCMP_UNO && @@ -2080,7 +2167,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { return nullptr; } -/// FoldOrWithConstants - This helper function folds: +/// This helper function folds: /// /// ((A | B) & C1) | (B & C2) /// @@ -2199,14 +2286,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { ConstantInt *C1 = nullptr, *C2 = nullptr; // (A | B) | C and A | (B | C) -> bswap if possible. + bool OrOfOrs = match(Op0, m_Or(m_Value(), m_Value())) || + match(Op1, m_Or(m_Value(), m_Value())); // (A >> B) | (C << D) and (A << B) | (B >> C) -> bswap if possible. - if (match(Op0, m_Or(m_Value(), m_Value())) || - match(Op1, m_Or(m_Value(), m_Value())) || - (match(Op0, m_LogicalShift(m_Value(), m_Value())) && - match(Op1, m_LogicalShift(m_Value(), m_Value())))) { - if (Instruction *BSwap = MatchBSwap(I)) + bool OrOfShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) && + match(Op1, m_LogicalShift(m_Value(), m_Value())); + // (A & B) | (C & D) -> bswap if possible. + bool OrOfAnds = match(Op0, m_And(m_Value(), m_Value())) && + match(Op1, m_And(m_Value(), m_Value())); + + if (OrOfOrs || OrOfShifts || OrOfAnds) + if (Instruction *BSwap = MatchBSwapOrBitReverse(I)) return BSwap; - } // (X^C)|Y -> (X|Y)^C iff Y&C == 0 if (Op0->hasOneUse() && @@ -2360,14 +2451,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C)); - // (~A | ~B) == (~(A & B)) - De Morgan's Law - if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *And = Builder->CreateAnd(Op0NotVal, Op1NotVal, - I.getName()+".demorgan"); - return BinaryOperator::CreateNot(And); - } + if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + return DeMorgan; // Canonicalize xor to the RHS. bool SwappedForXor = false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6de380b..e3634f2 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -67,8 +67,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) { unsigned CopyAlign = MI->getAlignment(); if (CopyAlign < MinAlign) { - MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), - MinAlign, false)); + MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false)); return MI; } @@ -198,12 +197,140 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { return nullptr; } +static Value *SimplifyX86immshift(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder) { + bool LogicalShift = false; + bool ShiftLeft = false; + + switch (II.getIntrinsicID()) { + default: + return nullptr; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + LogicalShift = false; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: + LogicalShift = true; ShiftLeft = false; + break; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx2_pslli_d: + case Intrinsic::x86_avx2_pslli_q: + case Intrinsic::x86_avx2_pslli_w: + LogicalShift = true; ShiftLeft = true; + break; + } + assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); + + // Simplify if count is constant. + auto Arg1 = II.getArgOperand(1); + auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1); + auto CDV = dyn_cast<ConstantDataVector>(Arg1); + auto CInt = dyn_cast<ConstantInt>(Arg1); + if (!CAZ && !CDV && !CInt) + return nullptr; + + APInt Count(64, 0); + if (CDV) { + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + auto VT = cast<VectorType>(CDV->getType()); + unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); + assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); + unsigned NumSubElts = 64 / BitWidth; + + // Concatenate the sub-elements to create the 64-bit value. + for (unsigned i = 0; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); + Count = Count.shl(BitWidth); + Count |= SubElt->getValue().zextOrTrunc(64); + } + } + else if (CInt) + Count = CInt->getValue(); + + auto Vec = II.getArgOperand(0); + auto VT = cast<VectorType>(Vec->getType()); + auto SVT = VT->getElementType(); + unsigned VWidth = VT->getNumElements(); + unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + + // If shift-by-zero then just return the original value. + if (Count == 0) + return Vec; + + // Handle cases when Shift >= BitWidth. + if (Count.uge(BitWidth)) { + // If LogicalShift - just return zero. + if (LogicalShift) + return ConstantAggregateZero::get(VT); + + // If ArithmeticShift - clamp Shift to (BitWidth - 1). + Count = APInt(64, BitWidth - 1); + } + + // Get a constant vector of the same type as the first operand. + auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); + auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); + + if (ShiftLeft) + return Builder.CreateShl(Vec, ShiftVec); + + if (LogicalShift) + return Builder.CreateLShr(Vec, ShiftVec); + + return Builder.CreateAShr(Vec, ShiftVec); +} + +static Value *SimplifyX86extend(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, + bool SignExtend) { + VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType()); + VectorType *DstTy = cast<VectorType>(II.getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign/zero extend. + SmallVector<int, 8> ShuffleMask; + for (int i = 0; i != (int)NumDstElts; ++i) + ShuffleMask.push_back(i); + + Value *SV = Builder.CreateShuffleVector(II.getArgOperand(0), + UndefValue::get(SrcTy), ShuffleMask); + return SignExtend ? Builder.CreateSExt(SV, DstTy) + : Builder.CreateZExt(SV, DstTy); +} + static Value *SimplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { VectorType *VecTy = cast<VectorType>(II.getType()); assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); - + // The immediate permute control byte looks like this: // [3:0] - zero mask for each 32-bit lane // [5:4] - select one 32-bit destination lane @@ -248,12 +375,202 @@ static Value *SimplifyX86insertps(const IntrinsicInst &II, // Replace the selected destination lane with the selected source lane. ShuffleMask[DestLane] = SourceLane + 4; } - + return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); } return nullptr; } +/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding +/// or conversion to a shuffle vector. +static Value *SimplifyX86extrq(IntrinsicInst &II, Value *Op0, + ConstantInt *CILength, ConstantInt *CIIndex, + InstCombiner::BuilderTy &Builder) { + auto LowConstantHighUndef = [&](uint64_t Val) { + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + }; + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + ConstantInt *CI0 = + C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + + // Attempt to constant fold. + if (CILength && CIIndex) { + // From AMD documentation: "The bit index and field length are each six + // bits in length other bits of the field are ignored." + APInt APIndex = CIIndex->getValue().zextOrTrunc(6); + APInt APLength = CILength->getValue().zextOrTrunc(6); + + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize EXTRQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + Index))); + for (int i = Length; i != 8; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector( + Builder.CreateBitCast(Op0, ShufTy), + ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // Constant Fold - shift Index'th bit to lowest position and mask off + // Length bits. + if (CI0) { + APInt Elt = CI0->getValue(); + Elt = Elt.lshr(Index).zextOrTrunc(Length); + return LowConstantHighUndef(Elt.getZExtValue()); + } + + // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { + Value *Args[] = {Op0, CILength, CIIndex}; + Module *M = II.getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + return Builder.CreateCall(F, Args); + } + } + + // Constant Fold - extraction from zero is always {zero, undef}. + if (CI0 && CI0->equalsInt(0)) + return LowConstantHighUndef(0); + + return nullptr; +} + +/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant +/// folding or conversion to a shuffle vector. +static Value *SimplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, + APInt APLength, APInt APIndex, + InstCombiner::BuilderTy &Builder) { + + // From AMD documentation: "The bit index and field length are each six bits + // in length other bits of the field are ignored." + APIndex = APIndex.zextOrTrunc(6); + APLength = APLength.zextOrTrunc(6); + + // Attempt to constant fold. + unsigned Index = APIndex.getZExtValue(); + + // From AMD documentation: "a value of zero in the field length is + // defined as length of 64". + unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); + + // From AMD documentation: "If the sum of the bit index + length field + // is greater than 64, the results are undefined". + unsigned End = Index + Length; + + // Note that both field index and field length are 8-bit quantities. + // Since variables 'Index' and 'Length' are unsigned values + // obtained from zero-extending field index and field length + // respectively, their sum should never wrap around. + if (End > 64) + return UndefValue::get(II.getType()); + + // If we are inserting whole bytes, we can convert this to a shuffle. + // Lowering can recognize INSERTQI shuffle masks. + if ((Length % 8) == 0 && (Index % 8) == 0) { + // Convert bit indices to byte indices. + Length /= 8; + Index /= 8; + + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Type *IntTy32 = Type::getInt32Ty(II.getContext()); + VectorType *ShufTy = VectorType::get(IntTy8, 16); + + SmallVector<Constant *, 16> ShuffleMask; + for (int i = 0; i != (int)Index; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 0; i != (int)Length; ++i) + ShuffleMask.push_back( + Constant::getIntegerValue(IntTy32, APInt(32, i + 16))); + for (int i = Index + Length; i != 8; ++i) + ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i))); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(UndefValue::get(IntTy32)); + + Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), + Builder.CreateBitCast(Op1, ShufTy), + ConstantVector::get(ShuffleMask)); + return Builder.CreateBitCast(SV, II.getType()); + } + + // See if we're dealing with constant values. + Constant *C0 = dyn_cast<Constant>(Op0); + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI00 = + C0 ? dyn_cast<ConstantInt>(C0->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CI10 = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + + // Constant Fold - insert bottom Length bits starting at the Index'th bit. + if (CI00 && CI10) { + APInt V00 = CI00->getValue(); + APInt V10 = CI10->getValue(); + APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); + V00 = V00 & ~Mask; + V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); + APInt Val = V00 | V10; + Type *IntTy64 = Type::getInt64Ty(II.getContext()); + Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), + UndefValue::get(IntTy64)}; + return ConstantVector::get(Args); + } + + // If we were an INSERTQ call, we'll save demanded elements if we convert to + // INSERTQI. + if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { + Type *IntTy8 = Type::getInt8Ty(II.getContext()); + Constant *CILength = ConstantInt::get(IntTy8, Length, false); + Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); + + Value *Args[] = {Op0, Op1, CILength, CIIndex}; + Module *M = II.getModule(); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return Builder.CreateCall(F, Args); + } + + return nullptr; +} + /// The shuffle mask for a perm2*128 selects any two halves of two 256-bit /// source vectors, unless a zero bit is set. If a zero bit is set, /// then ignore that half of the mask and clear that half of the vector. @@ -289,7 +606,7 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // The high bit of the selection field chooses the 1st or 2nd operand. bool LowInputSelect = Imm & 0x02; bool HighInputSelect = Imm & 0x20; - + // The low bit of the selection field chooses the low or high half // of the selected operand. bool LowHalfSelect = Imm & 0x01; @@ -298,11 +615,11 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, // Determine which operand(s) are actually in use for this instruction. Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0); - + // If needed, replace operands based on zero mask. V0 = LowHalfZero ? ZeroVector : V0; V1 = HighHalfZero ? ZeroVector : V1; - + // Permute low half of result. unsigned StartIndex = LowHalfSelect ? HalfSize : 0; for (unsigned i = 0; i < HalfSize; ++i) @@ -319,6 +636,43 @@ static Value *SimplifyX86vperm2(const IntrinsicInst &II, return nullptr; } +/// Decode XOP integer vector comparison intrinsics. +static Value *SimplifyX86vpcom(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, bool IsSigned) { + if (auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) { + uint64_t Imm = CInt->getZExtValue() & 0x7; + VectorType *VecTy = cast<VectorType>(II.getType()); + CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + + switch (Imm) { + case 0x0: + Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; + break; + case 0x1: + Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; + break; + case 0x2: + Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; + break; + case 0x3: + Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; + break; + case 0x4: + Pred = ICmpInst::ICMP_EQ; break; + case 0x5: + Pred = ICmpInst::ICMP_NE; break; + case 0x6: + return ConstantInt::getSigned(VecTy, 0); // FALSE + case 0x7: + return ConstantInt::getSigned(VecTy, -1); // TRUE + } + + if (Value *Cmp = Builder.CreateICmp(Pred, II.getArgOperand(0), II.getArgOperand(1))) + return Builder.CreateSExtOrTrunc(Cmp, VecTy); + } + return nullptr; +} + /// visitCallInst - CallInst simplification. This mostly only handles folding /// of intrinsic instructions. For normal calls, it allows visitCallSite to do /// the heavy lifting. @@ -371,7 +725,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) { if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) if (GVSrc->isConstant()) { - Module *M = CI.getParent()->getParent()->getParent(); + Module *M = CI.getModule(); Intrinsic::ID MemCpyID = Intrinsic::memcpy; Type *Tys[3] = { CI.getArgOperand(0)->getType(), CI.getArgOperand(1)->getType(), @@ -400,6 +754,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Changed) return II; } + auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width, unsigned DemandedWidth) + { + APInt UndefElts(Width, 0); + APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); + return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); + }; + switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { @@ -427,6 +788,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::bitreverse: { + Value *IIOperand = II->getArgOperand(0); + Value *X = nullptr; + + // bitreverse(bitreverse(x)) -> x + if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X)))) + return ReplaceInstUsesWith(CI, X); + break; + } + case Intrinsic::powi: if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // powi(x, 0) -> 1.0 @@ -669,6 +1040,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return new StoreInst(II->getArgOperand(0), Ptr); } break; + case Intrinsic::x86_sse_storeu_ps: case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: @@ -682,6 +1054,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; + case Intrinsic::x86_vcvtph2ps_128: + case Intrinsic::x86_vcvtph2ps_256: { + auto Arg = II->getArgOperand(0); + auto ArgType = cast<VectorType>(Arg->getType()); + auto RetType = cast<VectorType>(II->getType()); + unsigned ArgWidth = ArgType->getNumElements(); + unsigned RetWidth = RetType->getNumElements(); + assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths"); + assert(ArgType->isIntOrIntVectorTy() && + ArgType->getScalarSizeInBits() == 16 && + "CVTPH2PS input type should be 16-bit integer vector"); + assert(RetType->getScalarType()->isFloatTy() && + "CVTPH2PS output type should be 32-bit float vector"); + + // Constant folding: Convert to generic half to single conversion. + if (isa<ConstantAggregateZero>(Arg)) + return ReplaceInstUsesWith(*II, ConstantAggregateZero::get(RetType)); + + if (isa<ConstantDataVector>(Arg)) { + auto VectorHalfAsShorts = Arg; + if (RetWidth < ArgWidth) { + SmallVector<int, 8> SubVecMask; + for (unsigned i = 0; i != RetWidth; ++i) + SubVecMask.push_back((int)i); + VectorHalfAsShorts = Builder->CreateShuffleVector( + Arg, UndefValue::get(ArgType), SubVecMask); + } + + auto VectorHalfType = + VectorType::get(Type::getHalfTy(II->getContext()), RetWidth); + auto VectorHalfs = + Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType); + auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType); + return ReplaceInstUsesWith(*II, VectorFloats); + } + + // We only use the lowest lanes of the argument. + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) { + II->setArgOperand(0, V); + return II; + } + break; + } + case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvtss2si64: case Intrinsic::x86_sse_cvttss2si: @@ -692,194 +1108,229 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_sse2_cvttsd2si64: { // These intrinsics only demand the 0th element of their input vectors. If // we can simplify the input based on that, do so now. - unsigned VWidth = - cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); - APInt DemandedElts(VWidth, 1); - APInt UndefElts(VWidth, 0); - if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), - DemandedElts, UndefElts)) { + Value *Arg = II->getArgOperand(0); + unsigned VWidth = Arg->getType()->getVectorNumElements(); + if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { II->setArgOperand(0, V); return II; } break; } - // Constant fold <A x Bi> << Ci. - // FIXME: We don't handle _dq because it's a shift of an i128, but is - // represented in the IR as <2 x i64>. A per element shift is wrong. - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: + // Constant fold ashr( <A x Bi>, Ci ). + // Constant fold lshr( <A x Bi>, Ci ). + // Constant fold shl( <A x Bi>, Ci ). + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_avx2_psrai_d: + case Intrinsic::x86_avx2_psrai_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_avx2_psrli_d: + case Intrinsic::x86_avx2_psrli_q: + case Intrinsic::x86_avx2_psrli_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: + if (Value *V = SimplifyX86immshift(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_sse2_psrl_d: case Intrinsic::x86_sse2_psrl_q: case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: case Intrinsic::x86_avx2_psrl_d: case Intrinsic::x86_avx2_psrl_q: case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: { - // Simplify if count is constant. To 0 if >= BitWidth, - // otherwise to shl/lshr. - auto CDV = dyn_cast<ConstantDataVector>(II->getArgOperand(1)); - auto CInt = dyn_cast<ConstantInt>(II->getArgOperand(1)); - if (!CDV && !CInt) - break; - ConstantInt *Count; - if (CDV) - Count = cast<ConstantInt>(CDV->getElementAsConstant(0)); - else - Count = CInt; - - auto Vec = II->getArgOperand(0); - auto VT = cast<VectorType>(Vec->getType()); - if (Count->getZExtValue() > - VT->getElementType()->getPrimitiveSizeInBits() - 1) - return ReplaceInstUsesWith( - CI, ConstantAggregateZero::get(Vec->getType())); - - bool isPackedShiftLeft = true; - switch (II->getIntrinsicID()) { - default : break; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: isPackedShiftLeft = false; break; - } - - unsigned VWidth = VT->getNumElements(); - // Get a constant vector of the same type as the first operand. - auto VTCI = ConstantInt::get(VT->getElementType(), Count->getZExtValue()); - if (isPackedShiftLeft) - return BinaryOperator::CreateShl(Vec, - Builder->CreateVectorSplat(VWidth, VTCI)); - - return BinaryOperator::CreateLShr(Vec, - Builder->CreateVectorSplat(VWidth, VTCI)); + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: { + if (Value *V = SimplifyX86immshift(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + Value *Arg1 = II->getArgOperand(1); + assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && + "Unexpected packed shift size"); + unsigned VWidth = Arg1->getType()->getVectorNumElements(); + + if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { + II->setArgOperand(1, V); + return II; + } + break; } - case Intrinsic::x86_sse41_pmovsxbw: - case Intrinsic::x86_sse41_pmovsxwd: - case Intrinsic::x86_sse41_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxbd: + case Intrinsic::x86_avx2_pmovsxbq: + case Intrinsic::x86_avx2_pmovsxbw: + case Intrinsic::x86_avx2_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxwd: + case Intrinsic::x86_avx2_pmovsxwq: + if (Value *V = SimplifyX86extend(*II, *Builder, true)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_pmovzxbd: + case Intrinsic::x86_sse41_pmovzxbq: case Intrinsic::x86_sse41_pmovzxbw: + case Intrinsic::x86_sse41_pmovzxdq: case Intrinsic::x86_sse41_pmovzxwd: - case Intrinsic::x86_sse41_pmovzxdq: { - // pmov{s|z}x ignores the upper half of their input vectors. - unsigned VWidth = - cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); - unsigned LowHalfElts = VWidth / 2; - APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts)); - APInt UndefElts(VWidth, 0); - if (Value *TmpV = SimplifyDemandedVectorElts( - II->getArgOperand(0), InputDemandedElts, UndefElts)) { - II->setArgOperand(0, TmpV); + case Intrinsic::x86_sse41_pmovzxwq: + case Intrinsic::x86_avx2_pmovzxbd: + case Intrinsic::x86_avx2_pmovzxbq: + case Intrinsic::x86_avx2_pmovzxbw: + case Intrinsic::x86_avx2_pmovzxdq: + case Intrinsic::x86_avx2_pmovzxwd: + case Intrinsic::x86_avx2_pmovzxwq: + if (Value *V = SimplifyX86extend(*II, *Builder, false)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_insertps: + if (Value *V = SimplifyX86insertps(*II, *Builder)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse4a_extrq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 16 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CILength = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)0)) + : nullptr; + ConstantInt *CIIndex = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or EXTRQI call. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) + return ReplaceInstUsesWith(*II, V); + + // EXTRQ only uses the lowest 64-bits of the first 128-bit vector + // operands and the lowest 16-bits of the second. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + return II; + } + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { + II->setArgOperand(1, V); return II; } break; } - case Intrinsic::x86_sse41_insertps: - if (Value *V = SimplifyX86insertps(*II, *Builder)) + + case Intrinsic::x86_sse4a_extrqi: { + // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining + // bits of the lower 64-bits. The upper 64-bits are undefined. + Value *Op0 = II->getArgOperand(0); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2)); + + // Attempt to simplify to a constant or shuffle vector. + if (Value *V = SimplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder)) return ReplaceInstUsesWith(*II, V); + + // EXTRQI only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::x86_sse4a_insertq: { + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth = Op0->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && + Op1->getType()->getVectorNumElements() == 2 && + "Unexpected operand size"); + + // See if we're dealing with constant values. + Constant *C1 = dyn_cast<Constant>(Op1); + ConstantInt *CI11 = + C1 ? dyn_cast<ConstantInt>(C1->getAggregateElement((unsigned)1)) + : nullptr; + + // Attempt to simplify to a constant, shuffle vector or INSERTQI call. + if (CI11) { + APInt V11 = CI11->getValue(); + APInt Len = V11.zextOrTrunc(6); + APInt Idx = V11.lshr(8).zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + + // INSERTQ only uses the lowest 64-bits of the first 128-bit vector + // operand. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { + II->setArgOperand(0, V); + return II; + } break; - + } + case Intrinsic::x86_sse4a_insertqi: { - // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top - // ones undef - // TODO: eventually we should lower this intrinsic to IR - if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) { - if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) { - unsigned Index = CIStart->getZExtValue(); - // From AMD documentation: "a value of zero in the field length is - // defined as length of 64". - unsigned Length = CIWidth->equalsInt(0) ? 64 : CIWidth->getZExtValue(); - - // From AMD documentation: "If the sum of the bit index + length field - // is greater than 64, the results are undefined". - - // Note that both field index and field length are 8-bit quantities. - // Since variables 'Index' and 'Length' are unsigned values - // obtained from zero-extending field index and field length - // respectively, their sum should never wrap around. - if ((Index + Length) > 64) - return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); - - if (Length == 64 && Index == 0) { - Value *Vec = II->getArgOperand(1); - Value *Undef = UndefValue::get(Vec->getType()); - const uint32_t Mask[] = { 0, 2 }; - return ReplaceInstUsesWith( - CI, - Builder->CreateShuffleVector( - Vec, Undef, ConstantDataVector::get( - II->getContext(), makeArrayRef(Mask)))); - - } else if (auto Source = - dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { - if (Source->hasOneUse() && - Source->getArgOperand(1) == II->getArgOperand(1)) { - // If the source of the insert has only one use and it's another - // insert (and they're both inserting from the same vector), try to - // bundle both together. - auto CISourceWidth = - dyn_cast<ConstantInt>(Source->getArgOperand(2)); - auto CISourceStart = - dyn_cast<ConstantInt>(Source->getArgOperand(3)); - if (CISourceStart && CISourceWidth) { - unsigned Start = CIStart->getZExtValue(); - unsigned Width = CIWidth->getZExtValue(); - unsigned End = Start + Width; - unsigned SourceStart = CISourceStart->getZExtValue(); - unsigned SourceWidth = CISourceWidth->getZExtValue(); - unsigned SourceEnd = SourceStart + SourceWidth; - unsigned NewStart, NewWidth; - bool ShouldReplace = false; - if (Start <= SourceStart && SourceStart <= End) { - NewStart = Start; - NewWidth = std::max(End, SourceEnd) - NewStart; - ShouldReplace = true; - } else if (SourceStart <= Start && Start <= SourceEnd) { - NewStart = SourceStart; - NewWidth = std::max(SourceEnd, End) - NewStart; - ShouldReplace = true; - } - - if (ShouldReplace) { - Constant *ConstantWidth = ConstantInt::get( - II->getArgOperand(2)->getType(), NewWidth, false); - Constant *ConstantStart = ConstantInt::get( - II->getArgOperand(3)->getType(), NewStart, false); - Value *Args[4] = { Source->getArgOperand(0), - II->getArgOperand(1), ConstantWidth, - ConstantStart }; - Module *M = CI.getParent()->getParent()->getParent(); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); - return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); - } - } - } - } - } + // INSERTQI: Extract lowest Length bits from lower half of second source and + // insert over first source starting at Index bit. The upper 64-bits are + // undefined. + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); + unsigned VWidth0 = Op0->getType()->getVectorNumElements(); + unsigned VWidth1 = Op1->getType()->getVectorNumElements(); + assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && + Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && + VWidth1 == 2 && "Unexpected operand sizes"); + + // See if we're dealing with constant values. + ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2)); + ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3)); + + // Attempt to simplify to a constant or shuffle vector. + if (CILength && CIIndex) { + APInt Len = CILength->getValue().zextOrTrunc(6); + APInt Idx = CIIndex->getValue().zextOrTrunc(6); + if (Value *V = SimplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder)) + return ReplaceInstUsesWith(*II, V); + } + + // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector + // operands. + if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { + II->setArgOperand(0, V); + return II; + } + + if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { + II->setArgOperand(1, V); + return II; } break; } @@ -894,7 +1345,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // This optimization is convoluted because the intrinsic is defined as // getting a vector of floats or doubles for the ps and pd versions. // FIXME: That should be changed. + + Value *Op0 = II->getArgOperand(0); + Value *Op1 = II->getArgOperand(1); Value *Mask = II->getArgOperand(2); + + // fold (blend A, A, Mask) -> A + if (Op0 == Op1) + return ReplaceInstUsesWith(CI, Op0); + + // Zero Mask - select 1st argument. + if (isa<ConstantAggregateZero>(Mask)) + return ReplaceInstUsesWith(CI, Op0); + + // Constant Mask - select 1st/2nd argument lane based on top bit of mask. if (auto C = dyn_cast<ConstantDataVector>(Mask)) { auto Tyi1 = Builder->getInt1Ty(); auto SelectorType = cast<VectorType>(Mask->getType()); @@ -917,11 +1381,50 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Selectors.push_back(ConstantInt::get(Tyi1, Selector >> (BitWidth - 1))); } auto NewSelector = ConstantVector::get(Selectors); - return SelectInst::Create(NewSelector, II->getArgOperand(1), - II->getArgOperand(0), "blendv"); - } else { - break; + return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); } + break; + } + + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_avx2_pshuf_b: { + // Turn pshufb(V1,mask) -> shuffle(V1,Zero,mask) if mask is a constant. + auto *V = II->getArgOperand(1); + auto *VTy = cast<VectorType>(V->getType()); + unsigned NumElts = VTy->getNumElements(); + assert((NumElts == 16 || NumElts == 32) && + "Unexpected number of elements in shuffle mask!"); + // Initialize the resulting shuffle mask to all zeroes. + uint32_t Indexes[32] = {0}; + + if (auto *Mask = dyn_cast<ConstantDataVector>(V)) { + // Each byte in the shuffle control mask forms an index to permute the + // corresponding byte in the destination operand. + for (unsigned I = 0; I < NumElts; ++I) { + int8_t Index = Mask->getElementAsInteger(I); + // If the most significant bit (bit[7]) of each byte of the shuffle + // control mask is set, then zero is written in the result byte. + // The zero vector is in the right-hand side of the resulting + // shufflevector. + + // The value of each index is the least significant 4 bits of the + // shuffle control byte. + Indexes[I] = (Index < 0) ? NumElts : Index & 0xF; + } + } else if (!isa<ConstantAggregateZero>(V)) + break; + + // The value of each index for the high 128-bit lane is the least + // significant 4 bits of the respective shuffle control byte. + for (unsigned I = 16; I < NumElts; ++I) + Indexes[I] += I & 0xF0; + + auto NewC = ConstantDataVector::get(V->getContext(), + makeArrayRef(Indexes, NumElts)); + auto V1 = II->getArgOperand(0); + auto V2 = Constant::getNullValue(II->getType()); + auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC); + return ReplaceInstUsesWith(CI, Shuffle); } case Intrinsic::x86_avx_vpermilvar_ps: @@ -972,6 +1475,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(*II, V); break; + case Intrinsic::x86_xop_vpcomb: + case Intrinsic::x86_xop_vpcomd: + case Intrinsic::x86_xop_vpcomq: + case Intrinsic::x86_xop_vpcomw: + if (Value *V = SimplifyX86vpcom(*II, *Builder, true)) + return ReplaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_xop_vpcomub: + case Intrinsic::x86_xop_vpcomud: + case Intrinsic::x86_xop_vpcomuq: + case Intrinsic::x86_xop_vpcomuw: + if (Value *V = SimplifyX86vpcom(*II, *Builder, false)) + return ReplaceInstUsesWith(*II, V); + break; + case Intrinsic::ppc_altivec_vperm: // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. // Note that ppc_altivec_vperm has a big-endian bias, so when creating @@ -1115,15 +1634,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // happen when variable allocas are DCE'd. if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) { if (SS->getIntrinsicID() == Intrinsic::stacksave) { - BasicBlock::iterator BI = SS; - if (&*++BI == II) + if (&*++SS->getIterator() == II) return EraseInstFromFunction(CI); } } // Scan down this block to see if there is another stack restore in the // same block without an intervening call/alloca. - BasicBlock::iterator BI = II; + BasicBlock::iterator BI(II); TerminatorInst *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { @@ -1153,6 +1671,29 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return EraseInstFromFunction(CI); break; } + case Intrinsic::lifetime_start: { + // Remove trivially empty lifetime_start/end ranges, i.e. a start + // immediately followed by an end (ignoring debuginfo or other + // lifetime markers in between). + BasicBlock::iterator BI = II->getIterator(), BE = II->getParent()->end(); + for (++BI; BI != BE; ++BI) { + if (IntrinsicInst *LTE = dyn_cast<IntrinsicInst>(BI)) { + if (isa<DbgInfoIntrinsic>(LTE) || + LTE->getIntrinsicID() == Intrinsic::lifetime_start) + continue; + if (LTE->getIntrinsicID() == Intrinsic::lifetime_end) { + if (II->getOperand(0) == LTE->getOperand(0) && + II->getOperand(1) == LTE->getOperand(1)) { + EraseInstFromFunction(*LTE); + return EraseInstFromFunction(*II); + } + continue; + } + } + break; + } + break; + } case Intrinsic::assume: { // Canonicalize assume(a && b) -> assume(a); assume(b); // Note: New assumption intrinsics created here are registered by @@ -1233,7 +1774,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } // isKnownNonNull -> nonnull attribute - if (isKnownNonNull(DerivedPtr)) + if (isKnownNonNullAt(DerivedPtr, II, DT, TLI)) II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); // isDereferenceablePointer -> deref attribute @@ -1355,9 +1896,10 @@ static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp, Value *TrampMem) { // Visit all the previous instructions in the basic block, and try to find a // init.trampoline which has a direct path to the adjust.trampoline. - for (BasicBlock::iterator I = AdjustTramp, - E = AdjustTramp->getParent()->begin(); I != E; ) { - Instruction *Inst = --I; + for (BasicBlock::iterator I = AdjustTramp->getIterator(), + E = AdjustTramp->getParent()->begin(); + I != E;) { + Instruction *Inst = &*--I; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) if (II->getIntrinsicID() == Intrinsic::init_trampoline && II->getOperand(0) == TrampMem) @@ -1400,20 +1942,27 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { // Mark any parameters that are known to be non-null with the nonnull // attribute. This is helpful for inlining calls to functions with null // checks on their arguments. + SmallVector<unsigned, 4> Indices; unsigned ArgNo = 0; + for (Value *V : CS.args()) { - if (!CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && - isKnownNonNull(V)) { - AttributeSet AS = CS.getAttributes(); - AS = AS.addAttribute(CS.getInstruction()->getContext(), ArgNo+1, - Attribute::NonNull); - CS.setAttributes(AS); - Changed = true; - } + if (V->getType()->isPointerTy() && !CS.paramHasAttr(ArgNo+1, Attribute::NonNull) && + isKnownNonNullAt(V, CS.getInstruction(), DT, TLI)) + Indices.push_back(ArgNo + 1); ArgNo++; } + assert(ArgNo == CS.arg_size() && "sanity check"); + if (!Indices.empty()) { + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, + Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + Changed = true; + } + // If the callee is a pointer to a function, attempt to move any casts to the // arguments of the call/invoke. Value *Callee = CS.getCalledValue(); @@ -1725,16 +2274,19 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(), attrVec); + SmallVector<OperandBundleDef, 1> OpBundles; + CS.getOperandBundlesAsDefs(OpBundles); + Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { - NC = Builder->CreateInvoke(Callee, II->getNormalDest(), - II->getUnwindDest(), Args); + NC = Builder->CreateInvoke(Callee, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles); NC->takeName(II); cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv()); cast<InvokeInst>(NC)->setAttributes(NewCallerPAL); } else { CallInst *CI = cast<CallInst>(Caller); - NC = Builder->CreateCall(Callee, Args); + NC = Builder->CreateCall(Callee, Args, OpBundles); NC->takeName(CI); if (CI->isTailCall()) cast<CallInst>(NC)->setTailCall(); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 48ab0eb..da835a1 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -21,11 +21,11 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear -/// expression. If so, decompose it, returning some value X, such that Val is +/// Analyze 'Val', seeing if it is a simple linear expression. +/// If so, decompose it, returning some value X, such that Val is /// X*Scale+Offset. /// -static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, +static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale, uint64_t &Offset) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { Offset = CI->getZExtValue(); @@ -62,7 +62,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, // where C1 is divisible by C2. unsigned SubScale; Value *SubVal = - DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); + decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); Offset += RHS->getZExtValue(); Scale = SubScale; return SubVal; @@ -76,14 +76,14 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, return Val; } -/// PromoteCastOfAllocation - If we find a cast of an allocation instruction, -/// try to eliminate the cast by moving the type information into the alloc. +/// If we find a cast of an allocation instruction, try to eliminate the cast by +/// moving the type information into the alloc. Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI) { PointerType *PTy = cast<PointerType>(CI.getType()); BuilderTy AllocaBuilder(*Builder); - AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); + AllocaBuilder.SetInsertPoint(&AI); // Get the type really allocated and the type casted to. Type *AllocElTy = AI.getAllocatedType(); @@ -114,7 +114,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, unsigned ArraySizeScale; uint64_t ArrayOffset; Value *NumElements = // See if the array size is a decomposable linear expr. - DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); + decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. @@ -154,9 +154,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, return ReplaceInstUsesWith(CI, New); } -/// EvaluateInDifferentType - Given an expression that -/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually -/// insert the code to evaluate the expression. +/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns +/// true for, actually insert the code to evaluate the expression. Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { @@ -261,9 +260,9 @@ isEliminableCastPair(const CastInst *CI, ///< First cast instruction return Instruction::CastOps(Res); } -/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually -/// results in any code being generated and is interesting to optimize out. If -/// the cast can be eliminated by some other simple transformation, we prefer +/// Return true if the cast from "V to Ty" actually results in any code being +/// generated and is interesting to optimize out. +/// If the cast can be eliminated by some other simple transformation, we prefer /// to do the simplification first. bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, Type *Ty) { @@ -318,9 +317,9 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { return nullptr; } -/// CanEvaluateTruncated - Return true if we can evaluate the specified -/// expression tree as type Ty instead of its larger type, and arrive with the -/// same value. This is used by code that tries to eliminate truncates. +/// Return true if we can evaluate the specified expression tree as type Ty +/// instead of its larger type, and arrive with the same value. +/// This is used by code that tries to eliminate truncates. /// /// Ty will always be a type smaller than V. We should return true if trunc(V) /// can be computed by computing V in the smaller type. If V is an instruction, @@ -329,7 +328,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { /// /// This function works on both vectors and scalars. /// -static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, +static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, Instruction *CxtI) { // We can always evaluate constants in another type. if (isa<Constant>(V)) @@ -359,8 +358,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, case Instruction::Or: case Instruction::Xor: // These operators can all arbitrarily be extended or truncated. - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); case Instruction::UDiv: case Instruction::URem: { @@ -371,8 +370,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth); if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) && IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) { - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); } } break; @@ -383,7 +382,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { uint32_t BitWidth = Ty->getScalarSizeInBits(); if (CI->getLimitedValue(BitWidth) < BitWidth) - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); } break; case Instruction::LShr: @@ -396,7 +395,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, if (IC.MaskedValueIsZero(I->getOperand(0), APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) && CI->getLimitedValue(BitWidth) < BitWidth) { - return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI); } } break; @@ -410,8 +409,8 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, return true; case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); - return CanEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && - CanEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); + return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) && + canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI); } case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never @@ -419,7 +418,7 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, // instructions with a single use. PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!CanEvaluateTruncated(IncValue, Ty, IC, CxtI)) + if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI)) return false; return true; } @@ -431,6 +430,50 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC, return false; } +/// Given a vector that is bitcast to an integer, optionally logically +/// right-shifted, and truncated, convert it to an extractelement. +/// Example (big endian): +/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32 +/// ---> +/// extractelement <4 x i32> %X, 1 +static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC, + const DataLayout &DL) { + Value *TruncOp = Trunc.getOperand(0); + Type *DestType = Trunc.getType(); + if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType)) + return nullptr; + + Value *VecInput = nullptr; + ConstantInt *ShiftVal = nullptr; + if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)), + m_LShr(m_BitCast(m_Value(VecInput)), + m_ConstantInt(ShiftVal)))) || + !isa<VectorType>(VecInput->getType())) + return nullptr; + + VectorType *VecType = cast<VectorType>(VecInput->getType()); + unsigned VecWidth = VecType->getPrimitiveSizeInBits(); + unsigned DestWidth = DestType->getPrimitiveSizeInBits(); + unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0; + + if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0)) + return nullptr; + + // If the element type of the vector doesn't match the result type, + // bitcast it to a vector type that we can extract from. + unsigned NumVecElts = VecWidth / DestWidth; + if (VecType->getElementType() != DestType) { + VecType = VectorType::get(DestType, NumVecElts); + VecInput = IC.Builder->CreateBitCast(VecInput, VecType, "bc"); + } + + unsigned Elt = ShiftAmount / DestWidth; + if (DL.isBigEndian()) + Elt = NumVecElts - 1 - Elt; + + return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); +} + Instruction *InstCombiner::visitTrunc(TruncInst &CI) { if (Instruction *Result = commonCastTransforms(CI)) return Result; @@ -441,7 +484,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // min/max. Value *LHS, *RHS; if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0))) - if (matchSelectPattern(SI, LHS, RHS) != SPF_UNKNOWN) + if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN) return nullptr; // See if we can simplify any instructions used by the input whose sole @@ -457,7 +500,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // expression tree to something weird like i93 unless the source is also // strange. if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateTruncated(Src, DestTy, *this, &CI)) { + canEvaluateTruncated(Src, DestTy, *this, &CI)) { // If this cast is a truncate, evaluting in a different type always // eliminates the cast, so it is always a win. @@ -470,7 +513,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector. if (DestTy->getScalarSizeInBits() == 1) { - Constant *One = ConstantInt::get(Src->getType(), 1); + Constant *One = ConstantInt::get(SrcTy, 1); Src = Builder->CreateAnd(Src, One); Value *Zero = Constant::getNullValue(Src->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); @@ -489,31 +532,54 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // If the shift amount is larger than the size of A, then the result is // known to be zero because all the input bits got shifted out. if (Cst->getZExtValue() >= ASize) - return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType())); + return ReplaceInstUsesWith(CI, Constant::getNullValue(DestTy)); // Since we're doing an lshr and a zero extend, and know that the shift // amount is smaller than ASize, it is always safe to do the shift in A's // type, then zero extend or truncate to the result. Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue()); Shift->takeName(Src); - return CastInst::CreateIntegerCast(Shift, CI.getType(), false); + return CastInst::CreateIntegerCast(Shift, DestTy, false); + } + + // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type + // conversion. + // It works because bits coming from sign extension have the same value as + // the sign bit of the original value; performing ashr instead of lshr + // generates bits of the same value as the sign bit. + if (Src->hasOneUse() && + match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) && + cast<Instruction>(Src)->getOperand(0)->hasOneUse()) { + const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + // This optimization can be only performed when zero bits generated by + // the original lshr aren't pulled into the value after truncation, so we + // can only shift by values smaller than the size of destination type (in + // bits). + if (Cst->getValue().ult(ASize)) { + Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue()); + Shift->takeName(Src); + return CastInst::CreateIntegerCast(Shift, CI.getType(), true); + } } // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest // type isn't non-native. - if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) && - ShouldChangeType(Src->getType(), CI.getType()) && + if (Src->hasOneUse() && isa<IntegerType>(SrcTy) && + ShouldChangeType(SrcTy, DestTy) && match(Src, m_And(m_Value(A), m_ConstantInt(Cst)))) { - Value *NewTrunc = Builder->CreateTrunc(A, CI.getType(), A->getName()+".tr"); + Value *NewTrunc = Builder->CreateTrunc(A, DestTy, A->getName() + ".tr"); return BinaryOperator::CreateAnd(NewTrunc, - ConstantExpr::getTrunc(Cst, CI.getType())); + ConstantExpr::getTrunc(Cst, DestTy)); } + if (Instruction *I = foldVecTruncToExtElt(CI, *this, DL)) + return I; + return nullptr; } -/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (zext icmp) to bitwise / integer operations in order to eliminate +/// the icmp. Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, bool DoXform) { // If we are just checking for a icmp eq of a single bit and zext'ing it @@ -637,8 +703,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, return nullptr; } -/// CanEvaluateZExtd - Determine if the specified value can be computed in the -/// specified wider type and produce the same low bits. If not, return false. +/// Determine if the specified value can be computed in the specified wider type +/// and produce the same low bits. If not, return false. /// /// If this function returns true, it can also return a non-zero number of bits /// (in BitsToClear) which indicates that the value it computes is correct for @@ -655,7 +721,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, /// clear the top bits anyway, doing this has no extra cost. /// /// This function works on both vectors and scalars. -static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, +static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, InstCombiner &IC, Instruction *CxtI) { BitsToClear = 0; if (isa<Constant>(V)) @@ -685,8 +751,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, case Instruction::Add: case Instruction::Sub: case Instruction::Mul: - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI)) return false; // These can all be promoted if neither operand has 'bits to clear'. if (BitsToClear == 0 && Tmp == 0) @@ -713,7 +779,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // We can promote shl(x, cst) if we can promote x. Since shl overwrites the // upper bits we can reduce BitsToClear by the shift amount. if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; uint64_t ShiftAmt = Amt->getZExtValue(); BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; @@ -724,7 +790,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { - if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI)) return false; BitsToClear += Amt->getZExtValue(); if (BitsToClear > V->getType()->getScalarSizeInBits()) @@ -734,8 +800,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // Cannot promote variable LSHR. return false; case Instruction::Select: - if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || - !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || + if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) || + !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) || // TODO: If important, we could handle the case when the BitsToClear are // known zero in the disagreeing side. Tmp != BitsToClear) @@ -747,10 +813,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear, // get into trouble with cyclic PHIs here because we only consider // instructions with a single use. PHINode *PN = cast<PHINode>(I); - if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) + if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI)) return false; for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) - if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || + if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) || // TODO: If important, we could handle the case when the BitsToClear // are known zero in the disagreeing input. Tmp != BitsToClear) @@ -787,13 +853,13 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // strange. unsigned BitsToClear; if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { + canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) { assert(BitsToClear < SrcTy->getScalarSizeInBits() && "Unreasonable BitsToClear"); // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" - " to avoid zero extend: " << CI); + " to avoid zero extend: " << CI << '\n'); Value *Res = EvaluateInDifferentType(Src, DestTy, false); assert(Res->getType() == DestTy); @@ -897,8 +963,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { return nullptr; } -/// transformSExtICmp - Transform (sext icmp) to bitwise / integer operations -/// in order to eliminate the icmp. +/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp. Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1); ICmpInst::Predicate Pred = ICI->getPredicate(); @@ -985,15 +1050,14 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { return nullptr; } -/// CanEvaluateSExtd - Return true if we can take the specified value -/// and return it as type Ty without inserting any new casts and without -/// changing the value of the common low bits. This is used by code that tries -/// to promote integer operations to a wider types will allow us to eliminate -/// the extension. +/// Return true if we can take the specified value and return it as type Ty +/// without inserting any new casts and without changing the value of the common +/// low bits. This is used by code that tries to promote integer operations to +/// a wider types will allow us to eliminate the extension. /// /// This function works on both vectors and scalars. /// -static bool CanEvaluateSExtd(Value *V, Type *Ty) { +static bool canEvaluateSExtd(Value *V, Type *Ty) { assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() && "Can't sign extend type to a smaller type"); // If this is a constant, it can be trivially promoted. @@ -1023,15 +1087,15 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { case Instruction::Sub: case Instruction::Mul: // These operators can all arbitrarily be extended if their inputs can. - return CanEvaluateSExtd(I->getOperand(0), Ty) && - CanEvaluateSExtd(I->getOperand(1), Ty); + return canEvaluateSExtd(I->getOperand(0), Ty) && + canEvaluateSExtd(I->getOperand(1), Ty); //case Instruction::Shl: TODO //case Instruction::LShr: TODO case Instruction::Select: - return CanEvaluateSExtd(I->getOperand(1), Ty) && - CanEvaluateSExtd(I->getOperand(2), Ty); + return canEvaluateSExtd(I->getOperand(1), Ty) && + canEvaluateSExtd(I->getOperand(2), Ty); case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never @@ -1039,7 +1103,7 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // instructions with a single use. PHINode *PN = cast<PHINode>(I); for (Value *IncValue : PN->incoming_values()) - if (!CanEvaluateSExtd(IncValue, Ty)) return false; + if (!canEvaluateSExtd(IncValue, Ty)) return false; return true; } default: @@ -1081,10 +1145,10 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // expression tree to something weird like i93 unless the source is also // strange. if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateSExtd(Src, DestTy)) { + canEvaluateSExtd(Src, DestTy)) { // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" - " to avoid sign extend: " << CI); + " to avoid sign extend: " << CI << '\n'); Value *Res = EvaluateInDifferentType(Src, DestTy, true); assert(Res->getType() == DestTy); @@ -1149,9 +1213,9 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { } -/// FitsInFPType - Return a Constant* for the specified FP constant if it fits +/// Return a Constant* for the specified floating-point constant if it fits /// in the specified FP type without changing its value. -static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { +static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { bool losesInfo; APFloat F = CFP->getValueAPF(); (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo); @@ -1160,12 +1224,12 @@ static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { return nullptr; } -/// LookThroughFPExtensions - If this is an fp extension instruction, look +/// If this is a floating-point extension instruction, look /// through it until we get the source value. -static Value *LookThroughFPExtensions(Value *V) { +static Value *lookThroughFPExtensions(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) if (I->getOpcode() == Instruction::FPExt) - return LookThroughFPExtensions(I->getOperand(0)); + return lookThroughFPExtensions(I->getOperand(0)); // If this value is a constant, return the constant in the smallest FP type // that can accurately represent it. This allows us to turn @@ -1174,14 +1238,14 @@ static Value *LookThroughFPExtensions(Value *V) { if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext())) return V; // No constant folding of this. // See if the value can be truncated to half and then reextended. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEhalf)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEhalf)) return V; // See if the value can be truncated to float and then reextended. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEsingle)) return V; if (CFP->getType()->isDoubleTy()) return V; // Won't shrink. - if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble)) + if (Value *V = fitsInFPType(CFP, APFloat::IEEEdouble)) return V; // Don't try to shrink to various long double types. } @@ -1193,7 +1257,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to - // simpilify this expression to avoid one or more of the trunc/extend + // simplify this expression to avoid one or more of the trunc/extend // operations if we can do so without changing the numerical results. // // The exact manner in which the widths of the operands interact to limit @@ -1201,8 +1265,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // is explained below in the various case statements. BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0)); if (OpI && OpI->hasOneUse()) { - Value *LHSOrig = LookThroughFPExtensions(OpI->getOperand(0)); - Value *RHSOrig = LookThroughFPExtensions(OpI->getOperand(1)); + Value *LHSOrig = lookThroughFPExtensions(OpI->getOperand(0)); + Value *RHSOrig = lookThroughFPExtensions(OpI->getOperand(1)); unsigned OpWidth = OpI->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSOrig->getType()->getFPMantissaWidth(); unsigned RHSWidth = RHSOrig->getType()->getFPMantissaWidth(); @@ -1307,10 +1371,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { // (fptrunc (select cond, R1, Cst)) --> // (select cond, (fptrunc R1), (fptrunc Cst)) + // + // - but only if this isn't part of a min/max operation, else we'll + // ruin min/max canonical form which is to have the select and + // compare's operands be of the same type with no casts to look through. + Value *LHS, *RHS; SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)); if (SI && (isa<ConstantFP>(SI->getOperand(1)) || - isa<ConstantFP>(SI->getOperand(2)))) { + isa<ConstantFP>(SI->getOperand(2))) && + matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) { Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1), CI.getType()); Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2), @@ -1327,9 +1397,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0), CI.getType()); Type *IntrinsicType[] = { CI.getType() }; - Function *Overload = - Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(), - II->getIntrinsicID(), IntrinsicType); + Function *Overload = Intrinsic::getDeclaration( + CI.getModule(), II->getIntrinsicID(), IntrinsicType); Value *Args[] = { InnerTrunc }; return CallInst::Create(Overload, Args, II->getName()); @@ -1483,12 +1552,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false); } -/// OptimizeVectorResize - This input value (which is known to have vector type) -/// is being zero extended or truncated to the specified vector type. Try to -/// replace it with a shuffle (and vector/vector bitcast) if possible. +/// This input value (which is known to have vector type) is being zero extended +/// or truncated to the specified vector type. +/// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, +static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. @@ -1548,8 +1617,8 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { return Value / Ty->getPrimitiveSizeInBits(); } -/// CollectInsertionElements - V is a value which is inserted into a vector of -/// VecEltTy. Look through the value to see if we can decompose it into +/// V is a value which is inserted into a vector of VecEltTy. +/// Look through the value to see if we can decompose it into /// insertions into the vector. See the example in the comment for /// OptimizeIntegerToVectorInsertions for the pattern this handles. /// The type of V is always a non-zero multiple of VecEltTy's size. @@ -1558,7 +1627,7 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) { /// /// This returns false if the pattern can't be matched or true if it can, /// filling in Elements with the elements found here. -static bool CollectInsertionElements(Value *V, unsigned Shift, +static bool collectInsertionElements(Value *V, unsigned Shift, SmallVectorImpl<Value *> &Elements, Type *VecEltTy, bool isBigEndian) { assert(isMultipleOfTypeSize(Shift, VecEltTy) && @@ -1595,7 +1664,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, // If the constant is the size of a vector element, we just need to bitcast // it to the right type so it gets properly inserted. if (NumElts == 1) - return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), + return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), Shift, Elements, VecEltTy, isBigEndian); // Okay, this is a constant that covers multiple elements. Slice it up into @@ -1611,7 +1680,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), ShiftI)); Piece = ConstantExpr::getTrunc(Piece, ElementIntTy); - if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, + if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy, isBigEndian)) return false; } @@ -1625,19 +1694,19 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); case Instruction::Or: - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian) && - CollectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, + collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy, isBigEndian); case Instruction::Shl: { // Must be shifting by a constant that is a multiple of the element size. @@ -1645,7 +1714,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, if (!CI) return false; Shift += CI->getZExtValue(); if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false; - return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, + return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy, isBigEndian); } @@ -1653,8 +1722,8 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, } -/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we -/// may be doing shifts and ors to assemble the elements of the vector manually. +/// If the input is an 'or' instruction, we may be doing shifts and ors to +/// assemble the elements of the vector manually. /// Try to rip the code out and replace it with insertelements. This is to /// optimize code like this: /// @@ -1667,13 +1736,13 @@ static bool CollectInsertionElements(Value *V, unsigned Shift, /// %tmp43 = bitcast i64 %ins35 to <2 x float> /// /// Into two insertelements that do "buildvector{%inc, %inc5}". -static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, +static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI, InstCombiner &IC) { VectorType *DestVecTy = cast<VectorType>(CI.getType()); Value *IntInput = CI.getOperand(0); SmallVector<Value*, 8> Elements(DestVecTy->getNumElements()); - if (!CollectInsertionElements(IntInput, 0, Elements, + if (!collectInsertionElements(IntInput, 0, Elements, DestVecTy->getElementType(), IC.getDataLayout().isBigEndian())) return nullptr; @@ -1692,63 +1761,29 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, return Result; } - -/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double -/// bitcast. The various long double bitcasts can't get in here. -static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI, InstCombiner &IC, +/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the +/// vector followed by extract element. The backend tends to handle bitcasts of +/// vectors better than bitcasts of scalars because vector registers are +/// usually not type-specific like scalar integer or scalar floating-point. +static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast, + InstCombiner &IC, const DataLayout &DL) { - Value *Src = CI.getOperand(0); - Type *DestTy = CI.getType(); - - // If this is a bitcast from int to float, check to see if the int is an - // extraction from a vector. - Value *VecInput = nullptr; - // bitcast(trunc(bitcast(somevector))) - if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) && - isa<VectorType>(VecInput->getType())) { - VectorType *VecTy = cast<VectorType>(VecInput->getType()); - unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - - if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) { - // If the element type of the vector doesn't match the result type, - // bitcast it to be a vector type we can extract from. - if (VecTy->getElementType() != DestTy) { - VecTy = VectorType::get(DestTy, - VecTy->getPrimitiveSizeInBits() / DestWidth); - VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); - } - - unsigned Elt = 0; - if (DL.isBigEndian()) - Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1; - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); - } - } + // TODO: Create and use a pattern matcher for ExtractElementInst. + auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0)); + if (!ExtElt || !ExtElt->hasOneUse()) + return nullptr; - // bitcast(trunc(lshr(bitcast(somevector), cst)) - ConstantInt *ShAmt = nullptr; - if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), - m_ConstantInt(ShAmt)))) && - isa<VectorType>(VecInput->getType())) { - VectorType *VecTy = cast<VectorType>(VecInput->getType()); - unsigned DestWidth = DestTy->getPrimitiveSizeInBits(); - if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 && - ShAmt->getZExtValue() % DestWidth == 0) { - // If the element type of the vector doesn't match the result type, - // bitcast it to be a vector type we can extract from. - if (VecTy->getElementType() != DestTy) { - VecTy = VectorType::get(DestTy, - VecTy->getPrimitiveSizeInBits() / DestWidth); - VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); - } + // The bitcast must be to a vectorizable type, otherwise we can't make a new + // type to extract from. + Type *DestType = BitCast.getType(); + if (!VectorType::isValidElementType(DestType)) + return nullptr; - unsigned Elt = ShAmt->getZExtValue() / DestWidth; - if (DL.isBigEndian()) - Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt; - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); - } - } - return nullptr; + unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements(); + auto *NewVecType = VectorType::get(DestType, NumElts); + auto *NewBC = IC.Builder->CreateBitCast(ExtElt->getVectorOperand(), + NewVecType, "bc"); + return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand()); } Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { @@ -1794,11 +1829,6 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } - // Try to optimize int -> float bitcasts. - if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) - if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this, DL)) - return I; - if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) { if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) { Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType()); @@ -1815,7 +1845,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { CastInst *SrcCast = cast<CastInst>(Src); if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) if (isa<VectorType>(BCIn->getOperand(0)->getType())) - if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0), + if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), cast<VectorType>(DestTy), *this)) return I; } @@ -1823,7 +1853,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If the input is an 'or' instruction, we may be doing shifts and ors to // assemble the elements of the vector manually. Try to rip the code out // and replace it with insertelements. - if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this)) + if (Value *V = optimizeIntegerToVectorInsertions(CI, *this)) return ReplaceInstUsesWith(CI, V); } } @@ -1872,6 +1902,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } + if (Instruction *I = canonicalizeBitCastExtElt(CI, *this, DL)) + return I; + if (SrcTy->isPointerTy()) return commonPointerCastTransforms(CI); return commonCastTransforms(CI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 95bba3c..c0786af 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -216,8 +216,6 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero, Max = KnownOne|UnknownBits; } - - /// FoldCmpLoadFromIndexedGlobal - Called we see this pattern: /// cmp pred (load (gep GV, ...)), cmpcst /// where GV is a global variable with a constant initializer. Try to simplify @@ -371,7 +369,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, } } - // If this element is in range, update our magic bitvector. if (i < 64 && IsTrueForElt) MagicBitvector |= 1ULL << i; @@ -469,7 +466,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End); } - // If a magic bitvector captures the entire comparison state // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 @@ -496,7 +492,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, return nullptr; } - /// EvaluateGEPOffsetExpression - Return a value that can be used to compare /// the *offset* implied by a GEP to zero. For example, if we have &A[i], we /// want to return 'i' for "icmp ne i, 0". Note that, in general, indices can @@ -562,8 +557,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, } } - - // Okay, we know we have a single variable index, which must be a // pointer/array/vector index. If there is no offset, life is simple, return // the index. @@ -737,6 +730,83 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, return nullptr; } +Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, + Value *Other) { + assert(ICI.isEquality() && "Cannot fold non-equality comparison."); + + // It would be tempting to fold away comparisons between allocas and any + // pointer not based on that alloca (e.g. an argument). However, even + // though such pointers cannot alias, they can still compare equal. + // + // But LLVM doesn't specify where allocas get their memory, so if the alloca + // doesn't escape we can argue that it's impossible to guess its value, and we + // can therefore act as if any such guesses are wrong. + // + // The code below checks that the alloca doesn't escape, and that it's only + // used in a comparison once (the current instruction). The + // single-comparison-use condition ensures that we're trivially folding all + // comparisons against the alloca consistently, and avoids the risk of + // erroneously folding a comparison of the pointer with itself. + + unsigned MaxIter = 32; // Break cycles and bound to constant-time. + + SmallVector<Use *, 32> Worklist; + for (Use &U : Alloca->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + + unsigned NumCmps = 0; + while (!Worklist.empty()) { + assert(Worklist.size() <= MaxIter); + Use *U = Worklist.pop_back_val(); + Value *V = U->getUser(); + --MaxIter; + + if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) || + isa<SelectInst>(V)) { + // Track the uses. + } else if (isa<LoadInst>(V)) { + // Loading from the pointer doesn't escape it. + continue; + } else if (auto *SI = dyn_cast<StoreInst>(V)) { + // Storing *to* the pointer is fine, but storing the pointer escapes it. + if (SI->getValueOperand() == U->get()) + return nullptr; + continue; + } else if (isa<ICmpInst>(V)) { + if (NumCmps++) + return nullptr; // Found more than one cmp. + continue; + } else if (auto *Intrin = dyn_cast<IntrinsicInst>(V)) { + switch (Intrin->getIntrinsicID()) { + // These intrinsics don't escape or compare the pointer. Memset is safe + // because we don't allow ptrtoint. Memcpy and memmove are safe because + // we don't allow stores, so src cannot point to V. + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::dbg_declare: case Intrinsic::dbg_value: + case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: + continue; + default: + return nullptr; + } + } else { + return nullptr; + } + for (Use &U : V->uses()) { + if (Worklist.size() >= MaxIter) + return nullptr; + Worklist.push_back(&U); + } + } + + Type *CmpTy = CmpInst::makeCmpResultType(Other->getType()); + return ReplaceInstUsesWith( + ICI, + ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate()))); +} + /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X". Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI, @@ -851,7 +921,6 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, // to the same result value. HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false); } - } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0. if (CmpRHSV == 0) { // (X / pos) op 0 // Can't overflow. e.g. X/2 op 0 --> [-1, 2) @@ -996,7 +1065,6 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, return Res; } - // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; @@ -1074,18 +1142,22 @@ Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A, if (AP1 == AP2) return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); - // Get the distance between the highest bit that's set. int Shift; - // Both the constants are negative, take their positive to calculate log. if (IsAShr && AP1.isNegative()) - // Get the ones' complement of AP2 and AP1 when computing the distance. - Shift = (~AP2).logBase2() - (~AP1).logBase2(); + Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes(); else - Shift = AP2.logBase2() - AP1.logBase2(); + Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros(); if (Shift > 0) { - if (IsAShr ? AP1 == AP2.ashr(Shift) : AP1 == AP2.lshr(Shift)) + if (IsAShr && AP1 == AP2.ashr(Shift)) { + // There are multiple solutions if we are comparing against -1 and the LHS + // of the ashr is not a power of two. + if (AP1.isAllOnesValue() && !AP2.isPowerOf2()) + return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift)); + return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } else if (AP1 == AP2.lshr(Shift)) { return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); + } } // Shifting const2 will never be equal to const1. return getConstant(false); @@ -1145,6 +1217,14 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, switch (LHSI->getOpcode()) { case Instruction::Trunc: + if (RHS->isOne() && RHSV.getBitWidth() > 1) { + // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (ICI.getPredicate() == ICmpInst::ICMP_SLT && + match(LHSI->getOperand(0), m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } if (ICI.isEquality() && LHSI->hasOneUse()) { // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all // of the high bits truncated out of x are known. @@ -1447,9 +1527,35 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ICI.getPredicate() == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE, LHSI->getOperand(0), SubOne(RHS)); + + // (icmp eq (and %A, C), 0) -> (icmp sgt (trunc %A), -1) + // iff C is a power of 2 + if (ICI.isEquality() && LHSI->hasOneUse() && match(RHS, m_Zero())) { + if (auto *CI = dyn_cast<ConstantInt>(LHSI->getOperand(1))) { + const APInt &AI = CI->getValue(); + int32_t ExactLogBase2 = AI.exactLogBase2(); + if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { + Type *NTy = IntegerType::get(ICI.getContext(), ExactLogBase2 + 1); + Value *Trunc = Builder->CreateTrunc(LHSI->getOperand(0), NTy); + return new ICmpInst(ICI.getPredicate() == ICmpInst::ICMP_EQ + ? ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + Trunc, Constant::getNullValue(NTy)); + } + } + } break; case Instruction::Or: { + if (RHS->isOne()) { + // icmp slt signum(V) 1 --> icmp slt V, 1 + Value *V = nullptr; + if (ICI.getPredicate() == ICmpInst::ICMP_SLT && + match(LHSI, m_Signum(m_Value(V)))) + return new ICmpInst(ICmpInst::ICMP_SLT, V, + ConstantInt::get(V->getType(), 1)); + } + if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse()) break; Value *P, *Q; @@ -2083,11 +2189,9 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // If the pattern matches, truncate the inputs to the narrower type and // use the sadd_with_overflow intrinsic to efficiently compute both the // result and the overflow bit. - Module *M = I.getParent()->getParent()->getParent(); - Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); - Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, - NewType); + Value *F = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::sadd_with_overflow, NewType); InstCombiner::BuilderTy *Builder = IC.Builder; @@ -2123,6 +2227,12 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, return true; }; + // If the overflow check was an add followed by a compare, the insertion point + // may be pointing to the compare. We want to insert the new instructions + // before the add in case there are uses of the add between the add and the + // compare. + Builder->SetInsertPoint(&OrigI); + switch (OCF) { case OCF_INVALID: llvm_unreachable("bad overflow check kind!"); @@ -2223,7 +2333,9 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal); assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); - Instruction *MulInstr = cast<Instruction>(MulVal); + auto *MulInstr = dyn_cast<Instruction>(MulVal); + if (!MulInstr) + return nullptr; assert(MulInstr->getOpcode() == Instruction::Mul); auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)), @@ -2357,7 +2469,6 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, InstCombiner::BuilderTy *Builder = IC.Builder; Builder->SetInsertPoint(MulInstr); - Module *M = I.getParent()->getParent()->getParent(); // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B) Value *MulA = A, *MulB = B; @@ -2365,8 +2476,8 @@ static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulA = Builder->CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder->CreateZExt(B, MulType); - Value *F = - Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, MulType); + Value *F = Intrinsic::getDeclaration(I.getModule(), + Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder->CreateCall(F, {MulA, MulB}, "umul"); IC.Worklist.Add(MulInstr); @@ -2468,7 +2579,6 @@ static APInt DemandedBitsLHSMask(ICmpInst &I, default: return APInt::getAllOnesValue(BitWidth); } - } /// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst @@ -2905,7 +3015,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - break; } case ICmpInst::ICMP_NE: { @@ -2950,7 +3059,6 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt::get(X->getType(), CI->countTrailingZeros())); } - break; } case ICmpInst::ICMP_ULT: @@ -3103,7 +3211,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // comparison into the select arms, which will cause one to be // constant folded and the select turned into a bitwise or. Value *Op1 = nullptr, *Op2 = nullptr; - ConstantInt *CI = 0; + ConstantInt *CI = nullptr; if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) { Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC); CI = dyn_cast<ConstantInt>(Op1); @@ -3177,6 +3285,17 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ICmpInst::getSwappedPredicate(I.getPredicate()), I)) return NI; + // Try to optimize equality comparisons against alloca-based pointers. + if (Op0->getType()->isPointerTy() && I.isEquality()) { + assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?"); + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op0, DL))) + if (Instruction *New = FoldAllocaCmp(I, Alloca, Op1)) + return New; + if (auto *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Op1, DL))) + if (Instruction *New = FoldAllocaCmp(I, Alloca, Op0)) + return New; + } + // Test to see if the operands of the icmp are casted versions of other // values. If the ptr->ptr cast can be stripped off both arguments, we do so // now. @@ -3304,6 +3423,26 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); + // icmp sgt X, (Y + -1) -> icmp sge X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); + + // icmp sle X, (Y + -1) -> icmp slt X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && + match(D, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); + + // icmp sge X, (Y + 1) -> icmp sgt X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && + match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); + + // icmp slt X, (Y + 1) -> icmp sle X, Y + if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && + match(D, m_One())) + return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); + // if C1 has greater magnitude than C2: // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y // s.t. C3 = C1 - C2 @@ -3473,6 +3612,18 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } } + + if (BO0) { + // Transform A & (L - 1) `ult` L --> L != 0 + auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes()); + auto BitwiseAnd = + m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); + + if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) { + auto *Zero = Constant::getNullValue(BO0->getType()); + return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero); + } + } } { Value *A, *B; @@ -3697,15 +3848,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); - // Check to see that the input is converted from an integer type that is small - // enough that preserves all bits. TODO: check here for "known" sign bits. - // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. - unsigned InputSize = IntTy->getScalarSizeInBits(); - - // If this is a uitofp instruction, we need an extra bit to hold the sign. bool LHSUnsigned = isa<UIToFPInst>(LHSI); - if (LHSUnsigned) - ++InputSize; if (I.isEquality()) { FCmpInst::Predicate P = I.getPredicate(); @@ -3732,13 +3875,30 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // equality compares as integer? } - // Comparisons with zero are a special case where we know we won't lose - // information. - bool IsCmpZero = RHS.isPosZero(); + // Check to see that the input is converted from an integer type that is small + // enough that preserves all bits. TODO: check here for "known" sign bits. + // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e. + unsigned InputSize = IntTy->getScalarSizeInBits(); - // If the conversion would lose info, don't hack on this. - if ((int)InputSize > MantissaWidth && !IsCmpZero) - return nullptr; + // Following test does NOT adjust InputSize downwards for signed inputs, + // because the most negative value still requires all the mantissa bits + // to distinguish it from one less than that value. + if ((int)InputSize > MantissaWidth) { + // Conversion would lose accuracy. Check if loss can impact comparison. + int Exp = ilogb(RHS); + if (Exp == APFloat::IEK_Inf) { + int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics())); + if (MaxExponent < (int)InputSize - !LHSUnsigned) + // Conversion could create infinity. + return nullptr; + } else { + // Note that if RHS is zero or NaN, then Exp is negative + // and first condition is trivially false. + if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned) + // Conversion could affect comparison. + return nullptr; + } + } // Otherwise, we can potentially simplify the comparison. We know that it // will always come through as an integer value and we know the constant is diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index ac934f1..534f670 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -281,6 +281,7 @@ public: ICmpInst::Predicate Pred); Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I); + Instruction *FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, Value *Other); Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1, BinaryOperator &I); Instruction *commonCastTransforms(CastInst &CI); @@ -341,6 +342,7 @@ public: const unsigned SIOpd); private: + bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const; @@ -360,6 +362,11 @@ private: /// \brief Try to optimize a sequence of instructions checking if an operation /// on LHS and RHS overflows. /// + /// If this overflow check is done via one of the overflow check intrinsics, + /// then CtxI has to be the call instruction calling that intrinsic. If this + /// overflow check is done by arithmetic followed by a compare, then CtxI has + /// to be the arithmetic instruction. + /// /// If a simplification is possible, stores the simplified result of the /// operation in OperationResult and result of the overflow check in /// OverflowResult, and return true. If no simplification is possible, @@ -393,7 +400,7 @@ public: assert(New && !New->getParent() && "New instruction already inserted into a basic block!"); BasicBlock *BB = Old.getParent(); - BB->getInstList().insert(&Old, New); // Insert inst + BB->getInstList().insert(Old.getIterator(), New); // Insert inst Worklist.Add(New); return New; } @@ -539,6 +546,7 @@ private: Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN); Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); + Instruction *FoldPHIArgZextsIntoPHI(PHINode &PN); Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd); @@ -548,7 +556,7 @@ private: Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, bool isSigned, bool Inside); Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI); - Instruction *MatchBSwap(BinaryOperator &I); + Instruction *MatchBSwapOrBitReverse(BinaryOperator &I); bool SimplifyStoreAtEndOfBlock(StoreInst &SI); Instruction *SimplifyMemTransfer(MemIntrinsic *MI); Instruction *SimplifyMemSet(MemSetInst *MI); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index e3179db..47406b9 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" @@ -90,21 +91,23 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (CS.isCallee(&U)) continue; + unsigned DataOpNo = CS.getDataOperandNo(&U); + bool IsArgOperand = CS.isArgOperand(&U); + // Inalloca arguments are clobbered by the call. - unsigned ArgNo = CS.getArgumentNo(&U); - if (CS.isInAllocaArgument(ArgNo)) + if (IsArgOperand && CS.isInAllocaArgument(DataOpNo)) return false; // If this is a readonly/readnone call site, then we know it is just a // load (but one that potentially returns the value itself), so we can // ignore it if we know that the value isn't captured. if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) + (CS.getInstruction()->use_empty() || CS.doesNotCapture(DataOpNo))) continue; // If this is being passed as a byval argument, the caller is making a // copy, so it is only a read of the alloca. - if (CS.isByValArgument(ArgNo)) + if (IsArgOperand && CS.isByValArgument(DataOpNo)) continue; } @@ -186,7 +189,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { // Scan to the end of the allocation instructions, to skip over a block of // allocas if possible...also skip interleaved debug info // - BasicBlock::iterator It = New; + BasicBlock::iterator It(New); while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It; @@ -367,7 +370,13 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT MDB.createRange(NonNullInt, NullInt)); } break; - + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + // These only directly apply if the new type is also a pointer. + if (NewTy->isPointerTy()) + NewLoad->setMetadata(ID, N); + break; case LLVMContext::MD_range: // FIXME: It would be nice to propagate this in some way, but the type // conversions make it hard. If the new type is a pointer, we could @@ -418,6 +427,9 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value case LLVMContext::MD_invariant_load: case LLVMContext::MD_nonnull: case LLVMContext::MD_range: + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: // These don't apply for stores. break; } @@ -511,16 +523,46 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { if (!T->isAggregateType()) return nullptr; - assert(LI.getAlignment() && "Alignement must be set at this point"); + assert(LI.getAlignment() && "Alignment must be set at this point"); if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. - if (ST->getNumElements() == 1) { + unsigned Count = ST->getNumElements(); + if (Count == 1) { LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), ".unpack"); return IC.ReplaceInstUsesWith(LI, IC.Builder->CreateInsertValue( UndefValue::get(T), NewLoad, 0, LI.getName())); } + + // We don't want to break loads with padding here as we'd loose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return nullptr; + + auto Name = LI.getName(); + SmallString<16> LoadName = Name; + LoadName += ".unpack"; + SmallString<16> EltName = Name; + EltName += ".elt"; + auto *Addr = LI.getPointerOperand(); + Value *V = UndefValue::get(T); + auto *IdxType = Type::getInt32Ty(ST->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + for (unsigned i = 0; i < Count; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), EltName); + auto *L = IC.Builder->CreateLoad(ST->getTypeAtIndex(i), Ptr, LoadName); + V = IC.Builder->CreateInsertValue(V, L, i); + } + + V->setName(Name); + return IC.ReplaceInstUsesWith(LI, V); } if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -681,7 +723,7 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, // FIXME: If the GEP is not inbounds, and there are extra indices after the // one we'll replace, those could cause the address computation to wrap // (rendering the IsAllNonNegative() check below insufficient). We can do - // better, ignoring zero indicies (and other indicies we can prove small + // better, ignoring zero indices (and other indices we can prove small // enough not to wrap). if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds()) return false; @@ -748,19 +790,19 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. - BasicBlock::iterator BBI = &LI; + BasicBlock::iterator BBI(LI); AAMDNodes AATags; - if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI, - 6, AA, &AATags)) { + if (Value *AvailableVal = + FindAvailableLoadedValue(Op, LI.getParent(), BBI, + DefMaxInstsToScan, AA, &AATags)) { if (LoadInst *NLI = dyn_cast<LoadInst>(AvailableVal)) { unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_invariant_load, - LLVMContext::MD_nonnull, - }; + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull, + LLVMContext::MD_invariant_group, LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null}; combineMetadata(NLI, &LI, KnownIDs); }; @@ -822,7 +864,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { } // load (select (cond, null, P)) -> load P - if (isa<ConstantPointerNull>(SI->getOperand(1)) && + if (isa<ConstantPointerNull>(SI->getOperand(1)) && LI.getPointerAddressSpace() == 0) { LI.setOperand(0, SI->getOperand(2)); return &LI; @@ -857,7 +899,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { /// /// \returns true if the store was successfully combined away. This indicates /// the caller must erase the store instruction. We have to let the caller erase -/// the store instruction sas otherwise there is no way to signal whether it was +/// the store instruction as otherwise there is no way to signal whether it was /// combined or not: IC.EraseInstFromFunction returns a null pointer. static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) { // FIXME: We could probably with some care handle both volatile and atomic @@ -893,11 +935,38 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) { if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. - if (ST->getNumElements() == 1) { + unsigned Count = ST->getNumElements(); + if (Count == 1) { V = IC.Builder->CreateExtractValue(V, 0); combineStoreToNewValue(IC, SI, V); return true; } + + // We don't want to break loads with padding here as we'd loose + // the knowledge that padding exists for the rest of the pipeline. + const DataLayout &DL = IC.getDataLayout(); + auto *SL = DL.getStructLayout(ST); + if (SL->hasPadding()) + return false; + + SmallString<16> EltName = V->getName(); + EltName += ".elt"; + auto *Addr = SI.getPointerOperand(); + SmallString<16> AddrName = Addr->getName(); + AddrName += ".repack"; + auto *IdxType = Type::getInt32Ty(ST->getContext()); + auto *Zero = ConstantInt::get(IdxType, 0); + for (unsigned i = 0; i < Count; i++) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i), + }; + auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices), AddrName); + auto *Val = IC.Builder->CreateExtractValue(V, i, EltName); + IC.Builder->CreateStore(Val, Ptr); + } + + return true; } if (auto *AT = dyn_cast<ArrayType>(T)) { @@ -971,9 +1040,9 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return &SI; } - // Don't hack volatile/atomic stores. - // FIXME: Some bits are legal for atomic stores; needs refactoring. - if (!SI.isSimple()) return nullptr; + // Don't hack volatile/ordered stores. + // FIXME: Some bits are legal for ordered atomic stores; needs refactoring. + if (!SI.isUnordered()) return nullptr; // If the RHS is an alloca with a single use, zapify the store, making the // alloca dead. @@ -991,7 +1060,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // Do really simple DSE, to catch cases where there are several consecutive // stores to the same location, separated by a few arithmetic operations. This // situation often occurs with bitfield accesses. - BasicBlock::iterator BBI = &SI; + BasicBlock::iterator BBI(SI); for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts; --ScanInsts) { --BBI; @@ -1005,7 +1074,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { // Prev store isn't volatile, and stores to the same location? - if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1), + if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1))) { ++NumDeadStore; ++BBI; @@ -1019,9 +1088,10 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // the pointer we're loading and is producing the pointer we're storing, // then *this* store is dead (X = load P; store X -> P). if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) { - if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) && - LI->isSimple()) + if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) { + assert(SI.isUnordered() && "can't eliminate ordering operation"); return EraseInstFromFunction(SI); + } // Otherwise, this is a load from some other location. Stores before it // may not be dead. @@ -1047,10 +1117,14 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (isa<UndefValue>(Val)) return EraseInstFromFunction(SI); + // The code below needs to be audited and adjusted for unordered atomics + if (!SI.isSimple()) + return nullptr; + // If this store is the last instruction in the basic block (possibly // excepting debug info instructions), and if the block ends with an // unconditional branch, try to move it to the successor block. - BBI = &SI; + BBI = SI.getIterator(); do { ++BBI; } while (isa<DbgInfoIntrinsic>(BBI) || @@ -1106,7 +1180,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; // Verify that the other block ends in a branch and is not otherwise empty. - BasicBlock::iterator BBI = OtherBB->getTerminator(); + BasicBlock::iterator BBI(OtherBB->getTerminator()); BranchInst *OtherBr = dyn_cast<BranchInst>(BBI); if (!OtherBr || BBI == OtherBB->begin()) return false; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index a554e9f..7ad0efc 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -22,9 +22,9 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// simplifyValueKnownNonZero - The specific integer value is used in a context -/// where it is known to be non-zero. If this allows us to simplify the -/// computation, do so and return the new operand, otherwise return null. +/// The specific integer value is used in a context where it is known to be +/// non-zero. If this allows us to simplify the computation, do so and return +/// the new operand, otherwise return null. static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, Instruction &CxtI) { // If V has multiple uses, then we would have to do more analysis to determine @@ -76,8 +76,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, } -/// MultiplyOverflows - True if the multiply can not be expressed in an int -/// this size. +/// True if the multiply can not be expressed in an int this size. static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product, bool IsSigned) { bool Overflow; @@ -95,6 +94,14 @@ static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient, assert(C1.getBitWidth() == C2.getBitWidth() && "Inconsistent width of constants!"); + // Bail if we will divide by zero. + if (C2.isMinValue()) + return false; + + // Bail if we would divide INT_MIN by -1. + if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue()) + return false; + APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned); if (IsSigned) APInt::sdivrem(C1, C2, Quotient, Remainder); @@ -705,8 +712,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { return Changed ? &I : nullptr; } -/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select -/// instruction. +/// Try to fold a divide or remainder of a select instruction. bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { SelectInst *SI = cast<SelectInst>(I.getOperand(1)); @@ -740,7 +746,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { return true; // Scan the current block backward, looking for other uses of SI. - BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin(); + BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin(); while (BBI != BBFront) { --BBI; @@ -754,10 +760,10 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { I != E; ++I) { if (*I == SI) { *I = SI->getOperand(NonNullOperand); - Worklist.Add(BBI); + Worklist.Add(&*BBI); } else if (*I == SelectCond) { *I = Builder->getInt1(NonNullOperand == 1); - Worklist.Add(BBI); + Worklist.Add(&*BBI); } } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 460f6eb..f1aa98b 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "instcombine" @@ -245,7 +246,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { /// non-address-taken alloca. Doing so will cause us to not promote the alloca /// to a register. static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { - BasicBlock::iterator BBI = L, E = L->getParent()->end(); + BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end(); for (++BBI; BBI != E; ++BBI) if (BBI->mayWriteToMemory()) @@ -349,24 +350,40 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { Value *InVal = FirstLI->getOperand(0); NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); + LoadInst *NewLI = new LoadInst(NewPN, "", isVolatile, LoadAlignment); + + unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_invariant_load, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_nonnull, + LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_dereferenceable_or_null, + }; - // Add all operands to the new PHI. + for (unsigned ID : KnownIDs) + NewLI->setMetadata(ID, FirstLI->getMetadata(ID)); + + // Add all operands to the new PHI and combine TBAA metadata. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { - Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0); + LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i)); + combineMetadata(NewLI, LI, KnownIDs); + Value *NewInVal = LI->getOperand(0); if (NewInVal != InVal) InVal = nullptr; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } - Value *PhiVal; if (InVal) { // The new PHI unions all of the same values together. This is really // common, so we handle it intelligently here for compile-time speed. - PhiVal = InVal; + NewLI->setOperand(0, InVal); delete NewPN; } else { InsertNewInstBefore(NewPN, PN); - PhiVal = NewPN; } // If this was a volatile load that we are merging, make sure to loop through @@ -376,17 +393,94 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { for (Value *IncValue : PN.incoming_values()) cast<LoadInst>(IncValue)->setVolatile(false); - LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment); NewLI->setDebugLoc(FirstLI->getDebugLoc()); return NewLI; } +/// TODO: This function could handle other cast types, but then it might +/// require special-casing a cast from the 'i1' type. See the comment in +/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types. +Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (TerminatorInst *TI = Phi.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + + // Early exit for the common case of a phi with two operands. These are + // handled elsewhere. See the comment below where we check the count of zexts + // and constants for more details. + unsigned NumIncomingValues = Phi.getNumIncomingValues(); + if (NumIncomingValues < 3) + return nullptr; + // Find the narrower type specified by the first zext. + Type *NarrowType = nullptr; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + NarrowType = Zext->getSrcTy(); + break; + } + } + if (!NarrowType) + return nullptr; + + // Walk the phi operands checking that we only have zexts or constants that + // we can shrink for free. Store the new operands for the new phi. + SmallVector<Value *, 4> NewIncoming; + unsigned NumZexts = 0; + unsigned NumConsts = 0; + for (Value *V : Phi.incoming_values()) { + if (auto *Zext = dyn_cast<ZExtInst>(V)) { + // All zexts must be identical and have one use. + if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUse()) + return nullptr; + NewIncoming.push_back(Zext->getOperand(0)); + NumZexts++; + } else if (auto *C = dyn_cast<Constant>(V)) { + // Make sure that constants can fit in the new type. + Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType); + if (ConstantExpr::getZExt(Trunc, C->getType()) != C) + return nullptr; + NewIncoming.push_back(Trunc); + NumConsts++; + } else { + // If it's not a cast or a constant, bail out. + return nullptr; + } + } + + // The more common cases of a phi with no constant operands or just one + // variable operand are handled by FoldPHIArgOpIntoPHI() and FoldOpIntoPhi() + // respectively. FoldOpIntoPhi() wants to do the opposite transform that is + // performed here. It tries to replicate a cast in the phi operand's basic + // block to expose other folding opportunities. Thus, InstCombine will + // infinite loop without this check. + if (NumConsts == 0 || NumZexts < 2) + return nullptr; + + // All incoming values are zexts or constants that are safe to truncate. + // Create a new phi node of the narrow type, phi together all of the new + // operands, and zext the result back to the original type. + PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues, + Phi.getName() + ".shrunk"); + for (unsigned i = 0; i != NumIncomingValues; ++i) + NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i)); + + InsertNewInstBefore(NewPhi, Phi); + return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); +} /// If all operands to a PHI node are the same "unary" operator and they all are /// only used by the PHI, PHI together their inputs, and do the operation once, /// to the result of the PHI. Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { + // We cannot create a new instruction after the PHI if the terminator is an + // EHPad because there is no valid insertion point. + if (TerminatorInst *TI = PN.getParent()->getTerminator()) + if (TI->isEHPad()) + return nullptr; + Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); if (isa<GetElementPtrInst>(FirstInst)) @@ -740,7 +834,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { } // Otherwise, do an extract in the predecessor. - Builder->SetInsertPoint(Pred, Pred->getTerminator()); + Builder->SetInsertPoint(Pred->getTerminator()); Value *Res = InVal; if (Offset) Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(), @@ -787,6 +881,9 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AC)) return ReplaceInstUsesWith(PN, V); + if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN)) + return Result; + // If all PHI operands are the same operation, pull them through the PHI, // reducing code size. if (isa<Instruction>(PN.getIncomingValue(0)) && diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f51442a..776704d 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -38,7 +38,8 @@ getInverseMinMaxSelectPattern(SelectPatternFlavor SPF) { } } -static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { +static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF, + bool Ordered=false) { switch (SPF) { default: llvm_unreachable("unhandled!"); @@ -51,17 +52,22 @@ static CmpInst::Predicate getICmpPredicateForMinMax(SelectPatternFlavor SPF) { return ICmpInst::ICMP_SGT; case SPF_UMAX: return ICmpInst::ICMP_UGT; + case SPF_FMINNUM: + return Ordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; + case SPF_FMAXNUM: + return Ordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; } } static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy *Builder, SelectPatternFlavor SPF, Value *A, Value *B) { - CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); + CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF); + assert(CmpInst::isIntPredicate(Pred)); return Builder->CreateSelect(Builder->CreateICmp(Pred, A, B), A, B); } -/// GetSelectFoldableOperands - We want to turn code that looks like this: +/// We want to turn code that looks like this: /// %C = or %A, %B /// %D = select %cond, %C, %A /// into: @@ -90,8 +96,8 @@ static unsigned GetSelectFoldableOperands(Instruction *I) { } } -/// GetSelectFoldableConstant - For the same transformation as the previous -/// function, return the identity constant that goes into the select. +/// For the same transformation as the previous function, return the identity +/// constant that goes into the select. static Constant *GetSelectFoldableConstant(Instruction *I) { switch (I->getOpcode()) { default: llvm_unreachable("This cannot happen!"); @@ -110,7 +116,7 @@ static Constant *GetSelectFoldableConstant(Instruction *I) { } } -/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI +/// Here we have (select c, TI, FI), and we know that TI and FI /// have the same opcode and only one use each. Try to simplify this. Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI) { @@ -197,8 +203,8 @@ static bool isSelect01(Constant *C1, Constant *C2) { C2I->isOne() || C2I->isAllOnesValue(); } -/// FoldSelectIntoOp - Try fold the select into one of the operands to -/// facilitate further optimization. +/// Try to fold the select into one of the operands to allow further +/// optimization. Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, Value *FalseVal) { // See the comment above GetSelectFoldableOperands for a description of the @@ -276,7 +282,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal, return nullptr; } -/// foldSelectICmpAndOr - We want to turn: +/// We want to turn: /// (select (icmp eq (and X, C1), 0), Y, (or Y, C2)) /// into: /// (or (shl (and X, C1), C3), y) @@ -394,9 +400,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, return nullptr; } -/// visitSelectInstWithICmp - Visit a SelectInst that has an -/// ICmpInst as its first operand. -/// +/// Visit a SelectInst that has an ICmpInst as its first operand. Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { bool Changed = false; @@ -595,10 +599,9 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } -/// CanSelectOperandBeMappingIntoPredBlock - SI is a select whose condition is a -/// PHI node (but the two may be in different blocks). See if the true/false -/// values (V) are live in all of the predecessor blocks of the PHI. For -/// example, cases like this cannot be mapped: +/// SI is a select whose condition is a PHI node (but the two may be in +/// different blocks). See if the true/false values (V) are live in all of the +/// predecessor blocks of the PHI. For example, cases like this can't be mapped: /// /// X = phi [ C1, BB1], [C2, BB2] /// Y = add @@ -632,7 +635,7 @@ static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V, return false; } -/// FoldSPFofSPF - We have an SPF (e.g. a min or max) of an SPF of the form: +/// We have an SPF (e.g. a min or max) of an SPF of the form: /// SPF2(SPF1(A, B), C) Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1, @@ -745,10 +748,10 @@ Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner, return nullptr; } -/// foldSelectICmpAnd - If one of the constants is zero (we know they can't -/// both be) and we have an icmp instruction with zero, and we have an 'and' -/// with the non-constant value and a power of two we can turn the select -/// into a shift on the result of the 'and'. +/// If one of the constants is zero (we know they can't both be) and we have an +/// icmp instruction with zero, and we have an 'and' with the non-constant value +/// and a power of two we can turn the select into a shift on the result of the +/// 'and'. static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal, ConstantInt *FalseVal, InstCombiner::BuilderTy *Builder) { @@ -926,6 +929,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + Builder->SetFastMathFlags(FCI->getFastMathFlags()); Value *NewCond = Builder->CreateFCmp(InvPred, TrueVal, FalseVal, FCI->getName() + ".inv"); @@ -967,6 +972,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // (X ugt Y) ? X : Y -> (X ole Y) ? X : Y if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + Builder->SetFastMathFlags(FCI->getFastMathFlags()); Value *NewCond = Builder->CreateFCmp(InvPred, FalseVal, TrueVal, FCI->getName() + ".inv"); @@ -1054,35 +1061,50 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } // See if we can fold the select into one of our operands. - if (SI.getType()->isIntOrIntVectorTy()) { + if (SI.getType()->isIntOrIntVectorTy() || SI.getType()->isFPOrFPVectorTy()) { if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal)) return FoldI; Value *LHS, *RHS, *LHS2, *RHS2; Instruction::CastOps CastOp; - SelectPatternFlavor SPF = matchSelectPattern(&SI, LHS, RHS, &CastOp); + SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp); + auto SPF = SPR.Flavor; - if (SPF) { + if (SelectPatternResult::isMinOrMax(SPF)) { // Canonicalize so that type casts are outside select patterns. if (LHS->getType()->getPrimitiveSizeInBits() != SI.getType()->getPrimitiveSizeInBits()) { - CmpInst::Predicate Pred = getICmpPredicateForMinMax(SPF); - Value *Cmp = Builder->CreateICmp(Pred, LHS, RHS); + CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF, SPR.Ordered); + + Value *Cmp; + if (CmpInst::isIntPredicate(Pred)) { + Cmp = Builder->CreateICmp(Pred, LHS, RHS); + } else { + IRBuilder<>::FastMathFlagGuard FMFG(*Builder); + auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); + Builder->SetFastMathFlags(FMF); + Cmp = Builder->CreateFCmp(Pred, LHS, RHS); + } + Value *NewSI = Builder->CreateCast(CastOp, Builder->CreateSelect(Cmp, LHS, RHS), SI.getType()); return ReplaceInstUsesWith(SI, NewSI); } + } + if (SPF) { // MAX(MAX(a, b), a) -> MAX(a, b) // MIN(MIN(a, b), a) -> MIN(a, b) // MAX(MIN(a, b), a) -> a // MIN(MAX(a, b), a) -> a - if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2)) + // ABS(ABS(a)) -> ABS(a) + // NABS(NABS(a)) -> NABS(a) + if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) return R; - if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2)) + if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor) if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2, SI, SPF, LHS)) return R; diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index d04ed58..0c7defa 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -55,7 +55,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return nullptr; } -/// CanEvaluateShifted - See if we can compute the specified value, but shifted +/// See if we can compute the specified value, but shifted /// logically to the left or right by some number of bits. This should return /// true if the expression can be computed for the same cost as the current /// expression tree. This is used to eliminate extraneous shifting from things @@ -184,7 +184,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, } } -/// GetShiftedValue - When CanEvaluateShifted returned true for an expression, +/// When CanEvaluateShifted returned true for an expression, /// this value inserts the new computation that produces the shifted value. static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift, InstCombiner &IC, const DataLayout &DL) { diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 80628b2..743d514 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -410,9 +410,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // If this is a select as part of a min/max pattern, don't simplify any // further in case we break the structure. Value *LHS, *RHS; - if (matchSelectPattern(I, LHS, RHS) != SPF_UNKNOWN) + if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN) return nullptr; - + if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero, RHSKnownOne, Depth + 1) || SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero, @@ -1057,7 +1057,13 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts); if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) { for (unsigned i = 0; i < VWidth; i++) { - if (CV->getAggregateElement(i)->isNullValue()) + Constant *CElt = CV->getAggregateElement(i); + // Method isNullValue always returns false when called on a + // ConstantExpr. If CElt is a ConstantExpr then skip it in order to + // to avoid propagating incorrect information. + if (isa<ConstantExpr>(CElt)) + continue; + if (CElt->isNullValue()) LeftDemanded.clearBit(i); else RightDemanded.clearBit(i); @@ -1082,6 +1088,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (!VTy) break; unsigned InVWidth = VTy->getNumElements(); APInt InputDemandedElts(InVWidth, 0); + UndefElts2 = APInt(InVWidth, 0); unsigned Ratio; if (VWidth == InVWidth) { @@ -1089,29 +1096,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // elements as are demanded of us. Ratio = 1; InputDemandedElts = DemandedElts; - } else if (VWidth > InVWidth) { - // Untested so far. - break; - - // If there are more elements in the result than there are in the source, - // then an input element is live if any of the corresponding output - // elements are live. - Ratio = VWidth/InVWidth; - for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an input element is live if any of the + // corresponding output elements are live. + Ratio = VWidth / InVWidth; + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) if (DemandedElts[OutIdx]) - InputDemandedElts.setBit(OutIdx/Ratio); - } - } else { - // Untested so far. - break; - - // If there are more elements in the source than there are in the result, - // then an input element is live if the corresponding output element is - // live. - Ratio = InVWidth/VWidth; + InputDemandedElts.setBit(OutIdx / Ratio); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an input element is live if the + // corresponding output element is live. + Ratio = InVWidth / VWidth; for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (DemandedElts[InIdx/Ratio]) + if (DemandedElts[InIdx / Ratio]) InputDemandedElts.setBit(InIdx); + } else { + // Unsupported so far. + break; } // div/rem demand all inputs, because they don't want divide by zero. @@ -1122,24 +1125,26 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, MadeChange = true; } - UndefElts = UndefElts2; - if (VWidth > InVWidth) { - llvm_unreachable("Unimp"); - // If there are more elements in the result than there are in the source, - // then an output element is undef if the corresponding input element is - // undef. + if (VWidth == InVWidth) { + UndefElts = UndefElts2; + } else if ((VWidth % InVWidth) == 0) { + // If the number of elements in the output is a multiple of the number of + // elements in the input then an output element is undef if the + // corresponding input element is undef. for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) - if (UndefElts2[OutIdx/Ratio]) + if (UndefElts2[OutIdx / Ratio]) + UndefElts.setBit(OutIdx); + } else if ((InVWidth % VWidth) == 0) { + // If the number of elements in the input is a multiple of the number of + // elements in the output then an output element is undef if all of the + // corresponding input elements are undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio); + if (SubUndef.countPopulation() == Ratio) UndefElts.setBit(OutIdx); - } else if (VWidth < InVWidth) { + } + } else { llvm_unreachable("Unimp"); - // If there are more elements in the source than there are in the result, - // then a result element is undef if all of the corresponding input - // elements are undef. - UndefElts = ~0ULL >> (64-VWidth); // Start out all undef. - for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) - if (!UndefElts2[InIdx]) // Not undef? - UndefElts.clearBit(InIdx/Ratio); // Clear undef bit. } break; } @@ -1237,6 +1242,15 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // like undef&0. The result is known zero, not undef. UndefElts &= UndefElts2; break; + + // SSE4A instructions leave the upper 64-bits of the 128-bit result + // in an undefined state. + case Intrinsic::x86_sse4a_extrq: + case Intrinsic::x86_sse4a_extrqi: + case Intrinsic::x86_sse4a_insertq: + case Intrinsic::x86_sse4a_insertqi: + UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2); + break; } break; } diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 2730472..e25639a 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -22,10 +22,10 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -/// CheapToScalarize - Return true if the value is cheaper to scalarize than it -/// is to leave as a vector operation. isConstant indicates whether we're -/// extracting one known element. If false we're extracting a variable index. -static bool CheapToScalarize(Value *V, bool isConstant) { +/// Return true if the value is cheaper to scalarize than it is to leave as a +/// vector operation. isConstant indicates whether we're extracting one known +/// element. If false we're extracting a variable index. +static bool cheapToScalarize(Value *V, bool isConstant) { if (Constant *C = dyn_cast<Constant>(V)) { if (isConstant) return true; @@ -50,13 +50,13 @@ static bool CheapToScalarize(Value *V, bool isConstant) { return true; if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) if (BO->hasOneUse() && - (CheapToScalarize(BO->getOperand(0), isConstant) || - CheapToScalarize(BO->getOperand(1), isConstant))) + (cheapToScalarize(BO->getOperand(0), isConstant) || + cheapToScalarize(BO->getOperand(1), isConstant))) return true; if (CmpInst *CI = dyn_cast<CmpInst>(I)) if (CI->hasOneUse() && - (CheapToScalarize(CI->getOperand(0), isConstant) || - CheapToScalarize(CI->getOperand(1), isConstant))) + (cheapToScalarize(CI->getOperand(0), isConstant) || + cheapToScalarize(CI->getOperand(1), isConstant))) return true; return false; @@ -82,7 +82,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // and that it is a binary operation which is cheap to scalarize. // otherwise return NULL. if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) || - !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) + !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true)) return nullptr; // Create a scalar PHI node that will replace the vector PHI node @@ -115,8 +115,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { Instruction *pos = dyn_cast<Instruction>(PHIInVal); BasicBlock::iterator InsertPos; if (pos && !isa<PHINode>(pos)) { - InsertPos = pos; - ++InsertPos; + InsertPos = ++pos->getIterator(); } else { InsertPos = inBB->getFirstInsertionPt(); } @@ -137,7 +136,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If vector val is constant with all elements the same, replace EI with // that element. We handle a known element # below. if (Constant *C = dyn_cast<Constant>(EI.getOperand(0))) - if (CheapToScalarize(C, false)) + if (cheapToScalarize(C, false)) return ReplaceInstUsesWith(EI, C->getAggregateElement(0U)); // If extracting a specified index from the vector, see if we can recursively @@ -163,7 +162,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { } } - // If the this extractelement is directly using a bitcast from a vector of + // If this extractelement is directly using a bitcast from a vector of // the same number of elements, see if we can find the source element from // it. In this case, we will end up needing to bitcast the scalars. if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { @@ -184,10 +183,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) { // Push extractelement into predecessor operation if legal and - // profitable to do so + // profitable to do so. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { if (I->hasOneUse() && - CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { + cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { Value *newEI0 = Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1), EI.getName()+".lhs"); @@ -230,8 +229,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { SrcIdx, false)); } } else if (CastInst *CI = dyn_cast<CastInst>(I)) { - // Canonicalize extractelement(cast) -> cast(extractelement) - // bitcasts can change the number of vector elements and they cost nothing + // Canonicalize extractelement(cast) -> cast(extractelement). + // Bitcasts can change the number of vector elements, and they cost + // nothing. if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { Value *EE = Builder->CreateExtractElement(CI->getOperand(0), EI.getIndexOperand()); @@ -245,7 +245,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // fight the vectorizer. // If we are extracting an element from a vector select or a select on - // vectors, a select on the scalars extracted from the vector arguments. + // vectors, create a select on the scalars extracted from the vector + // arguments. Value *TrueVal = SI->getTrueValue(); Value *FalseVal = SI->getFalseValue(); @@ -275,10 +276,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { return nullptr; } -/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns -/// elements from either LHS or RHS, return the shuffle mask and true. -/// Otherwise, return false. -static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, +/// If V is a shuffle of values that ONLY returns elements from either LHS or +/// RHS, return the shuffle mask and true. Otherwise, return false. +static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, SmallVectorImpl<Constant*> &Mask) { assert(LHS->getType() == RHS->getType() && "Invalid CollectSingleShuffleElements"); @@ -315,7 +315,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector. // We can handle this if the vector we are inserting into is // transitively ok. - if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted undef. Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext())); return true; @@ -330,7 +330,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { // We can handle this if the vector we are inserting into is // transitively ok. - if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { // If so, update the mask to reflect the inserted value. if (EI->getOperand(0) == LHS) { Mask[InsertedIdx % NumElts] = @@ -352,6 +352,48 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, return false; } +/// If we have insertion into a vector that is wider than the vector that we +/// are extracting from, try to widen the source vector to allow a single +/// shufflevector to replace one or more insert/extract pairs. +static void replaceExtractElements(InsertElementInst *InsElt, + ExtractElementInst *ExtElt, + InstCombiner &IC) { + VectorType *InsVecType = InsElt->getType(); + VectorType *ExtVecType = ExtElt->getVectorOperandType(); + unsigned NumInsElts = InsVecType->getVectorNumElements(); + unsigned NumExtElts = ExtVecType->getVectorNumElements(); + + // The inserted-to vector must be wider than the extracted-from vector. + if (InsVecType->getElementType() != ExtVecType->getElementType() || + NumExtElts >= NumInsElts) + return; + + // Create a shuffle mask to widen the extended-from vector using undefined + // values. The mask selects all of the values of the original vector followed + // by as many undefined values as needed to create a vector of the same length + // as the inserted-to vector. + SmallVector<Constant *, 16> ExtendMask; + IntegerType *IntType = Type::getInt32Ty(InsElt->getContext()); + for (unsigned i = 0; i < NumExtElts; ++i) + ExtendMask.push_back(ConstantInt::get(IntType, i)); + for (unsigned i = NumExtElts; i < NumInsElts; ++i) + ExtendMask.push_back(UndefValue::get(IntType)); + + Value *ExtVecOp = ExtElt->getVectorOperand(); + auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), + ConstantVector::get(ExtendMask)); + + // Replace all extracts from the original narrow vector with extracts from + // the new wide vector. + WideVec->insertBefore(ExtElt); + for (User *U : ExtVecOp->users()) { + if (ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U)) { + auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1)); + NewExt->insertAfter(WideVec); + IC.ReplaceInstUsesWith(*OldExt, NewExt); + } + } +} /// We are building a shuffle to create V, which is a sequence of insertelement, /// extractelement pairs. If PermittedRHS is set, then we must either use it or @@ -363,9 +405,10 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, /// often been chosen carefully to be efficiently implementable on the target. typedef std::pair<Value *, Value *> ShuffleOps; -static ShuffleOps CollectShuffleElements(Value *V, +static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<Constant *> &Mask, - Value *PermittedRHS) { + Value *PermittedRHS, + InstCombiner &IC) { assert(V->getType()->isVectorTy() && "Invalid shuffle!"); unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); @@ -396,10 +439,14 @@ static ShuffleOps CollectShuffleElements(Value *V, // otherwise we'd end up with a shuffle of three inputs. if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { Value *RHS = EI->getOperand(0); - ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS); + ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC); assert(LR.second == nullptr || LR.second == RHS); if (LR.first->getType() != RHS->getType()) { + // Although we are giving up for now, see if we can create extracts + // that match the inserts for another round of combining. + replaceExtractElements(IEI, EI, IC); + // We tried our best, but we can't find anything compatible with RHS // further up the chain. Return a trivial shuffle. for (unsigned i = 0; i < NumElts; ++i) @@ -429,14 +476,14 @@ static ShuffleOps CollectShuffleElements(Value *V, // If this insertelement is a chain that comes from exactly these two // vectors, return the vector and the effective shuffle. if (EI->getOperand(0)->getType() == PermittedRHS->getType() && - CollectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, + collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, Mask)) return std::make_pair(EI->getOperand(0), PermittedRHS); } } } - // Otherwise, can't do anything fancy. Return an identity vector. + // Otherwise, we can't do anything fancy. Return an identity vector. for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); return std::make_pair(V, nullptr); @@ -512,7 +559,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { // (and any insertelements it points to), into one big shuffle. if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) { SmallVector<Constant*, 16> Mask; - ShuffleOps LR = CollectShuffleElements(&IE, Mask, nullptr); + ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this); // The proposed shuffle may be trivial, in which case we shouldn't // perform the combine. @@ -588,8 +635,8 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::GetElementPtr: { - for (int i = 0, e = I->getNumOperands(); i != e; ++i) { - if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1)) + for (Value *Operand : I->operands()) { + if (!CanEvaluateShuffled(Operand, Mask, Depth-1)) return false; } return true; @@ -617,7 +664,7 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, /// Rebuild a new instruction just like 'I' but with the new operands given. /// In the event of type mismatch, the type of the operands is correct. -static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { +static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) { // We don't want to use the IRBuilder here because we want the replacement // instructions to appear next to 'I', not the builder's insertion point. switch (I->getOpcode()) { @@ -760,7 +807,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { NeedsRebuild |= (V != I->getOperand(i)); } if (NeedsRebuild) { - return BuildNew(I, NewOps); + return buildNew(I, NewOps); } return I; } @@ -792,7 +839,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { llvm_unreachable("failed to reorder elements of vector instruction!"); } -static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask, +static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask, bool &isLHSID, bool &isRHSID) { isLHSID = isRHSID = true; @@ -891,7 +938,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (VWidth == LHSWidth) { // Analyze the shuffle, are the LHS or RHS and identity shuffles? bool isLHSID, isRHSID; - RecognizeIdentityMask(Mask, isLHSID, isRHSID); + recognizeIdentityMask(Mask, isLHSID, isRHSID); // Eliminate identity shuffles. if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); @@ -1177,7 +1224,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // If the result mask is an identity, replace uses of this instruction with // corresponding argument. bool isLHSID, isRHSID; - RecognizeIdentityMask(newMask, isLHSID, isRHSID); + recognizeIdentityMask(newMask, isLHSID, isRHSID); if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS); if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS); diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fd34a24..7c46cfd 100644 --- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,8 +42,9 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/LibCallSemantics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -79,14 +80,12 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) { return llvm::EmitGEPOffset(Builder, DL, GEP); } -/// ShouldChangeType - Return true if it is desirable to convert a computation -/// from 'From' to 'To'. We don't want to convert from a legal to an illegal -/// type for example, or from a smaller to a larger illegal type. -bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { - assert(From->isIntegerTy() && To->isIntegerTy()); - - unsigned FromWidth = From->getPrimitiveSizeInBits(); - unsigned ToWidth = To->getPrimitiveSizeInBits(); +/// Return true if it is desirable to convert an integer computation from a +/// given bit width to a new bit width. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(unsigned FromWidth, + unsigned ToWidth) const { bool FromLegal = DL.isLegalInteger(FromWidth); bool ToLegal = DL.isLegalInteger(ToWidth); @@ -103,6 +102,17 @@ bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { return true; } +/// Return true if it is desirable to convert a computation from 'From' to 'To'. +/// We don't want to convert from a legal to an illegal type for example or from +/// a smaller to a larger illegal type. +bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { + assert(From->isIntegerTy() && To->isIntegerTy()); + + unsigned FromWidth = From->getPrimitiveSizeInBits(); + unsigned ToWidth = To->getPrimitiveSizeInBits(); + return ShouldChangeType(FromWidth, ToWidth); +} + // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does @@ -156,27 +166,26 @@ static void ClearSubclassDataAfterReassociation(BinaryOperator &I) { I.setFastMathFlags(FMF); } -/// SimplifyAssociativeOrCommutative - This performs a few simplifications for -/// operators which are associative or commutative: -// -// Commutative operators: -// -// 1. Order operands such that they are listed from right (least complex) to -// left (most complex). This puts constants before unary operators before -// binary operators. -// -// Associative operators: -// -// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. -// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. -// -// Associative and commutative operators: -// -// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. -// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. -// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" -// if C1 and C2 are constants. -// +/// This performs a few simplifications for operators that are associative or +/// commutative: +/// +/// Commutative operators: +/// +/// 1. Order operands such that they are listed from right (least complex) to +/// left (most complex). This puts constants before unary operators before +/// binary operators. +/// +/// Associative operators: +/// +/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies. +/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies. +/// +/// Associative and commutative operators: +/// +/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies. +/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies. +/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)" +/// if C1 and C2 are constants. bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { Instruction::BinaryOps Opcode = I.getOpcode(); bool Changed = false; @@ -322,7 +331,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { } while (1); } -/// LeftDistributesOverRight - Whether "X LOp (Y ROp Z)" is always equal to +/// Return whether "X LOp (Y ROp Z)" is always equal to /// "(X LOp Y) ROp (X LOp Z)". static bool LeftDistributesOverRight(Instruction::BinaryOps LOp, Instruction::BinaryOps ROp) { @@ -361,7 +370,7 @@ static bool LeftDistributesOverRight(Instruction::BinaryOps LOp, } } -/// RightDistributesOverLeft - Whether "(X LOp Y) ROp Z" is always equal to +/// Return whether "(X LOp Y) ROp Z" is always equal to /// "(X ROp Z) LOp (Y ROp Z)". static bool RightDistributesOverLeft(Instruction::BinaryOps LOp, Instruction::BinaryOps ROp) { @@ -519,7 +528,7 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, if (isa<OverflowingBinaryOperator>(Op1)) HasNSW &= Op1->hasNoSignedWrap(); - // We can propogate 'nsw' if we know that + // We can propagate 'nsw' if we know that // %Y = mul nsw i16 %X, C // %Z = add nsw i16 %Y, %X // => @@ -537,11 +546,11 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder, return SimplifiedInst; } -/// SimplifyUsingDistributiveLaws - This tries to simplify binary operations -/// which some other binary operation distributes over either by factorizing -/// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this -/// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is -/// a win). Returns the simplified value, or null if it didn't simplify. +/// This tries to simplify binary operations which some other binary operation +/// distributes over either by factorizing out common terms +/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in +/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win). +/// Returns the simplified value, or null if it didn't simplify. Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS); @@ -623,12 +632,38 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { } } + // (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0)) + // (op (select (a, b, c)), (select (a, b, d))) -> (select (a, 0, (op c, d))) + if (auto *SI0 = dyn_cast<SelectInst>(LHS)) { + if (auto *SI1 = dyn_cast<SelectInst>(RHS)) { + if (SI0->getCondition() == SI1->getCondition()) { + Value *SI = nullptr; + if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(), + SI1->getFalseValue(), DL, TLI, DT, AC)) + SI = Builder->CreateSelect(SI0->getCondition(), + Builder->CreateBinOp(TopLevelOpcode, + SI0->getTrueValue(), + SI1->getTrueValue()), + V); + if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(), + SI1->getTrueValue(), DL, TLI, DT, AC)) + SI = Builder->CreateSelect( + SI0->getCondition(), V, + Builder->CreateBinOp(TopLevelOpcode, SI0->getFalseValue(), + SI1->getFalseValue())); + if (SI) { + SI->takeName(&I); + return SI; + } + } + } + } + return nullptr; } -// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction -// if the LHS is a constant zero (which is the 'negate' form). -// +/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a +/// constant zero (which is the 'negate' form). Value *InstCombiner::dyn_castNegVal(Value *V) const { if (BinaryOperator::isNeg(V)) return BinaryOperator::getNegArgument(V); @@ -644,10 +679,8 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { return nullptr; } -// dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the -// instruction if the LHS is a constant negative zero (which is the 'negate' -// form). -// +/// Given a 'fsub' instruction, return the RHS of the instruction if the LHS is +/// a constant negative zero (which is the 'negate' form). Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const { if (BinaryOperator::isFNeg(V, IgnoreZeroSign)) return BinaryOperator::getFNegArgument(V); @@ -700,10 +733,10 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, llvm_unreachable("Unknown binary instruction type!"); } -// FoldOpIntoSelect - Given an instruction with a select as one operand and a -// constant as the other operand, try to fold the binary operator into the -// select arguments. This also works for Cast instructions, which obviously do -// not have a second operand. +/// Given an instruction with a select as one operand and a constant as the +/// other operand, try to fold the binary operator into the select arguments. +/// This also works for Cast instructions, which obviously do not have a second +/// operand. Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { // Don't modify shared select instructions if (!SI->hasOneUse()) return nullptr; @@ -752,10 +785,9 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { return nullptr; } -/// FoldOpIntoPhi - Given a binary operator, cast instruction, or select which -/// has a PHI node as operand #0, see if we can fold the instruction into the -/// PHI (which is only possible if all operands to the PHI are constants). -/// +/// Given a binary operator, cast instruction, or select which has a PHI node as +/// operand #0, see if we can fold the instruction into the PHI (which is only +/// possible if all operands to the PHI are constants). Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *PN = cast<PHINode>(I.getOperand(0)); unsigned NumPHIValues = PN->getNumIncomingValues(); @@ -819,7 +851,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { NewPN->takeName(PN); // If we are going to have to insert a new computation, do so right before the - // predecessors terminator. + // predecessor's terminator. if (NonConstBB) Builder->SetInsertPoint(NonConstBB->getTerminator()); @@ -893,10 +925,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { return ReplaceInstUsesWith(I, NewPN); } -/// FindElementAtOffset - Given a pointer type and a constant offset, determine -/// whether or not there is a sequence of GEP indices into the pointed type that -/// will land us at the specified offset. If so, fill them into NewIndices and -/// return the resultant element type, otherwise return null. +/// Given a pointer type and a constant offset, determine whether or not there +/// is a sequence of GEP indices into the pointed type that will land us at the +/// specified offset. If so, fill them into NewIndices and return the resultant +/// element type, otherwise return null. Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset, SmallVectorImpl<Value *> &NewIndices) { Type *Ty = PtrTy->getElementType(); @@ -965,8 +997,8 @@ static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) { return true; } -/// Descale - Return a value X such that Val = X * Scale, or null if none. If -/// the multiplication is known not to overflow then NoSignedWrap is set. +/// Return a value X such that Val = X * Scale, or null if none. +/// If the multiplication is known not to overflow, then NoSignedWrap is set. Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!"); assert(cast<IntegerType>(Val->getType())->getBitWidth() == @@ -1008,11 +1040,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { // 0'th operand of Val. std::pair<Instruction*, unsigned> Parent; - // RequireNoSignedWrap - Set if the transform requires a descaling at deeper - // levels that doesn't overflow. + // Set if the transform requires a descaling at deeper levels that doesn't + // overflow. bool RequireNoSignedWrap = false; - // logScale - log base 2 of the scale. Negative if not a power of 2. + // Log base 2 of the scale. Negative if not a power of 2. int32_t logScale = Scale.exactLogBase2(); for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down @@ -1213,16 +1245,11 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) { /// specified one but with other operands. static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS, InstCombiner::BuilderTy *B) { - Value *BORes = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); - if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BORes)) { - if (isa<OverflowingBinaryOperator>(NewBO)) { - NewBO->setHasNoSignedWrap(Inst.hasNoSignedWrap()); - NewBO->setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap()); - } - if (isa<PossiblyExactOperator>(NewBO)) - NewBO->setIsExact(Inst.isExact()); - } - return BORes; + Value *BO = B->CreateBinOp(Inst.getOpcode(), LHS, RHS); + // If LHS and RHS are constant, BO won't be a binary operator. + if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO)) + NewBO->copyIRFlags(&Inst); + return BO; } /// \brief Makes transformation of binary operation specific for vector types. @@ -1256,9 +1283,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { LShuf->getMask() == RShuf->getMask()) { Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0), RShuf->getOperand(0), Builder); - Value *Res = Builder->CreateShuffleVector(NewBO, + return Builder->CreateShuffleVector(NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask()); - return Res; } } @@ -1294,18 +1320,11 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) { } if (MayChange) { Constant *C2 = ConstantVector::get(C2M); - Value *NewLHS, *NewRHS; - if (isa<Constant>(LHS)) { - NewLHS = C2; - NewRHS = Shuffle->getOperand(0); - } else { - NewLHS = Shuffle->getOperand(0); - NewRHS = C2; - } + Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0); + Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2; Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder); - Value *Res = Builder->CreateShuffleVector(NewBO, + return Builder->CreateShuffleVector(NewBO, UndefValue::get(Inst.getType()), Shuffle->getMask()); - return Res; } } @@ -1323,7 +1342,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Eliminate unneeded casts for indices, and replace indices which displace // by multiples of a zero size type with zero. bool MadeChange = false; - Type *IntPtrTy = DL.getIntPtrType(GEP.getPointerOperandType()); + Type *IntPtrTy = + DL.getIntPtrType(GEP.getPointerOperandType()->getScalarType()); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E; @@ -1333,21 +1353,25 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (!SeqTy) continue; + // Index type should have the same width as IntPtr + Type *IndexTy = (*I)->getType(); + Type *NewIndexType = IndexTy->isVectorTy() ? + VectorType::get(IntPtrTy, IndexTy->getVectorNumElements()) : IntPtrTy; + // If the element type has zero size then any index over it is equivalent // to an index of zero, so replace it with zero if it is not zero already. if (SeqTy->getElementType()->isSized() && DL.getTypeAllocSize(SeqTy->getElementType()) == 0) if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { - *I = Constant::getNullValue(IntPtrTy); + *I = Constant::getNullValue(NewIndexType); MadeChange = true; } - Type *IndexTy = (*I)->getType(); - if (IndexTy != IntPtrTy) { + if (IndexTy != NewIndexType) { // If we are using a wider index than needed for this platform, shrink // it to what we need. If narrower, sign-extend it to what we need. // This explicit cast can make subsequent optimizations more obvious. - *I = Builder->CreateIntCast(*I, IntPtrTy, true); + *I = Builder->CreateIntCast(*I, NewIndexType, true); MadeChange = true; } } @@ -1421,8 +1445,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } } - GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); + // If not all GEPs are identical we'll have to create a new PHI node. + // Check that the old PHI node has only one use so that it will get + // removed. + if (DI != -1 && !PN->hasOneUse()) + return nullptr; + GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(Op1->clone()); if (DI == -1) { // All the GEPs feeding the PHI are identical. Clone one down into our // BB so that it can be merged with the current GEP. @@ -1432,11 +1461,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // All the GEPs feeding the PHI differ at a single offset. Clone a GEP // into the current block so it can be merged, and create a new PHI to // set that index. - Instruction *InsertPt = Builder->GetInsertPoint(); - Builder->SetInsertPoint(PN); - PHINode *NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), - PN->getNumOperands()); - Builder->SetInsertPoint(InsertPt); + PHINode *NewPN; + { + IRBuilderBase::InsertPointGuard Guard(*Builder); + Builder->SetInsertPoint(PN); + NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(), + PN->getNumOperands()); + } for (auto &I : PN->operands()) NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI), @@ -1790,7 +1821,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { I->takeName(BCI); - BCI->getParent()->getInstList().insert(BCI, I); + BCI->getParent()->getInstList().insert(BCI->getIterator(), I); ReplaceInstUsesWith(*BCI, I); } return &GEP; @@ -1931,7 +1962,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) { // Replace invoke with a NOP intrinsic to maintain the original CFG - Module *M = II->getParent()->getParent()->getParent(); + Module *M = II->getModule(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), None, "", II->getParent()); @@ -2280,9 +2311,10 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { } if (LoadInst *L = dyn_cast<LoadInst>(Agg)) // If the (non-volatile) load only has one use, we can rewrite this to a - // load from a GEP. This reduces the size of the load. - // FIXME: If a load is used only by extractvalue instructions then this - // could be done regardless of having multiple uses. + // load from a GEP. This reduces the size of the load. If a load is used + // only by extractvalue instructions then this either must have been + // optimized before, or it is a struct with padding, in which case we + // don't want to do the transformation as it loses padding knowledge. if (L->isSimple() && L->hasOneUse()) { // extractvalue has integer indices, getelementptr has Value*s. Convert. SmallVector<Value*, 4> Indices; @@ -2294,7 +2326,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // We need to insert these at the location of the old load, not at that of // the extractvalue. - Builder->SetInsertPoint(L->getParent(), L); + Builder->SetInsertPoint(L); Value *GEP = Builder->CreateInBoundsGEP(L->getType(), L->getPointerOperand(), Indices); // Returning the load directly will cause the main loop to insert it in @@ -2312,7 +2344,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { return nullptr; } -/// isCatchAll - Return 'true' if the given typeinfo will match anything. +/// Return 'true' if the given typeinfo will match anything. static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { switch (Personality) { case EHPersonality::GNU_C: @@ -2330,6 +2362,7 @@ static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) { case EHPersonality::MSVC_X86SEH: case EHPersonality::MSVC_Win64SEH: case EHPersonality::MSVC_CXX: + case EHPersonality::CoreCLR: return TypeInfo->isNullValue(); } llvm_unreachable("invalid enum"); @@ -2441,10 +2474,24 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { SawCatchAll = true; break; } - if (AlreadyCaught.count(TypeInfo)) - // Already caught by an earlier clause, so having it in the filter - // is pointless. - continue; + + // Even if we've seen a type in a catch clause, we don't want to + // remove it from the filter. An unexpected type handler may be + // set up for a call site which throws an exception of the same + // type caught. In order for the exception thrown by the unexpected + // handler to propogate correctly, the filter must be correctly + // described for the call site. + // + // Example: + // + // void unexpected() { throw 1;} + // void foo() throw (int) { + // std::set_unexpected(unexpected); + // try { + // throw 2.0; + // } catch (int i) {} + // } + // There is no point in having multiple copies of the same typeinfo in // a filter, so only add it if we didn't already. if (SeenInFilter.insert(TypeInfo).second) @@ -2637,15 +2684,15 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return nullptr; } -/// TryToSinkInstruction - Try to move the specified instruction from its -/// current block into the beginning of DestBlock, which can only happen if it's -/// safe to move the instruction past all of the instructions between it and the -/// end of its block. +/// Try to move the specified instruction from its current block into the +/// beginning of DestBlock, which can only happen if it's safe to move the +/// instruction past all of the instructions between it and the end of its +/// block. static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { assert(I->hasOneUse() && "Invariants didn't hold!"); // Cannot move control-flow-involving, volatile loads, vaarg, etc. - if (isa<PHINode>(I) || isa<LandingPadInst>(I) || I->mayHaveSideEffects() || + if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() || isa<TerminatorInst>(I)) return false; @@ -2654,17 +2701,24 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { &DestBlock->getParent()->getEntryBlock()) return false; + // Do not sink convergent call instructions. + if (auto *CI = dyn_cast<CallInst>(I)) { + if (CI->isConvergent()) + return false; + } + // We can only sink load instructions if there is nothing between the load and // the end of block that could change the value. if (I->mayReadFromMemory()) { - for (BasicBlock::iterator Scan = I, E = I->getParent()->end(); + for (BasicBlock::iterator Scan = I->getIterator(), + E = I->getParent()->end(); Scan != E; ++Scan) if (Scan->mayWriteToMemory()) return false; } BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt(); - I->moveBefore(InsertPos); + I->moveBefore(&*InsertPos); ++NumSunkInst; return true; } @@ -2698,6 +2752,27 @@ bool InstCombiner::run() { } } + // In general, it is possible for computeKnownBits to determine all bits in a + // value even when the operands are not all constants. + if (!I->use_empty() && I->getType()->isIntegerTy()) { + unsigned BitWidth = I->getType()->getScalarSizeInBits(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + computeKnownBits(I, KnownZero, KnownOne, /*Depth*/0, I); + if ((KnownZero | KnownOne).isAllOnesValue()) { + Constant *C = ConstantInt::get(I->getContext(), KnownOne); + DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C << + " from: " << *I << '\n'); + + // Add operands to the worklist. + ReplaceInstUsesWith(*I, C); + ++NumConstProp; + EraseInstFromFunction(*I); + MadeIRChange = true; + continue; + } + } + // See if we can trivially sink this instruction to a successor basic block. if (I->hasOneUse()) { BasicBlock *BB = I->getParent(); @@ -2738,7 +2813,7 @@ bool InstCombiner::run() { } // Now that we have an instruction, try combining it to simplify it. - Builder->SetInsertPoint(I->getParent(), I); + Builder->SetInsertPoint(I); Builder->SetCurrentDebugLocation(I->getDebugLoc()); #ifndef NDEBUG @@ -2768,7 +2843,7 @@ bool InstCombiner::run() { // Insert the new instruction into the basic block... BasicBlock *InstParent = I->getParent(); - BasicBlock::iterator InsertPos = I; + BasicBlock::iterator InsertPos = I->getIterator(); // If we replace a PHI with something that isn't a PHI, fix up the // insertion point. @@ -2801,8 +2876,8 @@ bool InstCombiner::run() { return MadeIRChange; } -/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding -/// all reachable code to the worklist. +/// Walk the function in depth-first order, adding all reachable code to the +/// worklist. /// /// This has a couple of tricks to make the code faster and more powerful. In /// particular, we constant fold and DCE instructions as we go, to avoid adding @@ -2829,7 +2904,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, continue; for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { - Instruction *Inst = BBI++; + Instruction *Inst = &*BBI++; // DCE instruction if trivially dead. if (isInstructionTriviallyDead(Inst, TLI)) { @@ -2900,8 +2975,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, } } - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - Worklist.push_back(TI->getSuccessor(i)); + for (BasicBlock *SuccBB : TI->successors()) + Worklist.push_back(SuccBB); } while (!Worklist.empty()); // Once we've found all of the instructions to add to instcombine's worklist, @@ -2909,8 +2984,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, // of the function down. This jives well with the way that it adds all uses // of instructions to the worklist after doing a transformation, thus avoiding // some N^2 behavior in pathological cases. - ICWorklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], - InstrsForInstCombineWorklist.size()); + ICWorklist.AddInitialGroup(InstrsForInstCombineWorklist); return MadeIRChange; } @@ -2930,13 +3004,13 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, // track of which blocks we visit. SmallPtrSet<BasicBlock *, 64> Visited; MadeIRChange |= - AddReachableCodeToWorklist(F.begin(), DL, Visited, ICWorklist, TLI); + AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI); // Do a quick scan over the function. If we find any blocks that are // unreachable, remove any instructions inside of them. This prevents // the instcombine code from having to deal with some bad special cases. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (Visited.count(BB)) + if (Visited.count(&*BB)) continue; // Delete the instructions backwards, as it has a reduced likelihood of @@ -2944,11 +3018,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. while (EndInst != BB->begin()) { // Delete the next to last instruction. - BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; - if (!Inst->use_empty()) + Instruction *Inst = &*--EndInst->getIterator(); + if (!Inst->use_empty() && !Inst->getType()->isTokenTy()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { + if (Inst->isEHPad()) { EndInst = Inst; continue; } @@ -2956,7 +3029,8 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL, ++NumDeadInst; MadeIRChange = true; } - Inst->eraseFromParent(); + if (!Inst->getType()->isTokenTy()) + Inst->eraseFromParent(); } } @@ -2968,8 +3042,6 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, LoopInfo *LI = nullptr) { - // Minimizing size? - bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize); auto &DL = F.getParent()->getDataLayout(); /// Builder - This is an IRBuilder that automatically inserts new @@ -2992,7 +3064,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, if (prepareICWorklistFromFunction(F, DL, &TLI, Worklist)) Changed = true; - InstCombiner IC(Worklist, &Builder, MinimizeSize, + InstCombiner IC(Worklist, &Builder, F.optForMinSize(), AA, &AC, &TLI, &DT, DL, LI); if (IC.run()) Changed = true; @@ -3046,11 +3118,12 @@ public: void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } bool InstructionCombiningPass::runOnFunction(Function &F) { @@ -3058,7 +3131,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { return false; // Required analyses. - auto AA = &getAnalysis<AliasAnalysis>(); + auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -3076,7 +3149,8 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) |