Diffstat (limited to 'contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h')
-rw-r--r-- | contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h | 328
1 file changed, 248 insertions, 80 deletions
diff --git a/contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 7efdbcc..6331070 100644
--- a/contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/contrib/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -17,11 +17,11 @@
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"

namespace llvm {

@@ -42,24 +42,6 @@ private:
  typedef TargetTransformInfoImplCRTPBase<T> BaseT;
  typedef TargetTransformInfo TTI;

-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
-    assert(Ty->isVectorTy() && "Can only scalarize vectors");
-    unsigned Cost = 0;
-
-    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-      if (Insert)
-        Cost += static_cast<T *>(this)
-                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-      if (Extract)
-        Cost += static_cast<T *>(this)
-                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-    }
-
-    return Cost;
-  }
-
  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(Type *Ty) {
@@ -111,6 +93,13 @@ public:

  bool isSourceOfDivergence(const Value *V) { return false; }

+  bool isAlwaysUniform(const Value *V) { return false; }
+
+  unsigned getFlatAddressSpace() {
+    // Return an invalid address space.
+    return -1;
+  }
+
  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }
@@ -130,6 +119,10 @@ public:
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace);
  }

+  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
+    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
+  }
+
  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
@@ -162,6 +155,18 @@ public:
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

+  int getExtCost(const Instruction *I, const Value *Src) {
+    if (getTLI()->isExtFree(I))
+      return TargetTransformInfo::TCC_Free;
+
+    if (isa<ZExtInst>(I) || isa<SExtInst>(I))
+      if (const LoadInst *LI = dyn_cast<LoadInst>(Src))
+        if (getTLI()->isExtLoad(LI, I, DL))
+          return TargetTransformInfo::TCC_Free;
+
+    return TargetTransformInfo::TCC_Basic;
+  }
+
  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
                            ArrayRef<const Value *> Arguments) {
    return BaseT::getIntrinsicCost(IID, RetTy, Arguments);
@@ -184,6 +189,62 @@ public:
    return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
  }

+  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
+                                            unsigned &JumpTableSize) {
+    /// Try to find the estimated number of clusters. Note that the number of
+    /// clusters identified in this function could be different from the actual
+    /// numbers found in lowering. This function ignores switches that are
+    /// lowered with a mix of jump table / bit test / BTree. It was initially
+    /// intended for estimating the cost of a switch in the inline cost
+    /// heuristic, but it's a generic cost model to be used in other places
+    /// (e.g., in loop unrolling).
+    unsigned N = SI.getNumCases();
+    const TargetLoweringBase *TLI = getTLI();
+    const DataLayout &DL = this->getDataLayout();
+
+    JumpTableSize = 0;
+    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
+
+    // Early exit if neither a jump table nor a bit test is allowed.
+    if (N < 1 || (!IsJTAllowed && DL.getPointerSizeInBits() < N))
+      return N;
+
+    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
+    APInt MinCaseVal = MaxCaseVal;
+    for (auto CI : SI.cases()) {
+      const APInt &CaseVal = CI.getCaseValue()->getValue();
+      if (CaseVal.sgt(MaxCaseVal))
+        MaxCaseVal = CaseVal;
+      if (CaseVal.slt(MinCaseVal))
+        MinCaseVal = CaseVal;
+    }
+
+    // Check if suitable for a bit test
+    if (N <= DL.getPointerSizeInBits()) {
+      SmallPtrSet<const BasicBlock *, 4> Dests;
+      for (auto I : SI.cases())
+        Dests.insert(I.getCaseSuccessor());
+
+      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
+                                     DL))
+        return 1;
+    }
+
+    // Check if suitable for a jump table.
+    if (IsJTAllowed) {
+      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
+        return N;
+      uint64_t Range =
+          (MaxCaseVal - MinCaseVal).getLimitedValue(UINT64_MAX - 1) + 1;
+      // Check whether a range of clusters is dense enough for a jump table
+      if (TLI->isSuitableForJumpTable(&SI, N, Range)) {
+        JumpTableSize = Range;
+        return 1;
+      }
+    }
+    return N;
+  }
+
  unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }

  unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }
@@ -228,7 +289,8 @@ public:

  unsigned getInliningThresholdMultiplier() { return 1; }

-  void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
+  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:
@@ -299,7 +361,68 @@ public:

  unsigned getNumberOfRegisters(bool Vector) { return Vector ? 0 : 1; }

-  unsigned getRegisterBitWidth(bool Vector) { return 32; }
+  unsigned getRegisterBitWidth(bool Vector) const { return 32; }
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
+    assert(Ty->isVectorTy() && "Can only scalarize vectors");
+    unsigned Cost = 0;
+
+    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+      if (Insert)
+        Cost += static_cast<T *>(this)
+                    ->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+      if (Extract)
+        Cost += static_cast<T *>(this)
+                    ->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+    }
+
+    return Cost;
+  }
+
+  /// Estimate the overhead of scalarizing an instruction's unique
+  /// non-constant operands. The types of the arguments are ordinarily
+  /// scalar, in which case the costs are multiplied with VF.
+  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                            unsigned VF) {
+    unsigned Cost = 0;
+    SmallPtrSet<const Value*, 4> UniqueOperands;
+    for (const Value *A : Args) {
+      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
+        Type *VecTy = nullptr;
+        if (A->getType()->isVectorTy()) {
+          VecTy = A->getType();
+          // If A is a vector operand, VF should be 1 or correspond to A.
+          assert ((VF == 1 || VF == VecTy->getVectorNumElements()) &&
+                  "Vector argument does not match VF");
+        }
+        else
+          VecTy = VectorType::get(A->getType(), VF);
+
+        Cost += getScalarizationOverhead(VecTy, false, true);
+      }
+    }
+
+    return Cost;
+  }
+
+  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
+    assert (VecTy->isVectorTy());
+
+    unsigned Cost = 0;
+
+    Cost += getScalarizationOverhead(VecTy, true, false);
+    if (!Args.empty())
+      Cost += getOperandsScalarizationOverhead(Args,
+                                               VecTy->getVectorNumElements());
+    else
+      // When no information on arguments is provided, we add the cost
+      // associated with one argument as a heuristic.
+      Cost += getScalarizationOverhead(VecTy, false, true);
+
+    return Cost;
+  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
@@ -317,7 +440,7 @@ public:
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

-    bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
+    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);
@@ -341,10 +464,9 @@ public:
      unsigned Num = Ty->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)
                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
-      // return the cost of multiple scalar invocation plus the cost of
-      // inserting
-      // and extracting the values.
-      return getScalarizationOverhead(Ty, true, true) + Num * Cost;
+      // Return the cost of multiple scalar invocation plus the cost of
+      // inserting and extracting the values.
+      return getScalarizationOverhead(Ty, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
@@ -360,7 +482,8 @@ public:
    return 1;
  }

-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                            const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
@@ -389,6 +512,18 @@ public:
                                  Dst->getPointerAddressSpace()))
      return 0;

+    // If this is a zext/sext of a load, return 0 if the corresponding
+    // extending load exists on target.
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+        I && isa<LoadInst>(I->getOperand(0))) {
+      EVT ExtVT = EVT::getEVT(Dst);
+      EVT LoadVT = EVT::getEVT(Src);
+      unsigned LType =
+          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
+      if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
+        return 0;
+    }
+
    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
@@ -446,14 +581,14 @@ public:
                                          Src->getVectorNumElements() / 2);
      T *TTI = static_cast<T *>(this);
      return TTI->getVectorSplitCost() +
-             (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc));
+             (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
    }

    // In other cases where the source or destination are illegal, assume
    // the operation will get scalarized.
    unsigned Num = Dst->getVectorNumElements();
    unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
-        Opcode, Dst->getScalarType(), Src->getScalarType());
+        Opcode, Dst->getScalarType(), Src->getScalarType(), I);

    // Return the cost of multiple scalar invocation plus the cost of
    // inserting and extracting the values.
@@ -487,7 +622,8 @@ public:
    return 0;
  }

-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                              const Instruction *I) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
@@ -515,7 +651,7 @@ public:
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
-          Opcode, ValTy->getScalarType(), CondTy);
+          Opcode, ValTy->getScalarType(), CondTy, I);

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
@@ -534,7 +670,7 @@ public:
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) {
+                           unsigned AddressSpace, const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

@@ -680,18 +816,42 @@ public:
    return Cost;
  }

-  /// Get intrinsic cost based on arguments
+  /// Get intrinsic cost based on arguments.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Value *> Args, FastMathFlags FMF) {
+                                 ArrayRef<Value *> Args, FastMathFlags FMF,
+                                 unsigned VF = 1) {
+    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
+    assert ((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+
    switch (IID) {
    default: {
+      // Assume that we need to scalarize this intrinsic.
      SmallVector<Type *, 4> Types;
-      for (Value *Op : Args)
-        Types.push_back(Op->getType());
-      return static_cast<T *>(this)->getIntrinsicInstrCost(IID, RetTy, Types,
-                                                            FMF);
+      for (Value *Op : Args) {
+        Type *OpTy = Op->getType();
+        assert (VF == 1 || !OpTy->isVectorTy());
+        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
+      }
+
+      if (VF > 1 && !RetTy->isVoidTy())
+        RetTy = VectorType::get(RetTy, VF);
+
+      // Compute the scalarization overhead based on Args for a vector
+      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+      // CostModel will pass a vector RetTy and VF is 1.
+      unsigned ScalarizationCost = UINT_MAX;
+      if (RetVF > 1 || VF > 1) {
+        ScalarizationCost = 0;
+        if (!RetTy->isVoidTy())
+          ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
+        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+      }
+
+      return static_cast<T *>(this)->
+        getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
    }
    case Intrinsic::masked_scatter: {
+      assert (VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
@@ -702,6 +862,7 @@ public:
                                                  Alignment);
    }
    case Intrinsic::masked_gather: {
+      assert (VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
@@ -713,19 +874,23 @@ public:
    }
  }

-  /// Get intrinsic cost based on argument types
+  /// Get intrinsic cost based on argument types.
+  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+  /// arguments and the return value will be computed based on types.
  unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed = UINT_MAX) {
    SmallVector<unsigned, 2> ISDs;
    unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
-      unsigned ScalarizationCost = 0;
+      unsigned ScalarizationCost = ScalarizationCostPassed;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (RetTy->isVectorTy()) {
-        ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+        if (ScalarizationCostPassed == UINT_MAX)
+          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
@@ -733,7 +898,8 @@ public:
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy()) {
-          ScalarizationCost += getScalarizationOverhead(Ty, false, true);
+          if (ScalarizationCostPassed == UINT_MAX)
+            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
          ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
          Ty = Ty->getScalarType();
        }
@@ -881,7 +1047,8 @@ public:
    // this will emit a costly libcall, adding call overhead and spills. Make it
    // very expensive.
    if (RetTy->isVectorTy()) {
-      unsigned ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+      unsigned ScalarizationCost = ((ScalarizationCostPassed != UINT_MAX) ?
+         ScalarizationCostPassed : getScalarizationOverhead(RetTy, true, false));
      unsigned ScalarCalls = RetTy->getVectorNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
@@ -894,7 +1061,8 @@ public:
        IID, RetTy->getScalarType(), ScalarTys, FMF);
    for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
      if (Tys[i]->isVectorTy()) {
-        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+        if (ScalarizationCostPassed == UINT_MAX)
+          ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
        ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
      }
    }
@@ -931,46 +1099,46 @@ public:
    return 0;
  }

+  /// Try to calculate arithmetic and shuffle op costs for reduction operations.
+  /// We're assuming that reduction operations are performed in the following way:
+  /// 1. Non-pairwise reduction
+  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
+  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
+  ///           \----------------v-------------/  \----------v------------/
+  ///                     n/2 elements               n/2 elements
+  /// %red1 = op <n x t> %val, <n x t> val1
+  /// After this operation we have a vector %red1 where only the first n/2
+  /// elements are meaningful, the second n/2 elements are undefined and can be
+  /// dropped. All other operations are actually working with the vector of
+  /// length n/2, not n, though the real vector length is still n.
+  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
+  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
+  ///           \----------------v-------------/  \----------v------------/
+  ///                     n/4 elements              3*n/4 elements
+  /// %red2 = op <n x t> %red1, <n x t> val2  - working with the vector of
+  /// length n/2, the resulting vector has length n/4 etc.
+  /// 2. Pairwise reduction:
+  /// Everything is the same except for an additional shuffle operation which
+  /// is used to produce operands for pairwise kind of reductions.
+  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
+  /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
+  ///           \-------------v----------/  \----------v------------/
+  ///                   n/2 elements               n/2 elements
+  /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
+  /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
+  ///           \-------------v----------/  \----------v------------/
+  ///                   n/2 elements               n/2 elements
+  /// %red1 = op <n x t> %val1, <n x t> val2
+  /// Again, the operation is performed on <n x t> vector, but the resulting
+  /// vector %red1 is <n/2 x t> vector.
+  ///
+  /// The cost model should take into account that the actual length of the
+  /// vector is reduced on each iteration.
  unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
    assert(Ty->isVectorTy() && "Expect a vector type");
    Type *ScalarTy = Ty->getVectorElementType();
    unsigned NumVecElts = Ty->getVectorNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
-    // Try to calculate arithmetic and shuffle op costs for reduction operations.
-    // We're assuming that reduction operation are performing the following way:
-    // 1. Non-pairwise reduction
-    // %val1 = shufflevector<n x t> %val, <n x t> %undef,
-    // <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
-    //           \----------------v-------------/  \----------v------------/
-    //                     n/2 elements               n/2 elements
-    // %red1 = op <n x t> %val, <n x t> val1
-    // After this operation we have a vector %red1 with only maningfull the
-    // first n/2 elements, the second n/2 elements are undefined and can be
-    // dropped. All other operations are actually working with the vector of
-    // length n/2, not n. though the real vector length is still n.
-    // %val2 = shufflevector<n x t> %red1, <n x t> %undef,
-    // <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
-    //           \----------------v-------------/  \----------v------------/
-    //                     n/4 elements              3*n/4 elements
-    // %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
-    // length n/2, the resulting vector has length n/4 etc.
-    // 2. Pairwise reduction:
-    // Everything is the same except for an additional shuffle operation which
-    // is used to produce operands for pairwise kind of reductions.
-    // %val1 = shufflevector<n x t> %val, <n x t> %undef,
-    // <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
-    //           \-------------v----------/  \----------v------------/
-    //                   n/2 elements               n/2 elements
-    // %val2 = shufflevector<n x t> %val, <n x t> %undef,
-    // <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
-    //           \-------------v----------/  \----------v------------/
-    //                   n/2 elements               n/2 elements
-    // %red1 = op <n x t> %val1, <n x t> val2
-    // Again, the operation is performed on <n x t> vector, but the resulting
-    // vector %red1 is <n/2 x t> vector.
-    //
-    // The cost model should take into account that the actual length of the
-    // vector is reduced on each iteration.
    unsigned ArithCost = 0;
    unsigned ShuffleCost = 0;
    auto *ConcreteTTI = static_cast<T *>(this);
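
For illustration only (not part of the patch): the non-pairwise scheme described in the new getReductionCost() comment, instantiated for a hypothetical <4 x i32> integer add reduction. With NumVecElts = 4 there are Log2_32(4) = 2 reduction levels, and each level contributes one shuffle and one arithmetic op to the estimated cost. The function name and the choice of add as the reduction op are assumptions made for the sketch.

define i32 @reduce_add_v4i32(<4 x i32> %val) {
  ; Level 1: move the upper half of %val down and add it to the lower half.
  %shuf1 = shufflevector <4 x i32> %val, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %red1 = add <4 x i32> %val, %shuf1
  ; Level 2: only the first two lanes of %red1 are meaningful at this point.
  %shuf2 = shufflevector <4 x i32> %red1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %red2 = add <4 x i32> %red1, %shuf2
  ; The scalar result lives in lane 0.
  %res = extractelement <4 x i32> %red2, i32 0
  ret i32 %res
}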
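
Likewise, a sketch of the pairwise variant (item 2 of the same comment) for the same hypothetical <4 x i32> add reduction; each level now needs two shuffles to split the even and odd lanes, which is the extra shuffle that the pairwise cost accounts for.

define i32 @reduce_add_v4i32_pairwise(<4 x i32> %val) {
  ; Level 1: split even/odd lanes and add them pairwise.
  %val1 = shufflevector <4 x i32> %val, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %val2 = shufflevector <4 x i32> %val, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %red1 = add <4 x i32> %val1, %val2
  ; Level 2: repeat on the two meaningful lanes of %red1.
  %val3 = shufflevector <4 x i32> %red1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %val4 = shufflevector <4 x i32> %red1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %red2 = add <4 x i32> %val3, %val4
  %res = extractelement <4 x i32> %red2, i32 0
  ret i32 %res
}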