Diffstat (limited to 'contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp')
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 266
1 file changed, 245 insertions(+), 21 deletions(-)
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8833e5..a76f080 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -9,8 +9,8 @@
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
@@ -20,6 +20,23 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64tti"
+static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
+ cl::init(true), cl::Hidden);
+
+bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ // Inline a callee if its target-features are a subset of the caller's
+ // target-features.
+ return (CallerBits & CalleeBits) == CalleeBits;
+}
+
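The subset check on the return line above is compact enough to misread, so here is a minimal standalone sketch of the same test, using std::bitset in place of LLVM's FeatureBitset (whose & and == operators behave the same way) and hypothetical feature indices:

    #include <bitset>
    #include <cassert>

    int main() {
      std::bitset<4> Caller("1110"); // caller compiled with features 1, 2, 3
      std::bitset<4> Sub("0110");    // callee needs features 1, 2: a subset
      std::bitset<4> Extra("1001");  // callee also needs feature 0
      assert((Caller & Sub) == Sub);     // ANDing a subset returns it intact
      assert((Caller & Extra) != Extra); // a feature the caller lacks breaks it
      return 0;
    }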
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
@@ -176,10 +193,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
+ ArrayRef<const Value *> Args) {
+
+ // A helper that returns a vector type from the given type. The number of
+ // elements in DstTy determines the vector width.
+ auto toVectorTy = [&](Type *ArgTy) {
+ return VectorType::get(ArgTy->getScalarType(),
+ DstTy->getVectorNumElements());
+ };
+
+ // Exit early if DstTy is not a vector type whose elements are at least
+ // 16 bits wide.
+ if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
+ return false;
+
+ // Determine if the operation has a widening variant. We consider both the
+ // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
+ // instructions.
+ //
+ // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+ // verify that their extending operands are eliminated during code
+ // generation.
+ switch (Opcode) {
+ case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
+ case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ break;
+ default:
+ return false;
+ }
+
+ // To be a widening instruction (either the "wide" or "long" version), the
+ // second operand must be a sign- or zero-extend with a single user. We only
+ // consider extends with a single user because they may otherwise not be
+ // eliminated.
+ if (Args.size() != 2 ||
+ (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
+ !Args[1]->hasOneUse())
+ return false;
+ auto *Extend = cast<CastInst>(Args[1]);
+
+ // Legalize the destination type and ensure it can be used in a widening
+ // operation.
+ auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
+ unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
+ if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
+ return false;
+
+ // Legalize the source type and ensure it can be used in a widening
+ // operation.
+ Type *SrcTy = toVectorTy(Extend->getSrcTy());
+ auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
+ unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
+ if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
+ return false;
+
+ // Get the total number of vector elements in the legalized types.
+ unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
+ unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
+
+ // Return true if the legalized types have the same number of vector elements
+ // and the destination element size is twice that of the source element size.
+ return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
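To make the final predicate concrete, here is a standalone sketch (not LLVM code; the LegalizedVec struct and function name are invented for illustration) of the check that the two legalized types have equal lane counts and a 2x element-size ratio:

    // Lane count and lane width of a type after legalization, which may
    // split a wide vector into several register-sized parts.
    struct LegalizedVec { unsigned NumParts, LanesPerPart, LaneBits; };

    static bool pairsAsWidening(LegalizedVec Dst, LegalizedVec Src) {
      unsigned DstLanes = Dst.NumParts * Dst.LanesPerPart;
      unsigned SrcLanes = Src.NumParts * Src.LanesPerPart;
      return DstLanes == SrcLanes && 2 * Src.LaneBits == Dst.LaneBits;
    }

For example, a v8i16 destination fed by an extend of v8i8 gives pairsAsWidening({1, 8, 16}, {1, 8, 8}) == true, which is exactly the shape that uaddl/usubl handle in one instruction.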
+
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // If the cast is observable, and it is used by a widening instruction (e.g.,
+ // uaddl, saddw, etc.), it may be free.
+ if (I && I->hasOneUse()) {
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
+ SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
+ if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
+ // If the cast is the second operand, it is free. We will generate either
+ // a "wide" or "long" version of the widening instruction.
+ if (I == SingleUser->getOperand(1))
+ return 0;
+ // If the cast is not the second operand, it will be free if it looks the
+ // same as the second operand. In this case, we will generate a "long"
+ // version of the widening instruction.
+ if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
+ if (I->getOpcode() == Cast->getOpcode() &&
+ cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
+ return 0;
+ }
+ }
+
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
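The two zero-cost cases above correspond to the two instruction forms; a small sketch of how the cast's position selects between them (the two booleans are hypothetical simplifications of the operand checks in the code above):

    enum class WideningForm { None, Wide, Long };

    // IsSecondOperand: the queried cast is the user's operand 1.
    // MatchesSecond: it is the same kind of extend, from the same source
    // type, as the user's operand 1.
    WideningForm freeCastForm(bool IsSecondOperand, bool MatchesSecond) {
      if (IsSecondOperand) return WideningForm::Wide; // uaddw v0.8h, v1.8h, v2.8b
      if (MatchesSecond)   return WideningForm::Long; // uaddl v0.8h, v1.8b, v2.8b
      return WideningForm::None;                      // cast must be paid for
    }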
@@ -378,6 +480,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
+ // add in the widening overhead specified by the sub-target. Since the
+ // extends feeding widening instructions are performed automatically, they
+ // aren't present in the generated code and have a zero cost. By adding a
+ // widening overhead here, we attach the total cost of the combined operation
+ // to the widening instruction.
+ int Cost = 0;
+ if (isWideningInstruction(Ty, Opcode, Args))
+ Cost += ST->getWideningBaseCost();
+
int ISD = TLI->InstructionOpcodeToISD(Opcode);
if (ISD == ISD::SDIV &&
@@ -387,9 +499,9 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the previous
// operation; conservatively assume OP_None.
- int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
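For reference, the ADD + CMP + SELECT + SRA sequence costed here is the usual round-toward-zero expansion of a signed divide by a power of two; a scalar C++ sketch of the equivalent computation (assuming arithmetic right shift on signed ints, as on AArch64):

    // Signed X / 2^K without a divide (0 < K < 31): bias negative inputs so
    // the shift rounds toward zero rather than toward negative infinity.
    int sdivPow2(int X, unsigned K) {
      int Bias = (1 << K) - 1;      // ADD operand
      bool Neg = X < 0;             // CMP
      int Adj = Neg ? X + Bias : X; // SELECT
      return Adj >> K;              // SRA
    }

e.g. sdivPow2(-5, 1) biases -5 to -4 and shifts to get -2, matching C's -5 / 2.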
@@ -404,8 +516,8 @@ int AArch64TTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -413,7 +525,7 @@ int AArch64TTIImpl::getArithmeticInstrCost(
case ISD::AND:
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
- return 1 * LT.first;
+ return (Cost + 1) * LT.first;
}
}
@@ -436,7 +548,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+ Type *CondTy, const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
@@ -463,11 +575,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return Entry->Cost;
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
- unsigned Alignment, unsigned AddressSpace) {
+ unsigned Alignment, unsigned AddressSpace,
+ const Instruction *I) {
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
@@ -505,12 +618,14 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
- Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+ auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
- if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
- return Factor;
+ // Accesses having vector types that are a multiple of 128 bits can be
+ // matched to more than one ldN/stN instruction.
+ if (NumElts % Factor == 0 &&
+ TLI->isLegalInterleavedAccessType(SubVecTy, DL))
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
}
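The new accounting scales the cost by how many ldN/stN instructions the sub-vector actually needs. A sketch of the assumed semantics of getNumInterleavedAccesses (the helper name below is invented; a legal sub-vector is either 64 bits or a multiple of 128 bits):

    // One ldN/stN per 128-bit chunk; a single 64-bit sub-vector also takes
    // one instruction. Assumed to mirror the TLI helper's behavior.
    unsigned numInterleavedAccesses(unsigned SubVecBits) {
      return (SubVecBits + 127) / 128;
    }

So a <32 x i8> load with Factor == 2 has 128-bit sub-vectors and still costs 2, while 256-bit sub-vectors would need two ld2 instructions each and cost 4.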
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -533,10 +648,62 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return ST->getMaxInterleaveFactor();
}
-void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
+// For Falkor, we want to avoid having too many strided loads in a loop since
+// that can exhaust the HW prefetcher resources. We adjust the unroller
+// MaxCount preference below to attempt to ensure unrolling doesn't create too
+// many strided loads.
+static void
+getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ enum { MaxStridedLoads = 7 };
+ auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
+ int StridedLoads = 0;
+ // FIXME? We could make this more precise by looking at the CFG and
+ // e.g. not counting loads in each side of an if-then-else diamond.
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ LoadInst *LMemI = dyn_cast<LoadInst>(&I);
+ if (!LMemI)
+ continue;
+
+ Value *PtrValue = LMemI->getPointerOperand();
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE.getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+ continue;
+
+ // FIXME? We could take pairing of unrolled load copies into account
+ // by looking at the AddRec, but we would probably have to limit this
+ // to loops with no stores or other memory optimization barriers.
+ ++StridedLoads;
+ // We've seen enough strided loads that seeing more won't make a
+ // difference.
+ if (StridedLoads > MaxStridedLoads / 2)
+ return StridedLoads;
+ }
+ }
+ return StridedLoads;
+ };
+
+ int StridedLoads = countStridedLoads(L, SE);
+ DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
+ << " strided loads\n");
+ // Pick the largest power of 2 unroll count that won't result in too many
+ // strided loads.
+ if (StridedLoads) {
+ UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
+ DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
+ << '\n');
+ }
+}
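The MaxCount formula picks the largest power of two that keeps the unrolled strided-load count at or below MaxStridedLoads; spelling out the arithmetic with a stand-in for llvm::Log2_32 (which returns the floor of log2):

    #include <cassert>

    static unsigned floorLog2(unsigned V) { // stand-in for llvm::Log2_32
      unsigned L = 0;
      while (V >>= 1) ++L;
      return L;
    }

    int main() {
      const unsigned MaxStridedLoads = 7;
      assert((1u << floorLog2(MaxStridedLoads / 1)) == 4); // 1 load:  4 * 1 <= 7
      assert((1u << floorLog2(MaxStridedLoads / 2)) == 2); // 2 loads: 2 * 2 <= 7
      assert((1u << floorLog2(MaxStridedLoads / 3)) == 2); // 3 loads: 2 * 3 <= 7
      assert((1u << floorLog2(MaxStridedLoads / 4)) == 1); // 4 loads: no unroll
      return 0;
    }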
+
+void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Enable partial unrolling and runtime unrolling.
- BaseT::getUnrollingPreferences(L, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP);
// For inner loop, it is more likely to be a hot one, and the runtime check
// can be promoted out from LICM pass, so the overhead is less, let's try
@@ -546,6 +713,10 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
+
+ if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
+ EnableFalkorHWPFUnrollFix)
+ getFalkorUnrollingPreferences(L, SE, UP);
}
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
@@ -594,8 +765,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_ld4:
Info.ReadMem = true;
Info.WriteMem = false;
- Info.IsSimple = true;
- Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(0);
break;
case Intrinsic::aarch64_neon_st2:
@@ -603,8 +772,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
- Info.IsSimple = true;
- Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
break;
}
@@ -628,6 +795,38 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
return true;
}
+/// See if \p I should be considered for address type promotion. We check if
+/// \p I is a sext with the right type that is used in memory accesses. If it is
+/// used in a "complex" getelementptr, we allow it to be promoted without
+/// finding other sext instructions that sign-extended the same initial value.
+/// A getelementptr is considered "complex" if it has more than 2 operands.
+bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
+ const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
+ bool Considerable = false;
+ AllowPromotionWithoutCommonHeader = false;
+ if (!isa<SExtInst>(&I))
+ return false;
+ Type *ConsideredSExtType =
+ Type::getInt64Ty(I.getParent()->getParent()->getContext());
+ if (I.getType() != ConsideredSExtType)
+ return false;
+ // See if the sext is the one with the right type and used in at least one
+ // GetElementPtrInst.
+ for (const User *U : I.users()) {
+ if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
+ Considerable = true;
+ // A getelementptr is considered "complex" if it has more than 2
+ // operands. We will promote a SExt used in such a complex GEP, as we
+ // expect some of the computation to be merged if it is done on 64 bits.
+ if (GEPInst->getNumOperands() > 2) {
+ AllowPromotionWithoutCommonHeader = true;
+ break;
+ }
+ }
+ }
+ return Considerable;
+}
+
unsigned AArch64TTIImpl::getCacheLineSize() {
return ST->getCacheLineSize();
}
@@ -643,3 +842,28 @@ unsigned AArch64TTIImpl::getMinPrefetchStride() {
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
return ST->getMaxPrefetchIterationsAhead();
}
+
+bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ switch (Opcode) {
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Mul:
+ return false;
+ case Instruction::Add:
+ return ScalarBits * Ty->getVectorNumElements() >= 128;
+ case Instruction::ICmp:
+ return (ScalarBits < 64) &&
+ (ScalarBits * Ty->getVectorNumElements() >= 128);
+ case Instruction::FCmp:
+ return Flags.NoNaN;
+ default:
+ llvm_unreachable("Unhandled reduction opcode");
+ }
+ return false;
+}
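The Add and ICmp thresholds both ask whether the whole source vector fills at least one 128-bit NEON register; a compact restatement of the Add case with example values (function name invented for illustration):

    // Sketch: prefer the reduction intrinsic for integer adds only when the
    // source vector is at least q-register sized.
    bool useAddReduction(unsigned ScalarBits, unsigned NumElts) {
      return ScalarBits * NumElts >= 128;
    }
    // useAddReduction(32, 4) -> true  (<4 x i32>: 128 bits, e.g. one addv)
    // useAddReduction(16, 4) -> false (<4 x i16>: 64 bits, the shuffle-based
    //                                  lowering is already cheap)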