Diffstat (limited to 'contrib/llvm/lib/Target/AArch64')
82 files changed, 10118 insertions, 4265 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h index fd106a8..1dda746 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64.h @@ -22,12 +22,16 @@ namespace llvm { +class AArch64RegisterBankInfo; +class AArch64Subtarget; class AArch64TargetMachine; class FunctionPass; +class InstructionSelector; class MachineFunctionPass; FunctionPass *createAArch64DeadRegisterDefinitions(); FunctionPass *createAArch64RedundantCopyEliminationPass(); +FunctionPass *createAArch64CondBrTuning(); FunctionPass *createAArch64ConditionalCompares(); FunctionPass *createAArch64AdvSIMDScalar(); FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, @@ -38,19 +42,23 @@ FunctionPass *createAArch64LoadStoreOptimizationPass(); FunctionPass *createAArch64VectorByElementOptPass(); ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); -FunctionPass *createAArch64AddressTypePromotionPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); +FunctionPass *createFalkorHWPFFixPass(); +FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); +InstructionSelector * +createAArch64InstructionSelector(const AArch64TargetMachine &, + AArch64Subtarget &, AArch64RegisterBankInfo &); void initializeAArch64A53Fix835769Pass(PassRegistry&); void initializeAArch64A57FPLoadBalancingPass(PassRegistry&); -void initializeAArch64AddressTypePromotionPass(PassRegistry&); void initializeAArch64AdvSIMDScalarPass(PassRegistry&); void initializeAArch64CollectLOHPass(PassRegistry&); +void initializeAArch64CondBrTuningPass(PassRegistry &); void initializeAArch64ConditionalComparesPass(PassRegistry&); void initializeAArch64ConditionOptimizerPass(PassRegistry&); void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); @@ -60,6 +68,8 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); +void initializeFalkorHWPFFixPass(PassRegistry&); +void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td index 91c335f..436bf11 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64.td @@ -27,7 +27,7 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", - "Enable cryptographic instructions">; + "Enable cryptographic instructions", [FeatureNEON]>; def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; @@ -38,6 +38,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true", "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">; +def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true", + "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">; + def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", "Enable ARMv8 PMUv3 Performance Monitors extension">; @@ -47,6 +50,9 @@ def 
FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension">; +def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", + "Enable Scalable Vector Extension (SVE) instructions">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -100,6 +106,14 @@ def FeatureArithmeticCbzFusion : SubtargetFeature< "arith-cbz-fusion", "HasArithmeticCbzFusion", "true", "CPU fuses arithmetic + cbz/cbnz operations">; +def FeatureFuseAES : SubtargetFeature< + "fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; + +def FeatureFuseLiterals : SubtargetFeature< + "fuse-literals", "HasFuseLiterals", "true", + "CPU fuses literal generation operations">; + def FeatureDisableLatencySchedHeuristic : SubtargetFeature< "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", "Disable latency scheduling heuristic">; @@ -108,12 +122,22 @@ def FeatureUseRSqrt : SubtargetFeature< "use-reciprocal-square-root", "UseRSqrt", "true", "Use the reciprocal square root approximation">; +def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", + "NegativeImmediates", "false", + "Convert immediates and instructions " + "to their negated or complemented " + "equivalent when the immediate does " + "not fit in the encoding.">; + +def FeatureLSLFast : SubtargetFeature< + "lsl-fast", "HasLSLFast", "true", + "CPU has a fastpath logical shift of up to 3 places">; //===----------------------------------------------------------------------===// // Architectures. // def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", - "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE]>; + "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>; def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>; @@ -123,6 +147,7 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", //===----------------------------------------------------------------------===// include "AArch64RegisterInfo.td" +include "AArch64RegisterBanks.td" include "AArch64CallingConvention.td" //===----------------------------------------------------------------------===// @@ -149,7 +174,8 @@ include "AArch64SchedCyclone.td" include "AArch64SchedFalkor.td" include "AArch64SchedKryo.td" include "AArch64SchedM1.td" -include "AArch64SchedVulcan.td" +include "AArch64SchedThunderX.td" +include "AArch64SchedThunderX2T99.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors", [ @@ -167,6 +193,7 @@ def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, @@ -180,6 +207,8 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseLiterals, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, @@ -191,6 +220,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", FeatureCRC, FeatureCrypto, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon ]>; @@ -200,6 +230,7 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", FeatureCRC, FeatureCrypto, FeatureFPARMv8, + 
FeatureFuseAES, FeatureNEON, FeaturePerfMon ]>; @@ -226,6 +257,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, @@ -240,6 +272,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, @@ -256,7 +289,8 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", FeaturePerfMon, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing + FeatureZCZeroing, + FeatureLSLFast ]>; def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", @@ -269,33 +303,80 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", FeaturePerfMon, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing + FeatureRDM, + FeatureZCZeroing, + FeatureLSLFast ]>; -def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", - "Broadcom Vulcan processors", [ - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, - FeatureArithmeticBccFusion, - FeatureNEON, - FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive, - HasV8_1aOps]>; +def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", + "ThunderX2T99", + "Cavium ThunderX2 processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureArithmeticBccFusion, + FeatureNEON, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureLSE, + HasV8_1aOps]>; + +def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX", + "Cavium ThunderX processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureNEON]>; + +def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily", + "ThunderXT88", + "Cavium ThunderX processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureNEON]>; + +def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily", + "ThunderXT81", + "Cavium ThunderX processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureNEON]>; + +def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily", + "ThunderXT83", + "Cavium ThunderX processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureNEON]>; def : ProcessorModel<"generic", NoSchedModel, [ - FeatureCRC, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler ]>; -// FIXME: Cortex-A35 is currently modelled as a Cortex-A53 +// FIXME: Cortex-A35 is currently modeled as a Cortex-A53. def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; -// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57. +// FIXME: Cortex-A72 and Cortex-A73 are currently modeled as a Cortex-A57. 
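Note on the AArch64.td changes above: each new SubtargetFeature (FeatureRDM, FeatureSVE, FeatureFuseAES, FeatureLSLFast, ...) populates a boolean field on AArch64Subtarget named by its second template argument (e.g. "HasRDM"). A minimal sketch of how backend C++ typically tests such a feature; the hasFuseAES() accessor name follows the usual AArch64Subtarget convention and is an assumption here, not something shown in this diff:

    // Sketch only: querying a TableGen-declared subtarget feature from C++.
    // hasFuseAES() is assumed to be the conventional accessor for HasFuseAES.
    #include "AArch64Subtarget.h"
    #include "llvm/CodeGen/MachineFunction.h"
    using namespace llvm;

    static bool cpuFusesAES(const MachineFunction &MF) {
      const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
      // True when FeatureFuseAES is in the CPU's feature list (added to
      // several processor models in this patch).
      return ST.hasFuseAES();
    }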
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; @@ -304,7 +385,13 @@ def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>; def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>; def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>; def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; -def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>; +// Cavium ThunderX/ThunderX T8X Processors +def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>; +def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>; +def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>; +def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>; +// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan. +def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 0aa597b..db1fbe0 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -493,43 +493,30 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB) { - RegScavenger RS; - RS.enterBasicBlock(MBB); - RS.forward(MachineBasicBlock::iterator(G->getStart())); - // Can we find an appropriate register that is available throughout the life - // of the chain? - unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; - BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); - for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) { - RS.forward(I); - AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID)); - - // Remove any registers clobbered by a regmask or any def register that is - // immediately dead. - for (auto J : I->operands()) { - if (J.isRegMask()) - AvailableRegs.clearBitsNotInMask(J.getRegMask()); - - if (J.isReg() && J.isDef()) { - MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true); - if (J.isDead()) - for (; AI.isValid(); ++AI) - AvailableRegs.reset(*AI); -#ifndef NDEBUG - else - for (; AI.isValid(); ++AI) - assert(!AvailableRegs[*AI] && - "Non-dead def should have been removed by now!"); -#endif - } - } + // of the chain? Simulate liveness backwards until the end of the chain. + LiveRegUnits Units(*TRI); + Units.addLiveOuts(MBB); + MachineBasicBlock::iterator I = MBB.end(); + MachineBasicBlock::iterator ChainEnd = G->end(); + while (I != ChainEnd) { + --I; + Units.stepBackward(*I); } + // Check which register units are alive throughout the chain. + MachineBasicBlock::iterator ChainBegin = G->begin(); + assert(ChainBegin != ChainEnd && "Chain should contain instructions"); + do { + --I; + Units.accumulate(*I); + } while (I != ChainBegin); + // Make sure we allocate in-order, to get the cheapest registers first. 
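The rewritten scavengeRegister above drops RegScavenger in favor of LiveRegUnits: step backward from the block's live-outs to the end of the chain, then accumulate every use/def inside the chain, and a register is usable only if all of its units stay free. A minimal stand-alone sketch of that backward-walk idiom; the helper name isRegFreeOverRange and its parameters are illustrative, not part of the patch:

    // Sketch only: is physical register Reg unused over the half-open
    // instruction range [B, E) of MBB?
    #include "llvm/CodeGen/LiveRegUnits.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/Target/TargetRegisterInfo.h"
    using namespace llvm;

    static bool isRegFreeOverRange(unsigned Reg, MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator B,
                                   MachineBasicBlock::iterator E,
                                   const TargetRegisterInfo &TRI) {
      LiveRegUnits Units(TRI);
      Units.addLiveOuts(MBB);            // start from the block's live-outs
      MachineBasicBlock::iterator I = MBB.end();
      while (I != E) {                   // rewind liveness to the end of the range
        --I;
        Units.stepBackward(*I);
      }
      while (I != B) {                   // fold in uses/defs inside the range
        --I;
        Units.accumulate(*I);
      }
      return Units.available(Reg);       // true if no unit of Reg is live or used
    }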
+ unsigned RegClassID = ChainBegin->getDesc().OpInfo[0].RegClass; auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID)); for (auto Reg : Ord) { - if (!AvailableRegs[Reg]) + if (!Units.available(Reg)) continue; if (C == getColor(Reg)) return Reg; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp deleted file mode 100644 index 0cbb2db..0000000 --- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ /dev/null @@ -1,484 +0,0 @@ -//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to promote the computations use to obtained a sign extended -// value used into memory accesses. -// E.g. -// a = add nsw i32 b, 3 -// d = sext i32 a to i64 -// e = getelementptr ..., i64 d -// -// => -// f = sext i32 b to i64 -// a = add nsw i64 f, 3 -// e = getelementptr ..., i64 a -// -// This is legal to do if the computations are marked with either nsw or nuw -// markers. Moreover, the current heuristic is simple: it does not create new -// sext operations, i.e., it gives up when a sext would have forked (e.g., if a -// = add i32 b, c, two sexts are required to promote the computation). -// -// FIXME: This pass may be useful for other targets too. -// ===---------------------------------------------------------------------===// - -#include "AArch64.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "aarch64-type-promotion" - -static cl::opt<bool> -EnableMerge("aarch64-type-promotion-merge", cl::Hidden, - cl::desc("Enable merging of redundant sexts when one is dominating" - " the other."), - cl::init(true)); - -#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion" - -//===----------------------------------------------------------------------===// -// AArch64AddressTypePromotion -//===----------------------------------------------------------------------===// - -namespace { -class AArch64AddressTypePromotion : public FunctionPass { - -public: - static char ID; - AArch64AddressTypePromotion() - : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) { - initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { return AARCH64_TYPE_PROMO_NAME; } - - /// Iterate over the functions and promote the computation of interesting - // sext instructions. - bool runOnFunction(Function &F) override; - -private: - /// The current function. - Function *Func; - /// Filter out all sexts that does not have this type. - /// Currently initialized with Int64Ty. - Type *ConsideredSExtType; - - // This transformation requires dominator info. 
- void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - FunctionPass::getAnalysisUsage(AU); - } - - typedef SmallPtrSet<Instruction *, 32> SetOfInstructions; - typedef SmallVector<Instruction *, 16> Instructions; - typedef DenseMap<Value *, Instructions> ValueToInsts; - - /// Check if it is profitable to move a sext through this instruction. - /// Currently, we consider it is profitable if: - /// - Inst is used only once (no need to insert truncate). - /// - Inst has only one operand that will require a sext operation (we do - /// do not create new sext operation). - bool shouldGetThrough(const Instruction *Inst); - - /// Check if it is possible and legal to move a sext through this - /// instruction. - /// Current heuristic considers that we can get through: - /// - Arithmetic operation marked with the nsw or nuw flag. - /// - Other sext operation. - /// - Truncate operation if it was just dropping sign extended bits. - bool canGetThrough(const Instruction *Inst); - - /// Move sext operations through safe to sext instructions. - bool propagateSignExtension(Instructions &SExtInsts); - - /// Is this sext should be considered for code motion. - /// We look for sext with ConsideredSExtType and uses in at least one - // GetElementPtrInst. - bool shouldConsiderSExt(const Instruction *SExt) const; - - /// Collect all interesting sext operations, i.e., the ones with the right - /// type and used in memory accesses. - /// More precisely, a sext instruction is considered as interesting if it - /// is used in a "complex" getelementptr or it exits at least another - /// sext instruction that sign extended the same initial value. - /// A getelementptr is considered as "complex" if it has more than 2 - // operands. - void analyzeSExtension(Instructions &SExtInsts); - - /// Merge redundant sign extension operations in common dominator. - void mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove); -}; -} // end anonymous namespace. - -char AArch64AddressTypePromotion::ID = 0; - -INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", - AARCH64_TYPE_PROMO_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", - AARCH64_TYPE_PROMO_NAME, false, false) - -FunctionPass *llvm::createAArch64AddressTypePromotionPass() { - return new AArch64AddressTypePromotion(); -} - -bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) { - if (isa<SExtInst>(Inst)) - return true; - - const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); - if (BinOp && isa<OverflowingBinaryOperator>(BinOp) && - (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap())) - return true; - - // sext(trunc(sext)) --> sext - if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) { - const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0)); - // Check that the truncate just drop sign extended bits. - if (Inst->getType()->getIntegerBitWidth() >= - Opnd->getOperand(0)->getType()->getIntegerBitWidth() && - Inst->getOperand(0)->getType()->getIntegerBitWidth() <= - ConsideredSExtType->getIntegerBitWidth()) - return true; - } - - return false; -} - -bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { - // If the type of the sext is the same as the considered one, this sext - // will become useless. 
- // Otherwise, we will have to do something to preserve the original value, - // unless it is used once. - if (isa<SExtInst>(Inst) && - (Inst->getType() == ConsideredSExtType || Inst->hasOneUse())) - return true; - - // If the Inst is used more that once, we may need to insert truncate - // operations and we don't do that at the moment. - if (!Inst->hasOneUse()) - return false; - - // This truncate is used only once, thus if we can get thourgh, it will become - // useless. - if (isa<TruncInst>(Inst)) - return true; - - // If both operands are not constant, a new sext will be created here. - // Current heuristic is: each step should be profitable. - // Therefore we don't allow to increase the number of sext even if it may - // be profitable later on. - if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1))) - return true; - - return false; -} - -static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { - return !(isa<SelectInst>(Inst) && OpIdx == 0); -} - -bool -AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { - if (SExt->getType() != ConsideredSExtType) - return false; - - for (const User *U : SExt->users()) { - if (isa<GetElementPtrInst>(U)) - return true; - } - - return false; -} - -// Input: -// - SExtInsts contains all the sext instructions that are used directly in -// GetElementPtrInst, i.e., access to memory. -// Algorithm: -// - For each sext operation in SExtInsts: -// Let var be the operand of sext. -// while it is profitable (see shouldGetThrough), legal, and safe -// (see canGetThrough) to move sext through var's definition: -// * promote the type of var's definition. -// * fold var into sext uses. -// * move sext above var's definition. -// * update sext operand to use the operand of var that should be sign -// extended (by construction there is only one). -// -// E.g., -// a = ... i32 c, 3 -// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a' -// ... -// = b -// => Yes, update the code -// b = sext i32 c to i64 -// a = ... i64 b, 3 -// ... -// = a -// Iterate on 'c'. -bool -AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { - DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); - - bool LocalChange = false; - SetOfInstructions ToRemove; - ValueToInsts ValToSExtendedUses; - while (!SExtInsts.empty()) { - // Get through simple chain. - Instruction *SExt = SExtInsts.pop_back_val(); - - DEBUG(dbgs() << "Consider:\n" << *SExt << '\n'); - - // If this SExt has already been merged continue. - if (SExt->use_empty() && ToRemove.count(SExt)) { - DEBUG(dbgs() << "No uses => marked as delete\n"); - continue; - } - - // Now try to get through the chain of definitions. - while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) { - DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); - if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { - // We cannot get through something that is not an Instruction - // or not safe to SExt. - DEBUG(dbgs() << "Cannot get through\n"); - break; - } - - LocalChange = true; - // If this is a sign extend, it becomes useless. - if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) { - DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n"); - // We cannot use replaceAllUsesWith here because we may trigger some - // assertion on the type as all involved sext operation may have not - // been moved yet. 
- while (!Inst->use_empty()) { - Use &U = *Inst->use_begin(); - Instruction *User = dyn_cast<Instruction>(U.getUser()); - assert(User && "User of sext is not an Instruction!"); - User->setOperand(U.getOperandNo(), SExt); - } - ToRemove.insert(Inst); - SExt->setOperand(0, Inst->getOperand(0)); - SExt->moveBefore(Inst); - continue; - } - - // Get through the Instruction: - // 1. Update its type. - // 2. Replace the uses of SExt by Inst. - // 3. Sign extend each operand that needs to be sign extended. - - // Step #1. - Inst->mutateType(SExt->getType()); - // Step #2. - SExt->replaceAllUsesWith(Inst); - // Step #3. - Instruction *SExtForOpnd = SExt; - - DEBUG(dbgs() << "Propagate SExt to operands\n"); - for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx; - ++OpIdx) { - DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n'); - if (Inst->getOperand(OpIdx)->getType() == SExt->getType() || - !shouldSExtOperand(Inst, OpIdx)) { - DEBUG(dbgs() << "No need to propagate\n"); - continue; - } - // Check if we can statically sign extend the operand. - Value *Opnd = Inst->getOperand(OpIdx); - if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) { - DEBUG(dbgs() << "Statically sign extend\n"); - Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(), - Cst->getSExtValue())); - continue; - } - // UndefValue are typed, so we have to statically sign extend them. - if (isa<UndefValue>(Opnd)) { - DEBUG(dbgs() << "Statically sign extend\n"); - Inst->setOperand(OpIdx, UndefValue::get(SExt->getType())); - continue; - } - - // Otherwise we have to explicity sign extend it. - assert(SExtForOpnd && - "Only one operand should have been sign extended"); - - SExtForOpnd->setOperand(0, Opnd); - - DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n"); - // Move the sign extension before the insertion point. - SExtForOpnd->moveBefore(Inst); - Inst->setOperand(OpIdx, SExtForOpnd); - // If more sext are required, new instructions will have to be created. - SExtForOpnd = nullptr; - } - if (SExtForOpnd == SExt) { - DEBUG(dbgs() << "Sign extension is useless now\n"); - ToRemove.insert(SExt); - break; - } - } - - // If the use is already of the right type, connect its uses to its argument - // and delete it. - // This can happen for an Instruction all uses of which are sign extended. - if (!ToRemove.count(SExt) && - SExt->getType() == SExt->getOperand(0)->getType()) { - DEBUG(dbgs() << "Sign extension is useless, attach its use to " - "its argument\n"); - SExt->replaceAllUsesWith(SExt->getOperand(0)); - ToRemove.insert(SExt); - } else - ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt); - } - - if (EnableMerge) - mergeSExts(ValToSExtendedUses, ToRemove); - - // Remove all instructions marked as ToRemove. 
- for (Instruction *I: ToRemove) - I->eraseFromParent(); - return LocalChange; -} - -void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove) { - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - for (auto &Entry : ValToSExtendedUses) { - Instructions &Insts = Entry.second; - Instructions CurPts; - for (Instruction *Inst : Insts) { - if (ToRemove.count(Inst)) - continue; - bool inserted = false; - for (auto &Pt : CurPts) { - if (DT.dominates(Inst, Pt)) { - DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" - << *Inst << '\n'); - Pt->replaceAllUsesWith(Inst); - ToRemove.insert(Pt); - Pt = Inst; - inserted = true; - break; - } - if (!DT.dominates(Pt, Inst)) - // Give up if we need to merge in a common dominator as the - // expermients show it is not profitable. - continue; - - DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n" - << *Pt << '\n'); - Inst->replaceAllUsesWith(Pt); - ToRemove.insert(Inst); - inserted = true; - break; - } - if (!inserted) - CurPts.push_back(Inst); - } - } -} - -void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { - DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n"); - - DenseMap<Value *, Instruction *> SeenChains; - - for (auto &BB : *Func) { - for (auto &II : BB) { - Instruction *SExt = &II; - - // Collect all sext operation per type. - if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt)) - continue; - - DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n'); - - // Cases where we actually perform the optimization: - // 1. SExt is used in a getelementptr with more than 2 operand => - // likely we can merge some computation if they are done on 64 bits. - // 2. The beginning of the SExt chain is SExt several time. => - // code sharing is possible. - - bool insert = false; - // #1. - for (const User *U : SExt->users()) { - const Instruction *Inst = dyn_cast<GetElementPtrInst>(U); - if (Inst && Inst->getNumOperands() > 2) { - DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst - << '\n'); - insert = true; - break; - } - } - - // #2. - // Check the head of the chain. 
- Instruction *Inst = SExt; - Value *Last; - do { - int OpdIdx = 0; - const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); - if (BinOp && isa<ConstantInt>(BinOp->getOperand(0))) - OpdIdx = 1; - Last = Inst->getOperand(OpdIdx); - Inst = dyn_cast<Instruction>(Last); - } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst)); - - DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n'); - DenseMap<Value *, Instruction *>::iterator AlreadySeen = - SeenChains.find(Last); - if (insert || AlreadySeen != SeenChains.end()) { - DEBUG(dbgs() << "Insert\n"); - SExtInsts.push_back(SExt); - if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) { - DEBUG(dbgs() << "Insert chain member\n"); - SExtInsts.push_back(AlreadySeen->second); - SeenChains[Last] = nullptr; - } - } else { - DEBUG(dbgs() << "Record its chain membership\n"); - SeenChains[Last] = SExt; - } - } - } -} - -bool AArch64AddressTypePromotion::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - if (F.isDeclaration()) - return false; - Func = &F; - ConsideredSExtType = Type::getInt64Ty(Func->getContext()); - - DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n'); - - Instructions SExtInsts; - analyzeSExtension(SExtInsts); - return propagateSignExtension(SExtInsts); -} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index efc2218..5ce5792 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -12,13 +12,13 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64.h" #include "AArch64MCInstLower.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "InstPrinter/AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" @@ -35,11 +35,11 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCLinkerOptimizationHint.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -320,6 +320,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, switch (ExtraCode[0]) { default: return true; // Unknown modifier. + case 'a': // Print 'a' modifier + PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); + return false; case 'w': // Print W register case 'x': // Print X register if (MO.isReg()) @@ -388,7 +391,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) + if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a') return true; // Unknown modifier. 
const MachineOperand &MO = MI->getOperand(OpNum); @@ -580,8 +583,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &MO_Sym = MI->getOperand(0); MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym); MCOperand Sym, SymTLSDescLo12, SymTLSDesc; - MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); + MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE); MCInstLowering.lowerOperand(MO_Sym, Sym); MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp index a4950af..29f6d57 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -1,4 +1,4 @@ -//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===// +//===--- AArch64CallLowering.cpp - Call lowering --------------------------===// // // The LLVM Compiler Infrastructure // @@ -15,15 +15,36 @@ #include "AArch64CallLowering.h" #include "AArch64ISelLowering.h" - +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> + using namespace llvm; #ifndef LLVM_BUILD_GLOBAL_ISEL @@ -31,12 +52,12 @@ using namespace llvm; #endif AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) - : CallLowering(&TLI) { -} + : CallLowering(&TLI) {} struct IncomingArgHandler : public CallLowering::ValueHandler { - IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI) - : ValueHandler(MIRBuilder, MRI) {} + IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {} unsigned getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -45,6 +66,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64)); MIRBuilder.buildFrameIndex(AddrReg, FI); + StackUsed = std::max(StackUsed, Size + Offset); return AddrReg; } @@ -67,11 +89,14 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { /// parameters (it's a basic-block live-in), and a call instruction /// (it's an implicit-def of the BL). 
virtual void markPhysRegUsed(unsigned PhysReg) = 0; + + uint64_t StackUsed; }; struct FormalArgHandler : public IncomingArgHandler { - FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI) - : IncomingArgHandler(MIRBuilder, MRI) {} + FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { MIRBuilder.getMBB().addLiveIn(PhysReg); @@ -80,8 +105,8 @@ struct FormalArgHandler : public IncomingArgHandler { struct CallReturnHandler : public IncomingArgHandler { CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB) - : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {} + MachineInstrBuilder MIB, CCAssignFn *AssignFn) + : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} void markPhysRegUsed(unsigned PhysReg) override { MIB.addDef(PhysReg, RegState::Implicit); @@ -92,8 +117,10 @@ struct CallReturnHandler : public IncomingArgHandler { struct OutgoingArgHandler : public CallLowering::ValueHandler { OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder MIB) - : ValueHandler(MIRBuilder, MRI), MIB(MIB) {} + MachineInstrBuilder MIB, CCAssignFn *AssignFn, + CCAssignFn *AssignFnVarArg) + : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), + AssignFnVarArg(AssignFnVarArg), StackSize(0) {} unsigned getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -126,14 +153,29 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { MIRBuilder.buildStore(ValVReg, Addr, *MMO); } + bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + const CallLowering::ArgInfo &Info, + CCState &State) override { + bool Res; + if (Info.IsFixed) + Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + else + Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + + StackSize = State.getNextStackOffset(); + return Res; + } + MachineInstrBuilder MIB; + CCAssignFn *AssignFnVarArg; + uint64_t StackSize; }; -void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg, - SmallVectorImpl<ArgInfo> &SplitArgs, - const DataLayout &DL, - MachineRegisterInfo &MRI, - SplitArgTy PerformArgSplit) const { +void AArch64CallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, + const SplitArgTy &PerformArgSplit) const { const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); LLVMContext &Ctx = OrigArg.Ty->getContext(); @@ -145,7 +187,7 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg, // No splitting to do, but we want to replace the original type (e.g. [1 x // double] -> double). SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags); + OrigArg.Flags, OrigArg.IsFixed); return; } @@ -154,19 +196,12 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg, // FIXME: set split flags if they're actually used (e.g. i128 on AAPCS). 
Type *SplitTy = SplitVT.getTypeForEVT(Ctx); SplitArgs.push_back( - ArgInfo{MRI.createGenericVirtualRegister(LLT{*SplitTy, DL}), SplitTy, - OrigArg.Flags}); + ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)), + SplitTy, OrigArg.Flags, OrigArg.IsFixed}); } - SmallVector<uint64_t, 4> BitOffsets; - for (auto Offset : Offsets) - BitOffsets.push_back(Offset * 8); - - SmallVector<unsigned, 8> SplitRegs; - for (auto I = &SplitArgs[FirstRegIdx]; I != SplitArgs.end(); ++I) - SplitRegs.push_back(I->Reg); - - PerformArgSplit(SplitRegs, BitOffsets); + for (unsigned i = 0; i < Offsets.size(); ++i) + PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8); } bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, @@ -184,16 +219,16 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, auto &DL = F.getParent()->getDataLayout(); ArgInfo OrigArg{VReg, Val->getType()}; - setArgFlags(OrigArg, AttributeSet::ReturnIndex, DL, F); + setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); SmallVector<ArgInfo, 8> SplitArgs; splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) { - MIRBuilder.buildExtract(Regs, Offsets, VReg); + [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, VReg, Offset); }); - OutgoingArgHandler Handler(MIRBuilder, MRI, MIB); - Success = handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler); + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn); + Success = handleAssignments(MIRBuilder, SplitArgs, Handler); } MIRBuilder.insertInstr(MIB); @@ -203,7 +238,6 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const { - auto &Args = F.getArgumentList(); MachineFunction &MF = MIRBuilder.getMF(); MachineBasicBlock &MBB = MIRBuilder.getMBB(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -211,13 +245,27 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, SmallVector<ArgInfo, 8> SplitArgs; unsigned i = 0; - for (auto &Arg : Args) { + for (auto &Arg : F.args()) { ArgInfo OrigArg{VRegs[i], Arg.getType()}; - setArgFlags(OrigArg, i + 1, DL, F); + setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F); + bool Split = false; + LLT Ty = MRI.getType(VRegs[i]); + unsigned Dst = VRegs[i]; + splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) { - MIRBuilder.buildSequence(VRegs[i], Regs, Offsets); + [&](unsigned Reg, uint64_t Offset) { + if (!Split) { + Split = true; + Dst = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildUndef(Dst); + } + unsigned Tmp = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset); + Dst = Tmp; }); + + if (Dst != VRegs[i]) + MIRBuilder.buildCopy(VRegs[i], Dst); ++i; } @@ -228,10 +276,25 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCAssignFn *AssignFn = TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); - FormalArgHandler Handler(MIRBuilder, MRI); - if (!handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler)) + FormalArgHandler Handler(MIRBuilder, MRI, AssignFn); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; + if (F.isVarArg()) { + if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) { + // FIXME: we need to reimplement saveVarArgsRegisters from + // AArch64ISelLowering. 
+ return false; + } + + // We currently pass all varargs at 8-byte alignment. + uint64_t StackOffset = alignTo(Handler.StackUsed, 8); + + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); + } + // Move back to the end of the basic block. MIRBuilder.setMBB(MBB); @@ -239,6 +302,7 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, } bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, + CallingConv::ID CallConv, const MachineOperand &Callee, const ArgInfo &OrigRet, ArrayRef<ArgInfo> OrigArgs) const { @@ -250,21 +314,25 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, SmallVector<ArgInfo, 8> SplitArgs; for (auto &OrigArg : OrigArgs) { splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) { - MIRBuilder.buildExtract(Regs, Offsets, OrigArg.Reg); + [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset); }); } // Find out which ABI gets to decide where things go. const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); - CCAssignFn *CallAssignFn = - TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); + CCAssignFn *AssignFnFixed = + TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + CCAssignFn *AssignFnVarArg = + TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true); + + auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR : AArch64::BL); - MIB.addOperand(Callee); + MIB.add(Callee); // Tell the call which registers are clobbered. auto TRI = MF.getSubtarget().getRegisterInfo(); @@ -272,8 +340,9 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Do the actual argument marshalling. SmallVector<unsigned, 8> PhysRegs; - OutgoingArgHandler Handler(MIRBuilder, MRI, MIB); - if (!handleAssignments(MIRBuilder, CallAssignFn, SplitArgs, Handler)) + OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed, + AssignFnVarArg); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; // Now we can add the actual call instruction to the correct basic block. 
@@ -298,20 +367,23 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, SmallVector<uint64_t, 8> RegOffsets; SmallVector<unsigned, 8> SplitRegs; splitToValueTypes(OrigRet, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) { - std::copy(Offsets.begin(), Offsets.end(), - std::back_inserter(RegOffsets)); - std::copy(Regs.begin(), Regs.end(), - std::back_inserter(SplitRegs)); + [&](unsigned Reg, uint64_t Offset) { + RegOffsets.push_back(Offset); + SplitRegs.push_back(Reg); }); - CallReturnHandler Handler(MIRBuilder, MRI, MIB); - if (!handleAssignments(MIRBuilder, RetAssignFn, SplitArgs, Handler)) + CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; if (!RegOffsets.empty()) MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets); } + CallSeqStart.addImm(Handler.StackSize).addImm(0); + MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP) + .addImm(Handler.StackSize) + .addImm(0); + return true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h index ce66762..d96ce95 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h @@ -1,4 +1,4 @@ -//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===// +//===--- AArch64CallLowering.h - Call lowering ------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -12,18 +12,20 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING -#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/CodeGen/ValueTypes.h" +#include <cstdint> +#include <functional> namespace llvm { class AArch64TargetLowering; class AArch64CallLowering: public CallLowering { - public: +public: AArch64CallLowering(const AArch64TargetLowering &TLI); bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, @@ -32,8 +34,8 @@ class AArch64CallLowering: public CallLowering { bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, const MachineOperand &Callee, - const ArgInfo &OrigRet, + bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, + const MachineOperand &Callee, const ArgInfo &OrigRet, ArrayRef<ArgInfo> OrigArgs) const override; private: @@ -44,13 +46,14 @@ private: typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)> MemHandler; - typedef std::function<void(ArrayRef<unsigned>, ArrayRef<uint64_t>)> - SplitArgTy; + typedef std::function<void(unsigned, uint64_t)> SplitArgTy; void splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, - SplitArgTy SplitArg) const; + const SplitArgTy &SplitArg) const; }; -} // End of namespace llvm; -#endif + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 938779d..291bc5e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ 
b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -118,6 +118,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> ]>; +// Vararg functions on windows pass floats in integer registers +def CC_AArch64_Win64_VarArg : CallingConv<[ + CCIfType<[f16, f32], CCPromoteToType<f64>>, + CCIfType<[f64], CCBitConvertToType<i64>>, + CCDelegateTo<CC_AArch64_AAPCS> +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 6f8dd3e..b3b7385 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -113,7 +113,7 @@ struct LDTLSCleanup : public MachineFunctionPass { return Copy; } - // Create a virtal register in *TLSBaseAddrReg, and populate it by + // Create a virtual register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I.getParent()->getParent(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp new file mode 100644 index 0000000..51700f9 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -0,0 +1,339 @@ +//===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file contains a pass that transforms CBZ/CBNZ/TBZ/TBNZ instructions +/// into a conditional branch (B.cond), when the NZCV flags can be set for +/// "free". This is preferred on targets that have more flexibility when +/// scheduling B.cond instructions as compared to CBZ/CBNZ/TBZ/TBNZ (assuming +/// all other variables are equal). This can also reduce register pressure. +/// +/// A few examples: +/// +/// 1) add w8, w0, w1 -> cmn w0, w1 ; CMN is an alias of ADDS. +/// cbz w8, .LBB_2 -> b.eq .LBB0_2 +/// +/// 2) add w8, w0, w1 -> adds w8, w0, w1 ; w8 has multiple uses. +/// cbz w8, .LBB1_2 -> b.eq .LBB1_2 +/// +/// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses. 
+/// tbz w8, #31, .LBB6_2 -> b.pl .LBB6_2 +/// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-cond-br-tuning" +#define AARCH64_CONDBR_TUNING_NAME "AArch64 Conditional Branch Tuning" + +namespace { +class AArch64CondBrTuning : public MachineFunctionPass { + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + + MachineRegisterInfo *MRI; + +public: + static char ID; + AArch64CondBrTuning() : MachineFunctionPass(ID) { + initializeAArch64CondBrTuningPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return AARCH64_CONDBR_TUNING_NAME; } + +private: + MachineInstr *getOperandDef(const MachineOperand &MO); + MachineInstr *convertToFlagSetting(MachineInstr &MI, bool IsFlagSetting); + MachineInstr *convertToCondBr(MachineInstr &MI); + bool tryToTuneBranch(MachineInstr &MI, MachineInstr &DefMI); +}; +} // end anonymous namespace + +char AArch64CondBrTuning::ID = 0; + +INITIALIZE_PASS(AArch64CondBrTuning, "aarch64-cond-br-tuning", + AARCH64_CONDBR_TUNING_NAME, false, false) + +void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) { + if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return nullptr; + return MRI->getUniqueVRegDef(MO.getReg()); +} + +MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI, + bool IsFlagSetting) { + // If this is already the flag setting version of the instruction (e.g., SUBS) + // just make sure the implicit-def of NZCV isn't marked dead. + if (IsFlagSetting) { + for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands(); + I != E; ++I) { + MachineOperand &MO = MI.getOperand(I); + if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV) + MO.setIsDead(false); + } + return &MI; + } + bool Is64Bit; + unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit); + unsigned NewDestReg = MI.getOperand(0).getReg(); + if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg())) + NewDestReg = Is64Bit ? 
AArch64::XZR : AArch64::WZR; + + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(NewOpc), NewDestReg); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + + return MIB; +} + +MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) { + AArch64CC::CondCode CC; + MachineBasicBlock *TargetMBB = TII->getBranchDestBlock(MI); + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZW: + case AArch64::CBZX: + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + case AArch64::CBNZX: + CC = AArch64CC::NE; + break; + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::PL; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::MI; + break; + } + return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TargetMBB); +} + +bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, + MachineInstr &DefMI) { + // We don't want NZCV bits live across blocks. + if (MI.getParent() != DefMI.getParent()) + return false; + + bool IsFlagSetting = true; + unsigned MIOpc = MI.getOpcode(); + MachineInstr *NewCmp = nullptr, *NewBr = nullptr; + switch (DefMI.getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDWrr: + case AArch64::ADDWrs: + case AArch64::ADDWrx: + case AArch64::ANDWri: + case AArch64::ANDWrr: + case AArch64::ANDWrs: + case AArch64::BICWrr: + case AArch64::BICWrs: + case AArch64::SUBWri: + case AArch64::SUBWrr: + case AArch64::SUBWrs: + case AArch64::SUBWrx: + IsFlagSetting = false; + LLVM_FALLTHROUGH; + case AArch64::ADDSWri: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::BICSWrr: + case AArch64::BICSWrs: + case AArch64::SUBSWri: + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + switch (MIOpc) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::TBZW: + case AArch64::TBNZW: + // Check to see if the TBZ/TBNZ is checking the sign bit. + if ((MIOpc == AArch64::TBZW || MIOpc == AArch64::TBNZW) && + MI.getOperand(1).getImm() != 31) + return false; + + // There must not be any instruction between DefMI and MI that clobbers or + // reads NZCV. 
+ MachineBasicBlock::iterator I(DefMI), E(MI); + for (I = std::next(I); I != E; ++I) { + if (I->modifiesRegister(AArch64::NZCV, TRI) || + I->readsRegister(AArch64::NZCV, TRI)) + return false; + } + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(DefMI.print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(MI.print(dbgs())); + + NewCmp = convertToFlagSetting(DefMI, IsFlagSetting); + NewBr = convertToCondBr(MI); + break; + } + break; + + case AArch64::ADDXri: + case AArch64::ADDXrr: + case AArch64::ADDXrs: + case AArch64::ADDXrx: + case AArch64::ANDXri: + case AArch64::ANDXrr: + case AArch64::ANDXrs: + case AArch64::BICXrr: + case AArch64::BICXrs: + case AArch64::SUBXri: + case AArch64::SUBXrr: + case AArch64::SUBXrs: + case AArch64::SUBXrx: + IsFlagSetting = false; + LLVM_FALLTHROUGH; + case AArch64::ADDSXri: + case AArch64::ADDSXrr: + case AArch64::ADDSXrs: + case AArch64::ADDSXrx: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSXrr: + case AArch64::BICSXrs: + case AArch64::SUBSXri: + case AArch64::SUBSXrr: + case AArch64::SUBSXrs: + case AArch64::SUBSXrx: + switch (MIOpc) { + default: + llvm_unreachable("Unexpected opcode!"); + + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::TBZX: + case AArch64::TBNZX: { + // Check to see if the TBZ/TBNZ is checking the sign bit. + if ((MIOpc == AArch64::TBZX || MIOpc == AArch64::TBNZX) && + MI.getOperand(1).getImm() != 63) + return false; + // There must not be any instruction between DefMI and MI that clobbers or + // reads NZCV. + MachineBasicBlock::iterator I(DefMI), E(MI); + for (I = std::next(I); I != E; ++I) { + if (I->modifiesRegister(AArch64::NZCV, TRI) || + I->readsRegister(AArch64::NZCV, TRI)) + return false; + } + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(DefMI.print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(MI.print(dbgs())); + + NewCmp = convertToFlagSetting(DefMI, IsFlagSetting); + NewBr = convertToCondBr(MI); + break; + } + } + break; + } + (void)NewCmp; (void)NewBr; + assert(NewCmp && NewBr && "Expected new instructions."); + + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(NewCmp->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(NewBr->print(dbgs())); + + // If this was a flag setting version of the instruction, we use the original + // instruction by just clearing the dead marked on the implicit-def of NCZV. + // Therefore, we should not erase this instruction. 
+ if (!IsFlagSetting) + DefMI.eraseFromParent(); + MI.eraseFromParent(); + return true; +} + +bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + + DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n" + << "********** Function: " << MF.getName() << '\n'); + + TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + bool LocalChange = false; + for (MachineBasicBlock::iterator I = MBB.getFirstTerminator(), + E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + switch (MI.getOpcode()) { + default: + break; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + MachineInstr *DefMI = getOperandDef(MI.getOperand(0)); + LocalChange = (DefMI && tryToTuneBranch(MI, *DefMI)); + break; + } + // If the optimization was successful, we can't optimize any other + // branches because doing so would clobber the NZCV flags. + if (LocalChange) { + Changed = true; + break; + } + } + } + return Changed; +} + +FunctionPass *llvm::createAArch64CondBrTuning() { + return new AArch64CondBrTuning(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 8b18632..2dfcd2d 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -265,10 +265,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, // Change immediate in comparison instruction (ADDS or SUBS). BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc)) - .addOperand(CmpMI->getOperand(0)) - .addOperand(CmpMI->getOperand(1)) + .add(CmpMI->getOperand(0)) + .add(CmpMI->getOperand(1)) .addImm(Imm) - .addOperand(CmpMI->getOperand(3)); + .add(CmpMI->getOperand(3)); CmpMI->eraseFromParent(); // The fact that this comparison was picked ensures that it's related to the @@ -278,7 +278,7 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, // Change condition in branch instruction. BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc)) .addImm(Cmp) - .addOperand(BrMI.getOperand(1)); + .add(BrMI.getOperand(1)); BrMI.eraseFromParent(); MBB->updateTerminator(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index da09b36..9eda56c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -139,6 +140,7 @@ class SSACCmpConv { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MachineRegisterInfo *MRI; + const MachineBranchProbabilityInfo *MBPI; public: /// The first block containing a conditional branch, dominating everything @@ -186,8 +188,10 @@ private: public: /// runOnMachineFunction - Initialize per-function data structures. 
- void runOnMachineFunction(MachineFunction &MF) { + void runOnMachineFunction(MachineFunction &MF, + const MachineBranchProbabilityInfo *MBPI) { this->MF = &MF; + this->MBPI = MBPI; TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -564,8 +568,40 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { // All CmpBB instructions are moved into Head, and CmpBB is deleted. // Update the CFG first. updateTailPHIs(); - Head->removeSuccessor(CmpBB, true); - CmpBB->removeSuccessor(Tail, true); + + // Save successor probabilties before removing CmpBB and Tail from their + // parents. + BranchProbability Head2CmpBB = MBPI->getEdgeProbability(Head, CmpBB); + BranchProbability CmpBB2Tail = MBPI->getEdgeProbability(CmpBB, Tail); + + Head->removeSuccessor(CmpBB); + CmpBB->removeSuccessor(Tail); + + // If Head and CmpBB had successor probabilties, udpate the probabilities to + // reflect the ccmp-conversion. + if (Head->hasSuccessorProbabilities() && CmpBB->hasSuccessorProbabilities()) { + + // Head is allowed two successors. We've removed CmpBB, so the remaining + // successor is Tail. We need to increase the successor probability for + // Tail to account for the CmpBB path we removed. + // + // Pr(Tail|Head) += Pr(CmpBB|Head) * Pr(Tail|CmpBB). + assert(*Head->succ_begin() == Tail && "Head successor is not Tail"); + BranchProbability Head2Tail = MBPI->getEdgeProbability(Head, Tail); + Head->setSuccProbability(Head->succ_begin(), + Head2Tail + Head2CmpBB * CmpBB2Tail); + + // We will transfer successors of CmpBB to Head in a moment without + // normalizing the successor probabilities. Set the successor probabilites + // before doing so. + // + // Pr(I|Head) = Pr(CmpBB|Head) * Pr(I|CmpBB). + for (auto I = CmpBB->succ_begin(), E = CmpBB->succ_end(); I != E; ++I) { + BranchProbability CmpBB2I = MBPI->getEdgeProbability(CmpBB, *I); + CmpBB->setSuccProbability(I, Head2CmpBB * CmpBB2I); + } + } + Head->transferSuccessorsAndUpdatePHIs(CmpBB); DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); TII->removeBranch(*Head); @@ -594,7 +630,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. BuildMI(*Head, Head->end(), TermDL, MCID) .addReg(DestReg, RegState::Define | RegState::Dead) - .addOperand(HeadCond[2]) + .add(HeadCond[2]) .addImm(0) .addImm(0); // SUBS uses the GPR*sp register classes. @@ -650,13 +686,12 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { if (CmpMI->getOperand(FirstOp + 1).isReg()) MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), TII->getRegClass(MCID, 1, TRI, *MF)); - MachineInstrBuilder MIB = - BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) - .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn + MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) + .add(CmpMI->getOperand(FirstOp)); // Register Rn if (isZBranch) MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0 else - MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate + MIB.add(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate MIB.addImm(NZCV).addImm(HeadCmpBBCC); // If CmpMI was a terminator, we need a new conditional branch to replace it. 
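[Illustrative aside, not part of the patch] A worked numeric sketch of the successor-probability bookkeeping added to SSACCmpConv::convert() above, using plain doubles instead of llvm::BranchProbability; all probabilities are invented:

#include <cstdio>

int main() {
  // Hypothetical CFG before conversion: Head -> {CmpBB, Tail}, CmpBB -> {Tail, Other}.
  double Head2CmpBB = 0.40, Head2Tail = 0.60;
  double CmpBB2Tail = 0.25, CmpBB2Other = 0.75;

  // CmpBB is folded into Head, so the path Head -> CmpBB -> Tail now reaches
  // Tail directly from Head:
  //   Pr(Tail|Head) += Pr(CmpBB|Head) * Pr(Tail|CmpBB)
  Head2Tail += Head2CmpBB * CmpBB2Tail;          // 0.60 + 0.40 * 0.25 = 0.70

  // CmpBB's remaining successors are transferred to Head with the same scaling:
  //   Pr(Other|Head) = Pr(CmpBB|Head) * Pr(Other|CmpBB)
  double Head2Other = Head2CmpBB * CmpBB2Other;  // 0.40 * 0.75 = 0.30

  std::printf("Head->Tail = %.2f, Head->Other = %.2f (sum = %.2f)\n",
              Head2Tail, Head2Other, Head2Tail + Head2Other);
}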
@@ -666,7 +701,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { CmpMI->getOpcode() == AArch64::CBNZX; BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc)) .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ) - .addOperand(CmpMI->getOperand(1)); // Branch target. + .add(CmpMI->getOperand(1)); // Branch target. } CmpMI->eraseFromParent(); Head->updateTerminator(); @@ -718,6 +753,7 @@ int SSACCmpConv::expectedCodeSizeDelta() const { namespace { class AArch64ConditionalCompares : public MachineFunctionPass { + const MachineBranchProbabilityInfo *MBPI; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; MCSchedModel SchedModel; @@ -754,6 +790,7 @@ char AArch64ConditionalCompares::ID = 0; INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", @@ -764,6 +801,7 @@ FunctionPass *llvm::createAArch64ConditionalCompares() { } void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); @@ -893,12 +931,13 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); DomTree = &getAnalysis<MachineDominatorTree>(); Loops = getAnalysisIfAvailable<MachineLoopInfo>(); + MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; MinSize = MF.getFunction()->optForMinSize(); bool Changed = false; - CmpConv.runOnMachineFunction(MF); + CmpConv.runOnMachineFunction(MF, MBPI); // Visit blocks in dominator tree pre-order. The pre-order enables multiple // cmp-conversions from the same head block. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 30e2b23..b72f23b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -13,15 +13,17 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "aarch64-dead-defs" @@ -84,6 +86,55 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n"); continue; } + if (MF.getSubtarget<AArch64Subtarget>().hasLSE()) { + // XZ/WZ for LSE can only be used when acquire semantics are not used, + // LDOPAL WZ is an invalid opcode. 
+ switch (MI.getOpcode()) { + case AArch64::CASALb: + case AArch64::CASALh: + case AArch64::CASALs: + case AArch64::CASALd: + case AArch64::SWPALb: + case AArch64::SWPALh: + case AArch64::SWPALs: + case AArch64::SWPALd: + case AArch64::LDADDALb: + case AArch64::LDADDALh: + case AArch64::LDADDALs: + case AArch64::LDADDALd: + case AArch64::LDCLRALb: + case AArch64::LDCLRALh: + case AArch64::LDCLRALs: + case AArch64::LDCLRALd: + case AArch64::LDEORALb: + case AArch64::LDEORALh: + case AArch64::LDEORALs: + case AArch64::LDEORALd: + case AArch64::LDSETALb: + case AArch64::LDSETALh: + case AArch64::LDSETALs: + case AArch64::LDSETALd: + case AArch64::LDSMINALb: + case AArch64::LDSMINALh: + case AArch64::LDSMINALs: + case AArch64::LDSMINALd: + case AArch64::LDSMAXALb: + case AArch64::LDSMAXALh: + case AArch64::LDSMAXALs: + case AArch64::LDSMAXALd: + case AArch64::LDUMINALb: + case AArch64::LDUMINALh: + case AArch64::LDUMINALs: + case AArch64::LDUMINALd: + case AArch64::LDUMAXALb: + case AArch64::LDUMAXALh: + case AArch64::LDUMAXALs: + case AArch64::LDUMAXALd: + continue; + default: + break; + } + } const MCInstrDesc &Desc = MI.getDesc(); for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) { MachineOperand &MO = MI.getOperand(I); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index fe1c0be..d52cd84 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -14,9 +14,10 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -70,9 +71,9 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, const MachineOperand &MO = OldMI.getOperand(i); assert(MO.isReg() && MO.getReg()); if (MO.isUse()) - UseMI.addOperand(MO); + UseMI.add(MO); else - DefMI.addOperand(MO); + DefMI.add(MO); } } @@ -112,7 +113,7 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, // Create the ORR-immediate instruction. MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(AArch64::XZR) .addImm(Encoding); @@ -179,7 +180,7 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, // Create the ORR-immediate instruction. MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(AArch64::XZR) .addImm(Encoding); @@ -362,7 +363,7 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(AArch64::XZR) .addImm(Encoding); @@ -425,7 +426,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(BitSize == 32 ? 
AArch64::WZR : AArch64::XZR) .addImm(Encoding); transferImpOps(MI, MIB, MIB); @@ -539,15 +540,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, if (Imm != 0) { unsigned LZ = countLeadingZeros(Imm); unsigned TZ = countTrailingZeros(Imm); - Shift = ((63 - LZ) / 16) * 16; - LastShift = (TZ / 16) * 16; + Shift = (TZ / 16) * 16; + LastShift = ((63 - LZ) / 16) * 16; } unsigned Imm16 = (Imm >> Shift) & Mask; bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) .addReg(DstReg, RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) + getDeadRegState(DstIsDead && Shift == LastShift)) .addImm(Imm16) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); @@ -564,15 +565,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, MachineInstrBuilder MIB2; unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi); - while (Shift != LastShift) { - Shift -= 16; + while (Shift < LastShift) { + Shift += 16; Imm16 = (Imm >> Shift) & Mask; if (Imm16 == (isNeg ? Mask : 0)) continue; // This 16-bit portion is already set correctly. MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) .addReg(DstReg, RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) + getDeadRegState(DstIsDead && Shift == LastShift)) .addReg(DstReg) .addImm(Imm16) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); @@ -583,27 +584,21 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, return true; } -static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MBB->addLiveIn(*I); -} - bool AArch64ExpandPseudo::expandCMP_SWAP( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); - - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. 
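[Illustrative aside, not part of the patch] The Shift/LastShift change in expandMOVImm above makes the MOVZ/MOVK sequence start at the lowest non-zero 16-bit chunk and work upward. A stand-alone sketch of that ordering, simplified to the MOVZ path only (the real code also chooses MOVN for mostly-ones values) and using GCC/Clang count-zero builtins:

#include <cstdint>
#include <cstdio>

// Print a MOVZ/MOVK sequence for Imm, lowest non-zero 16-bit chunk first.
static void emitMovSequence(uint64_t Imm) {
  if (Imm == 0) {
    std::printf("movz x0, #0x0\n");
    return;
  }
  unsigned LZ = __builtin_clzll(Imm);
  unsigned TZ = __builtin_ctzll(Imm);
  unsigned Shift = (TZ / 16) * 16;            // lowest chunk containing a set bit
  unsigned LastShift = ((63 - LZ) / 16) * 16; // highest chunk containing a set bit

  std::printf("movz x0, #0x%llx, lsl #%u\n",
              (unsigned long long)((Imm >> Shift) & 0xffff), Shift);
  while (Shift < LastShift) {
    Shift += 16;
    uint64_t Imm16 = (Imm >> Shift) & 0xffff;
    if (Imm16 == 0)
      continue;                               // chunk already zero after the MOVZ
    std::printf("movk x0, #0x%llx, lsl #%u\n",
                (unsigned long long)Imm16, Shift);
  }
}

// 0x0012345600000000 -> movz x0, #0x3456, lsl #32 ; movk x0, #0x12, lsl #48
int main() { emitMovSequence(0x0012345600000000ULL); }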
+ assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + unsigned NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -615,19 +610,18 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( MF->insert(++StoreBB->getIterator(), DoneBB); // .Lloadcmp: + // mov wStatus, 0 // ldaxr xDest, [xAddr] // cmp xDest, xDesired // b.ne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - + if (!StatusDead) + BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg) + .addImm(0).addImm(0); BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) - .addReg(Addr.getReg()); + .addReg(AddrReg); BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) - .addOperand(Desired) + .addReg(DesiredReg) .addImm(ExtendImm); BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) .addImm(AArch64CC::NE) @@ -639,27 +633,35 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( // .Lstore: // stlxr wStatus, xNew, [xAddr] // cbnz wStatus, .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) - .addOperand(New) - .addOperand(Addr); + .addReg(NewReg) + .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } @@ -672,16 +674,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( MachineOperand &DestLo = MI.getOperand(0); MachineOperand &DestHi = MI.getOperand(1); unsigned StatusReg = MI.getOperand(2).getReg(); - MachineOperand &Addr = MI.getOperand(3); - MachineOperand &DesiredLo = MI.getOperand(4); - MachineOperand &DesiredHi = MI.getOperand(5); - MachineOperand &NewLo = MI.getOperand(6); - MachineOperand &NewHi = MI.getOperand(7); - - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(2).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. 
+ assert(!MI.getOperand(3).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(3).getReg(); + unsigned DesiredLoReg = MI.getOperand(4).getReg(); + unsigned DesiredHiReg = MI.getOperand(5).getReg(); + unsigned NewLoReg = MI.getOperand(6).getReg(); + unsigned NewHiReg = MI.getOperand(7).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -697,20 +698,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // cmp xDestLo, xDesiredLo // sbcs xDestHi, xDesiredHi // b.ne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(DestLo.getReg()); - LoadCmpBB->addLiveIn(DestHi.getReg()); - LoadCmpBB->addLiveIn(DesiredLo.getReg()); - LoadCmpBB->addLiveIn(DesiredHi.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) .addReg(DestLo.getReg(), RegState::Define) .addReg(DestHi.getReg(), RegState::Define) - .addReg(Addr.getReg()); + .addReg(AddrReg); BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead())) - .addOperand(DesiredLo) + .addReg(DesiredLoReg) .addImm(0); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg) .addUse(AArch64::WZR) @@ -718,14 +712,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead())) - .addOperand(DesiredHi) + .addReg(DesiredHiReg) .addImm(0); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg) .addUse(StatusReg, RegState::Kill) .addUse(StatusReg, RegState::Kill) .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW)) - .addUse(StatusReg, RegState::Kill) + .addUse(StatusReg, getKillRegState(StatusDead)) .addMBB(DoneBB); LoadCmpBB->addSuccessor(DoneBB); LoadCmpBB->addSuccessor(StoreBB); @@ -733,28 +727,36 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // .Lstore: // stlxp wStatus, xNewLo, xNewHi, [xAddr] // cbnz wStatus, .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(NewLo.getReg()); - StoreBB->addLiveIn(NewHi.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) - .addOperand(NewLo) - .addOperand(NewHi) - .addOperand(Addr); + .addReg(NewLoReg) + .addReg(NewHiReg) + .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute liveness bottom up. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass in the loop to get the loop carried dependencies right. 
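[Illustrative aside, not part of the patch] For context, the CMP_SWAP pseudos being expanded here implement a strong compare-and-swap, roughly the loop sketched in the .Lloadcmp/.Lstore comments. A C++-level equivalent, purely for orientation:

#include <atomic>
#include <cstdint>
#include <cstdio>

// On AArch64 without LSE, this typically lowers to the same kind of
// ldaxr / cmp / b.ne / stlxr / cbnz loop that expandCMP_SWAP builds.
static bool cmpSwap(std::atomic<uint64_t> &Mem, uint64_t Desired, uint64_t New) {
  return Mem.compare_exchange_strong(Desired, New);
}

int main() {
  std::atomic<uint64_t> V{42};
  bool OK = cmpSwap(V, 42, 7);
  std::printf("swapped: %d, value now: %llu\n", OK, (unsigned long long)V.load());
}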
+ StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } @@ -825,8 +827,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), MI.getOperand(0).getReg()) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); transferImpOps(MI, MIB1, MIB1); MI.eraseFromParent(); @@ -842,7 +844,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(DstReg); if (MO1.isGlobal()) { @@ -878,19 +880,31 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned DstReg = MI.getOperand(0).getReg(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) - .addOperand(MI.getOperand(1)); + .add(MI.getOperand(1)); MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) - .addOperand(MI.getOperand(0)) + .add(MI.getOperand(0)) .addReg(DstReg) - .addOperand(MI.getOperand(2)) + .add(MI.getOperand(2)) .addImm(0); transferImpOps(MI, MIB1, MIB2); MI.eraseFromParent(); return true; } + case AArch64::MOVbaseTLS: { + unsigned DstReg = MI.getOperand(0).getReg(); + auto SysReg = AArch64SysReg::TPIDR_EL0; + MachineFunction *MF = MBB.getParent(); + if (MF->getTarget().getTargetTriple().isOSFuchsia() && + MF->getTarget().getCodeModel() == CodeModel::Kernel) + SysReg = AArch64SysReg::TPIDR_EL1; + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg) + .addImm(SysReg); + MI.eraseFromParent(); + return true; + } case AArch64::MOVi32imm: return expandMOVImm(MBB, MBBI, 32); @@ -931,6 +945,19 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, AArch64::XZR, NextMBBI); case AArch64::CMP_SWAP_128: return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); + + case AArch64::AESMCrrTied: + case AArch64::AESIMCrrTied: { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr : + AArch64::AESIMCrr)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } } return false; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp new file mode 100644 index 0000000..c0e2235 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -0,0 +1,790 @@ +//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions +/// that may inhibit the HW prefetching. This is done in two steps. Before +/// ISel, we mark strided loads (i.e. those that will likely benefit from +/// prefetching) with metadata. Then, after opcodes have been finalized, we +/// insert MOVs and re-write loads to prevent unintnentional tag collisions. 
+// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "falkor-hwpf-fix" + +STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); +STATISTIC(NumCollisionsAvoided, + "Number of HW prefetch tag collisions avoided"); +STATISTIC(NumCollisionsNotAvoided, + "Number of HW prefetch tag collisions not avoided due to lack of regsiters"); + +namespace { + +class FalkorMarkStridedAccesses { +public: + FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE) + : LI(LI), SE(SE) {} + + bool run(); + +private: + bool runOnLoop(Loop &L); + + LoopInfo &LI; + ScalarEvolution &SE; +}; + +class FalkorMarkStridedAccessesLegacy : public FunctionPass { +public: + static char ID; // Pass ID, replacement for typeid + FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) { + initializeFalkorMarkStridedAccessesLegacyPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + // FIXME: For some reason, preserving SE here breaks LSR (even if + // this pass changes nothing). 
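[Illustrative aside, not part of the patch] A source-level example of what FalkorMarkStridedAccesses looks for: a load in an innermost loop whose address is an affine function of the induction variable, so its pointer SCEV is an affine SCEVAddRecExpr and the load receives the strided-access metadata. The function and values below are invented:

#include <cstdio>

// The pointer operand of the load below has SCEV {A,+,4*Stride}: an affine
// add-recurrence that is not loop-invariant, so the pass marks the load.
static float sumStrided(const float *A, long N, long Stride) {
  float S = 0.0f;
  for (long I = 0; I < N; ++I)
    S += A[I * Stride];
  return S;
}

int main() {
  float Data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  std::printf("%f\n", sumStrided(Data, 4, 2)); // 1 + 3 + 5 + 7 = 16
}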
+ // AU.addPreserved<ScalarEvolutionWrapperPass>(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +char FalkorMarkStridedAccessesLegacy::ID = 0; +INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) + +FunctionPass *llvm::createFalkorMarkStridedAccessesPass() { + return new FalkorMarkStridedAccessesLegacy(); +} + +bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) { + TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const AArch64Subtarget *ST = + TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F); + if (ST->getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(F)) + return false; + + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + + FalkorMarkStridedAccesses LDP(LI, SE); + return LDP.run(); +} + +bool FalkorMarkStridedAccesses::run() { + bool MadeChange = false; + + for (Loop *L : LI) + for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt) + MadeChange |= runOnLoop(**LIt); + + return MadeChange; +} + +bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) { + // Only mark strided loads in the inner-most loop + if (!L.empty()) + return false; + + bool MadeChange = false; + + for (BasicBlock *BB : L.blocks()) { + for (Instruction &I : *BB) { + LoadInst *LoadI = dyn_cast<LoadInst>(&I); + if (!LoadI) + continue; + + Value *PtrValue = LoadI->getPointerOperand(); + if (L.isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD, + MDNode::get(LoadI->getContext(), {})); + ++NumStridedLoadsMarked; + DEBUG(dbgs() << "Load: " << I << " marked as strided\n"); + MadeChange = true; + } + } + + return MadeChange; +} + +namespace { + +class FalkorHWPFFix : public MachineFunctionPass { +public: + static char ID; + + FalkorHWPFFix() : MachineFunctionPass(ID) { + initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + void runOnLoop(MachineLoop &L, MachineFunction &Fn); + + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap; + bool Modified; +}; + +/// Bits from load opcodes used to compute HW prefetcher instruction tags. 
+struct LoadInfo { + LoadInfo() + : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr), + IsPrePost(false) {} + unsigned DestReg; + unsigned BaseReg; + int BaseRegIdx; + const MachineOperand *OffsetOpnd; + bool IsPrePost; +}; + +} // namespace + +char FalkorHWPFFix::ID = 0; + +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) + +static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { + return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8); +} + +static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) { + int DestRegIdx; + int BaseRegIdx; + int OffsetIdx; + bool IsPrePost; + + switch (MI.getOpcode()) { + default: + return None; + + case AArch64::LD1i8: + case AArch64::LD1i16: + case AArch64::LD1i32: + case AArch64::LD1i64: + case AArch64::LD2i8: + case AArch64::LD2i16: + case AArch64::LD2i32: + case AArch64::LD2i64: + case AArch64::LD3i8: + case AArch64::LD3i16: + case AArch64::LD3i32: + case AArch64::LD4i8: + case AArch64::LD4i16: + case AArch64::LD4i32: + DestRegIdx = 0; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD3i64: + case AArch64::LD4i64: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d: + case AArch64::LD1Onev2s: + case AArch64::LD1Onev4h: + case AArch64::LD1Onev8b: + case AArch64::LD1Onev2d: + case AArch64::LD1Onev4s: + case AArch64::LD1Onev8h: + case AArch64::LD1Onev16b: + case AArch64::LD1Rv1d: + case AArch64::LD1Rv2s: + case AArch64::LD1Rv4h: + case AArch64::LD1Rv8b: + case AArch64::LD1Rv2d: + case AArch64::LD1Rv4s: + case AArch64::LD1Rv8h: + case AArch64::LD1Rv16b: + case AArch64::LD1Twov1d: + case AArch64::LD1Twov2s: + case AArch64::LD1Twov4h: + case AArch64::LD1Twov8b: + case AArch64::LD2Twov2s: + case AArch64::LD2Twov4s: + case AArch64::LD2Twov8b: + case AArch64::LD2Rv1d: + case AArch64::LD2Rv2s: + case AArch64::LD2Rv4s: + case AArch64::LD2Rv8b: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d: + case AArch64::LD1Twov4s: + case AArch64::LD1Twov8h: + case AArch64::LD1Twov16b: + case AArch64::LD1Threev1d: + case AArch64::LD1Threev2s: + case AArch64::LD1Threev4h: + case AArch64::LD1Threev8b: + case AArch64::LD1Threev2d: + case AArch64::LD1Threev4s: + case AArch64::LD1Threev8h: + case AArch64::LD1Threev16b: + case AArch64::LD1Fourv1d: + case AArch64::LD1Fourv2s: + case AArch64::LD1Fourv4h: + case AArch64::LD1Fourv8b: + case AArch64::LD1Fourv2d: + case AArch64::LD1Fourv4s: + case AArch64::LD1Fourv8h: + case AArch64::LD1Fourv16b: + case AArch64::LD2Twov2d: + case AArch64::LD2Twov4h: + case AArch64::LD2Twov8h: + case AArch64::LD2Twov16b: + case AArch64::LD2Rv2d: + case AArch64::LD2Rv4h: + case AArch64::LD2Rv8h: + case AArch64::LD2Rv16b: + case AArch64::LD3Threev2s: + case AArch64::LD3Threev4h: + case AArch64::LD3Threev8b: + case AArch64::LD3Threev2d: + case AArch64::LD3Threev4s: + case AArch64::LD3Threev8h: + case AArch64::LD3Threev16b: + case AArch64::LD3Rv1d: + case AArch64::LD3Rv2s: + case AArch64::LD3Rv4h: + case AArch64::LD3Rv8b: + case AArch64::LD3Rv2d: + case AArch64::LD3Rv4s: + case AArch64::LD3Rv8h: + case AArch64::LD3Rv16b: + case AArch64::LD4Fourv2s: + case AArch64::LD4Fourv4h: + case AArch64::LD4Fourv8b: + case AArch64::LD4Fourv2d: + case 
AArch64::LD4Fourv4s: + case AArch64::LD4Fourv8h: + case AArch64::LD4Fourv16b: + case AArch64::LD4Rv1d: + case AArch64::LD4Rv2s: + case AArch64::LD4Rv4h: + case AArch64::LD4Rv8b: + case AArch64::LD4Rv2d: + case AArch64::LD4Rv4s: + case AArch64::LD4Rv8h: + case AArch64::LD4Rv16b: + DestRegIdx = -1; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1i8_POST: + case AArch64::LD1i16_POST: + case AArch64::LD1i32_POST: + case AArch64::LD1i64_POST: + case AArch64::LD2i8_POST: + case AArch64::LD2i16_POST: + case AArch64::LD2i32_POST: + case AArch64::LD2i64_POST: + case AArch64::LD3i8_POST: + case AArch64::LD3i16_POST: + case AArch64::LD3i32_POST: + case AArch64::LD4i8_POST: + case AArch64::LD4i16_POST: + case AArch64::LD4i32_POST: + DestRegIdx = 1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD3i64_POST: + case AArch64::LD4i64_POST: + DestRegIdx = -1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d_POST: + case AArch64::LD1Onev2s_POST: + case AArch64::LD1Onev4h_POST: + case AArch64::LD1Onev8b_POST: + case AArch64::LD1Onev2d_POST: + case AArch64::LD1Onev4s_POST: + case AArch64::LD1Onev8h_POST: + case AArch64::LD1Onev16b_POST: + case AArch64::LD1Rv1d_POST: + case AArch64::LD1Rv2s_POST: + case AArch64::LD1Rv4h_POST: + case AArch64::LD1Rv8b_POST: + case AArch64::LD1Rv2d_POST: + case AArch64::LD1Rv4s_POST: + case AArch64::LD1Rv8h_POST: + case AArch64::LD1Rv16b_POST: + case AArch64::LD1Twov1d_POST: + case AArch64::LD1Twov2s_POST: + case AArch64::LD1Twov4h_POST: + case AArch64::LD1Twov8b_POST: + case AArch64::LD2Twov2s_POST: + case AArch64::LD2Twov4s_POST: + case AArch64::LD2Twov8b_POST: + case AArch64::LD2Rv1d_POST: + case AArch64::LD2Rv2s_POST: + case AArch64::LD2Rv4s_POST: + case AArch64::LD2Rv8b_POST: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d_POST: + case AArch64::LD1Twov4s_POST: + case AArch64::LD1Twov8h_POST: + case AArch64::LD1Twov16b_POST: + case AArch64::LD1Threev1d_POST: + case AArch64::LD1Threev2s_POST: + case AArch64::LD1Threev4h_POST: + case AArch64::LD1Threev8b_POST: + case AArch64::LD1Threev2d_POST: + case AArch64::LD1Threev4s_POST: + case AArch64::LD1Threev8h_POST: + case AArch64::LD1Threev16b_POST: + case AArch64::LD1Fourv1d_POST: + case AArch64::LD1Fourv2s_POST: + case AArch64::LD1Fourv4h_POST: + case AArch64::LD1Fourv8b_POST: + case AArch64::LD1Fourv2d_POST: + case AArch64::LD1Fourv4s_POST: + case AArch64::LD1Fourv8h_POST: + case AArch64::LD1Fourv16b_POST: + case AArch64::LD2Twov2d_POST: + case AArch64::LD2Twov4h_POST: + case AArch64::LD2Twov8h_POST: + case AArch64::LD2Twov16b_POST: + case AArch64::LD2Rv2d_POST: + case AArch64::LD2Rv4h_POST: + case AArch64::LD2Rv8h_POST: + case AArch64::LD2Rv16b_POST: + case AArch64::LD3Threev2s_POST: + case AArch64::LD3Threev4h_POST: + case AArch64::LD3Threev8b_POST: + case AArch64::LD3Threev2d_POST: + case AArch64::LD3Threev4s_POST: + case AArch64::LD3Threev8h_POST: + case AArch64::LD3Threev16b_POST: + case AArch64::LD3Rv1d_POST: + case AArch64::LD3Rv2s_POST: + case AArch64::LD3Rv4h_POST: + case AArch64::LD3Rv8b_POST: + case AArch64::LD3Rv2d_POST: + case AArch64::LD3Rv4s_POST: + case AArch64::LD3Rv8h_POST: + case AArch64::LD3Rv16b_POST: + case AArch64::LD4Fourv2s_POST: + case AArch64::LD4Fourv4h_POST: + case AArch64::LD4Fourv8b_POST: + case AArch64::LD4Fourv2d_POST: + case AArch64::LD4Fourv4s_POST: + case AArch64::LD4Fourv8h_POST: + case AArch64::LD4Fourv16b_POST: + 
case AArch64::LD4Rv1d_POST: + case AArch64::LD4Rv2s_POST: + case AArch64::LD4Rv4h_POST: + case AArch64::LD4Rv8b_POST: + case AArch64::LD4Rv2d_POST: + case AArch64::LD4Rv4s_POST: + case AArch64::LD4Rv8h_POST: + case AArch64::LD4Rv16b_POST: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBBui: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRBui: + case AArch64::LDRDl: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRDui: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHHui: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRHui: + case AArch64::LDRQl: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRQui: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBWui: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSBXui: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHWui: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSHXui: + case AArch64::LDRSWl: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSWui: + case AArch64::LDRSl: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRSui: + case AArch64::LDRWl: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRWui: + case AArch64::LDRXl: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::LDRXui: + case AArch64::LDURBBi: + case AArch64::LDURBi: + case AArch64::LDURDi: + case AArch64::LDURHHi: + case AArch64::LDURHi: + case AArch64::LDURQi: + case AArch64::LDURSBWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHWi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDURSi: + case AArch64::LDURWi: + case AArch64::LDURXi: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = 2; + IsPrePost = false; + break; + + case AArch64::LDRBBpost: + case AArch64::LDRBBpre: + case AArch64::LDRBpost: + case AArch64::LDRBpre: + case AArch64::LDRDpost: + case AArch64::LDRDpre: + case AArch64::LDRHHpost: + case AArch64::LDRHHpre: + case AArch64::LDRHpost: + case AArch64::LDRHpre: + case AArch64::LDRQpost: + case AArch64::LDRQpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpost: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpost: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpost: + case AArch64::LDRSWpre: + case AArch64::LDRSpost: + case AArch64::LDRSpre: + case AArch64::LDRWpost: + case AArch64::LDRWpre: + case AArch64::LDRXpost: + case AArch64::LDRXpre: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = true; + break; + + case AArch64::LDPDi: + case AArch64::LDPQi: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPSWi: + case AArch64::LDPSi: + case AArch64::LDPWi: + case AArch64::LDPXi: + DestRegIdx = 0; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPQpost: + case AArch64::LDPQpre: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = true; + break; + + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPSWpost: + case AArch64::LDPSWpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: + DestRegIdx = 1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = 
true; + break; + } + + LoadInfo LI; + LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg(); + LI.BaseReg = MI.getOperand(BaseRegIdx).getReg(); + LI.BaseRegIdx = BaseRegIdx; + LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx); + LI.IsPrePost = IsPrePost; + return LI; +} + +static Optional<unsigned> getTag(const TargetRegisterInfo *TRI, + const MachineInstr &MI, const LoadInfo &LI) { + unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0; + unsigned Base = TRI->getEncodingValue(LI.BaseReg); + unsigned Off; + if (LI.OffsetOpnd == nullptr) + Off = 0; + else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() || + LI.OffsetOpnd->isCPI()) + return None; + else if (LI.OffsetOpnd->isReg()) + Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg()); + else + Off = LI.OffsetOpnd->getImm() >> 2; + + return makeTag(Dest, Base, Off); +} + +void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { + // Build the initial tag map for the whole loop. + TagMap.clear(); + for (MachineBasicBlock *MBB : L.getBlocks()) + for (MachineInstr &MI : *MBB) { + Optional<LoadInfo> LInfo = getLoadInfo(MI); + if (!LInfo) + continue; + Optional<unsigned> Tag = getTag(TRI, MI, *LInfo); + if (!Tag) + continue; + TagMap[*Tag].push_back(&MI); + } + + bool AnyCollisions = false; + for (auto &P : TagMap) { + auto Size = P.second.size(); + if (Size > 1) { + for (auto *MI : P.second) { + if (TII->isStridedAccess(*MI)) { + AnyCollisions = true; + break; + } + } + } + if (AnyCollisions) + break; + } + // Nothing to fix. + if (!AnyCollisions) + return; + + MachineRegisterInfo &MRI = Fn.getRegInfo(); + + // Go through all the basic blocks in the current loop and fix any streaming + // loads to avoid collisions with any other loads. + LiveRegUnits LR(*TRI); + for (MachineBasicBlock *MBB : L.getBlocks()) { + LR.clear(); + LR.addLiveOuts(*MBB); + for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) { + MachineInstr &MI = *I; + if (!TII->isStridedAccess(MI)) + continue; + + LoadInfo LdI = *getLoadInfo(MI); + unsigned OldTag = *getTag(TRI, MI, LdI); + auto &OldCollisions = TagMap[OldTag]; + if (OldCollisions.size() <= 1) + continue; + + bool Fixed = false; + DEBUG(dbgs() << "Attempting to fix tag collision: " << MI); + + for (unsigned ScratchReg : AArch64::GPR64RegClass) { + if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg)) + continue; + + LoadInfo NewLdI(LdI); + NewLdI.BaseReg = ScratchReg; + unsigned NewTag = *getTag(TRI, MI, NewLdI); + // Scratch reg tag would collide too, so don't use it. + if (TagMap.count(NewTag)) + continue; + + DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI) + << '\n'); + + // Rewrite: + // Xd = LOAD Xb, off + // to: + // Xc = MOV Xb + // Xd = LOAD Xc, off + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg) + .addReg(AArch64::XZR) + .addReg(LdI.BaseReg) + .addImm(0); + MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx); + BaseOpnd.setReg(ScratchReg); + + // If the load does a pre/post increment, then insert a MOV after as + // well to update the real base register. + if (LdI.IsPrePost) { + DEBUG(dbgs() << "Doing post MOV of incremented reg: " + << PrintReg(ScratchReg, TRI) << '\n'); + MI.getOperand(0).setReg( + ScratchReg); // Change tied operand pre/post update dest. 
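[Illustrative aside, not part of the patch] To make the collision fix concrete, here is a stand-alone copy of makeTag from this file together with an invented example: two loads whose destination and base registers share their low four encoding bits hash to the same prefetcher tag, and copying the base into a scratch register (the ORRXrs inserted above, i.e. a plain mov) changes the tag:

#include <cstdio>

// Same bit layout as makeTag above: 4 bits of the destination register
// encoding, 4 bits of the base register encoding, 6 bits from the offset.
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
}

int main() {
  // Hypothetical loads:  ldur x1, [x2, #8]   and   ldur x17, [x18, #8].
  // x1/x17 and x2/x18 share their low 4 encoding bits, so the tags collide.
  unsigned T1 = makeTag(/*Dest=*/1, /*Base=*/2, /*Off=*/8 >> 2);
  unsigned T2 = makeTag(/*Dest=*/17, /*Base=*/18, /*Off=*/8 >> 2);
  std::printf("collide: %s\n", T1 == T2 ? "yes" : "no");       // yes

  // After "mov x3, x18" and rebasing the second load on x3, the tag differs:
  unsigned T3 = makeTag(/*Dest=*/17, /*Base=*/3, /*Off=*/8 >> 2);
  std::printf("still collide: %s\n", T1 == T3 ? "yes" : "no"); // no
}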
+ BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL, + TII->get(AArch64::ORRXrs), LdI.BaseReg) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0); + } + + for (int I = 0, E = OldCollisions.size(); I != E; ++I) + if (OldCollisions[I] == &MI) { + std::swap(OldCollisions[I], OldCollisions[E - 1]); + OldCollisions.pop_back(); + break; + } + + // Update TagMap to reflect instruction changes to reduce the number + // of later MOVs to be inserted. This needs to be done after + // OldCollisions is updated since it may be relocated by this + // insertion. + TagMap[NewTag].push_back(&MI); + ++NumCollisionsAvoided; + Fixed = true; + Modified = true; + break; + } + if (!Fixed) + ++NumCollisionsNotAvoided; + } + } +} + +bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { + auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + if (ST.getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(*Fn.getFunction())) + return false; + + TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + + assert(TRI->trackLivenessAfterRegAlloc(Fn) && + "Register liveness not available!"); + + MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>(); + + Modified = false; + + for (MachineLoop *I : LI) + for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + // Only process inner-loops + if (L->empty()) + runOnLoop(**L, Fn); + + return Modified; +} + +FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp index fe2c2d4..9739605 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -15,28 +15,62 @@ #include "AArch64.h" #include "AArch64CallingConvention.h" +#include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCInstrDesc.h" 
+#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> + using namespace llvm; namespace { @@ -50,48 +84,55 @@ class AArch64FastISel final : public FastISel { } BaseKind; private: - BaseKind Kind; - AArch64_AM::ShiftExtendType ExtType; + BaseKind Kind = RegBase; + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::InvalidShiftExtend; union { unsigned Reg; int FI; } Base; - unsigned OffsetReg; - unsigned Shift; - int64_t Offset; - const GlobalValue *GV; + unsigned OffsetReg = 0; + unsigned Shift = 0; + int64_t Offset = 0; + const GlobalValue *GV = nullptr; public: - Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend), - OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; } + Address() { Base.Reg = 0; } + void setKind(BaseKind K) { Kind = K; } BaseKind getKind() const { return Kind; } void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; } AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; } bool isRegBase() const { return Kind == RegBase; } bool isFIBase() const { return Kind == FrameIndexBase; } + void setReg(unsigned Reg) { assert(isRegBase() && "Invalid base register access!"); Base.Reg = Reg; } + unsigned getReg() const { assert(isRegBase() && "Invalid base register access!"); return Base.Reg; } + void setOffsetReg(unsigned Reg) { OffsetReg = Reg; } + unsigned getOffsetReg() const { return OffsetReg; } + void setFI(unsigned FI) { assert(isFIBase() && "Invalid base frame index access!"); Base.FI = FI; } + unsigned getFI() const { assert(isFIBase() && "Invalid base frame index access!"); return Base.FI; } + void setOffset(int64_t O) { Offset = O; } int64_t getOffset() { return Offset; } void setShift(unsigned S) { Shift = S; } @@ -417,7 +458,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { // MachO still uses GOT for large code-model accesses, but ELF requires // movz/movk sequences, which FastISel doesn't handle yet. - if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO()) + if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO()) return 0; unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); @@ -531,23 +572,23 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) switch (Opcode) { default: break; - case Instruction::BitCast: { + case Instruction::BitCast: // Look through bitcasts. return computeAddress(U->getOperand(0), Addr, Ty); - } - case Instruction::IntToPtr: { + + case Instruction::IntToPtr: // Look past no-op inttoptrs. if (TLI.getValueType(DL, U->getOperand(0)->getType()) == TLI.getPointerTy(DL)) return computeAddress(U->getOperand(0), Addr, Ty); break; - } - case Instruction::PtrToInt: { + + case Instruction::PtrToInt: // Look past no-op ptrtoints. 
if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return computeAddress(U->getOperand(0), Addr, Ty); break; - } + case Instruction::GetElementPtr: { Address SavedAddr = Addr; uint64_t TmpOffset = Addr.getOffset(); @@ -563,7 +604,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) TmpOffset += SL->getElementOffset(Idx); } else { uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); - for (;;) { + while (true) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. TmpOffset += CI->getSExtValue() * S; @@ -1241,6 +1282,10 @@ unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, bool WantResult) { assert(LHSReg && RHSReg && "Invalid register number."); + if (LHSReg == AArch64::SP || LHSReg == AArch64::WSP || + RHSReg == AArch64::SP || RHSReg == AArch64::WSP) + return 0; + if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; @@ -1321,6 +1366,8 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, uint64_t ShiftImm, bool SetFlags, bool WantResult) { assert(LHSReg && RHSReg && "Invalid register number."); + assert(LHSReg != AArch64::SP && LHSReg != AArch64::WSP && + RHSReg != AArch64::SP && RHSReg != AArch64::WSP); if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; @@ -1362,6 +1409,8 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, uint64_t ShiftImm, bool SetFlags, bool WantResult) { assert(LHSReg && RHSReg && "Invalid register number."); + assert(LHSReg != AArch64::XZR && LHSReg != AArch64::WZR && + RHSReg != AArch64::XZR && RHSReg != AArch64::WZR); if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; @@ -2065,7 +2114,7 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type."); - case MVT::i1: VTIsi1 = true; + case MVT::i1: VTIsi1 = true; LLVM_FALLTHROUGH; case MVT::i8: Opc = OpcTable[Idx][0]; break; case MVT::i16: Opc = OpcTable[Idx][1]; break; case MVT::i32: Opc = OpcTable[Idx][2]; break; @@ -2786,7 +2835,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) { return false; EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); - if (SrcVT == MVT::f128) + if (SrcVT == MVT::f128 || SrcVT == MVT::f16) return false; unsigned Opc; @@ -2813,8 +2862,12 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { MVT DestVT; if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) return false; - assert ((DestVT == MVT::f32 || DestVT == MVT::f64) && - "Unexpected value type."); + // Let regular ISEL handle FP16 + if (DestVT == MVT::f16) + return false; + + assert((DestVT == MVT::f32 || DestVT == MVT::f64) && + "Unexpected value type."); unsigned SrcReg = getRegForValue(I->getOperand(0)); if (!SrcReg) @@ -2866,16 +2919,13 @@ bool AArch64FastISel::fastLowerArguments() { // Only handle simple cases of up to 8 GPR and FPR each. unsigned GPRCnt = 0; unsigned FPRCnt = 0; - unsigned Idx = 0; for (auto const &Arg : F->args()) { - // The first argument is at index 1. 
- ++Idx; - if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || - F->getAttributes().hasAttribute(Idx, Attribute::InReg) || - F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || - F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) || - F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) || - F->getAttributes().hasAttribute(Idx, Attribute::Nest)) + if (Arg.hasAttribute(Attribute::ByVal) || + Arg.hasAttribute(Attribute::InReg) || + Arg.hasAttribute(Attribute::StructRet) || + Arg.hasAttribute(Attribute::SwiftSelf) || + Arg.hasAttribute(Attribute::SwiftError) || + Arg.hasAttribute(Attribute::Nest)) return false; Type *ArgTy = Arg.getType(); @@ -2976,7 +3026,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Process the args. for (CCValAssign &VA : ArgLocs) { @@ -3106,8 +3156,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; CodeModel::Model CM = TM.getCodeModel(); - // Only support the small and large code model. - if (CM != CodeModel::Small && CM != CodeModel::Large) + // Only support the small-addressing and large code models. + if (CM != CodeModel::Large && !Subtarget->useSmallAddressing()) return false; // FIXME: Add large code model support for ELF. @@ -3158,7 +3208,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Issue the call. MachineInstrBuilder MIB; - if (CM == CodeModel::Small) { + if (Subtarget->useSmallAddressing()) { const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II); if (Symbol) @@ -3369,8 +3419,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo(); MFI.setFrameAddressIsTaken(true); - const AArch64RegisterInfo *RegInfo = - static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo()); + const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3521,11 +3570,11 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { updateValueMap(II, ResultReg); return true; } - case Intrinsic::trap: { + case Intrinsic::trap: BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) .addImm(1); return true; - } + case Intrinsic::sqrt: { Type *RetTy = II->getCalledFunction()->getReturnType(); @@ -5089,11 +5138,14 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectOperator(I, I->getOpcode()); // Silence warnings. 
(void)&CC_AArch64_DarwinPCS_VarArg; + (void)&CC_AArch64_Win64_VarArg; } namespace llvm { -llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo, + +FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) { return new AArch64FastISel(FuncInfo, LibInfo); } -} + +} // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index f5b8c35..7c6a999 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -41,6 +41,10 @@ // | | // |-----------------------------------| // | | +// | (Win64 only) varargs from reg | +// | | +// |-----------------------------------| +// | | // | prev_fp, prev_lr | // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) @@ -90,21 +94,42 @@ #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <vector> using namespace llvm; @@ -116,6 +141,34 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone", STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); +/// Look at each instruction that references stack frames and return the stack +/// size limit beyond which some of these instructions will require a scratch +/// register during their expansion later. +static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { + // FIXME: For now, just conservatively guestimate based on unscaled indexing + // range. We'll end up allocating an unnecessary spill slot a lot, but + // realistically that's not a big deal at this stage of the game. 
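As an aside on where the conservative limit in estimateRSStackSizeLimit comes from: the unscaled load/store forms (LDUR/STUR) take a signed 9-bit byte offset, so 255 is the largest positive offset that never needs a scratch register. A tiny standalone check, not taken from the patch:

#include <cassert>

// Signed 9-bit immediate range of the AArch64 unscaled load/store encodings.
static bool fitsUnscaledOffset(int Offset) {
  return Offset >= -256 && Offset <= 255;
}

int main() {
  assert(fitsUnscaledOffset(255) && !fitsUnscaledOffset(256));
  assert(fitsUnscaledOffset(-256) && !fitsUnscaledOffset(-257));
  return 0;
}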
+ for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.isDebugValue() || MI.isPseudo() || + MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::ADDSXri) + continue; + + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isFI()) + continue; + + int Offset = 0; + if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) == + AArch64FrameOffsetCannotUpdate) + return 0; + } + } + } + return 255; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -245,14 +298,13 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { if (&MF->front() == MBB) return AArch64::X9; - const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); - LivePhysRegs LiveRegs(&TRI); + const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); + LivePhysRegs LiveRegs(TRI); LiveRegs.addLiveIns(*MBB); // Mark callee saved registers as used so we will not choose them. - const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); - const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF); + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); for (unsigned i = 0; CSRegs[i]; ++i) LiveRegs.addReg(CSRegs[i]); @@ -319,7 +371,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) { - unsigned NewOpc; bool NewIsUnscaled = false; switch (MBBI->getOpcode()) { @@ -362,7 +413,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( unsigned OpndIdx = 0; for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd; ++OpndIdx) - MIB.addOperand(MBBI->getOperand(OpndIdx)); + MIB.add(MBBI->getOperand(OpndIdx)); assert(MBBI->getOperand(OpndIdx).getImm() == 0 && "Unexpected immediate offset in first/last callee-save save/restore " @@ -455,19 +506,23 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, return; } - auto CSStackSize = AFI->getCalleeSavedStackSize(); + bool IsWin64 = + Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + + auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes - CSStackSize); + AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); NumBytes = 0; - } else if (CSStackSize != 0) { + } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII, - -CSStackSize); - NumBytes -= CSStackSize; + -PrologueSaveSize); + NumBytes -= PrologueSaveSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); @@ -481,8 +536,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; } if (HasFP) { - // Only set up FP if we actually need to. Frame pointer is fp = sp - 16. - int FPOffset = CSStackSize - 16; + // Only set up FP if we actually need to. Frame pointer is fp = + // sp - fixedobject - 16. 
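A minimal standalone sketch of the Win64 frame arithmetic these prologue changes introduce, assuming only the rounding helper shown; the concrete sizes are an illustrative example, not values from the patch:

#include <cassert>
#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  const bool IsWin64 = true;
  const uint64_t VarArgsGPRSize = 24;      // e.g. three unnamed GPR args spilled
  const uint64_t CalleeSavedStackSize = 32;

  // The GPR varargs save area becomes a 16-byte-aligned fixed object that sits
  // on top of the callee-save area, so the first SP adjustment covers both.
  uint64_t FixedObject = IsWin64 ? alignTo(VarArgsGPRSize, 16) : 0;
  uint64_t PrologueSaveSize = CalleeSavedStackSize + FixedObject;
  assert(FixedObject == 32 && PrologueSaveSize == 64);

  // The frame pointer still points at the frame record: fp is set at
  // CalleeSavedStackSize - 16 above the adjusted SP, which is
  // incoming_sp - FixedObject - 16.
  assert(PrologueSaveSize - (CalleeSavedStackSize - 16) == FixedObject + 16);
  return 0;
}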
+ int FPOffset = AFI->getCalleeSavedStackSize() - 16; if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -621,8 +677,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (HasFP) { // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( + nullptr, Reg, 2 * StackGrowth - FixedObject)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -708,12 +764,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps // it as the 2nd argument of AArch64ISD::TC_RETURN. - auto CSStackSize = AFI->getCalleeSavedStackSize(); + bool IsWin64 = + Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + + auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); - if (!CombineSPBump && CSStackSize != 0) + if (!CombineSPBump && PrologueSaveSize != 0) convertCalleeSaveRestoreToSPPrePostIncDec( - MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize); + MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize); // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); @@ -735,7 +795,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, return; } - NumBytes -= CSStackSize; + NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { @@ -745,7 +805,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, if (RedZone && ArgumentPopSize == 0) return; - bool NoCalleeSaveRestore = CSStackSize == 0; + bool NoCalleeSaveRestore = PrologueSaveSize == 0; int StackRestoreBytes = RedZone ? 0 : NumBytes; if (NoCalleeSaveRestore) StackRestoreBytes += ArgumentPopSize; @@ -764,7 +824,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // be able to save any instructions. if (MFI.hasVarSizedObjects() || AFI->isStackRealigned()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -CSStackSize + 16, TII, MachineInstr::FrameDestroy); + -AFI->getCalleeSavedStackSize() + 16, TII, + MachineInstr::FrameDestroy); else if (NumBytes) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, MachineInstr::FrameDestroy); @@ -794,7 +855,11 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - int FPOffset = MFI.getObjectOffset(FI) + 16; + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + bool IsWin64 = + Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + unsigned FixedObject = IsWin64 ? 
alignTo(AFI->getVarArgsGPRSize(), 16) : 0; + int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16; int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize(); bool isFixed = MFI.isFixedObjectIndex(FI); @@ -863,22 +928,26 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { static bool produceCompactUnwindFrame(MachineFunction &MF) { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - AttributeSet Attrs = MF.getFunction()->getAttributes(); + AttributeList Attrs = MF.getFunction()->getAttributes(); return Subtarget.isTargetMachO() && !(Subtarget.getTargetLowering()->supportSwiftError() && Attrs.hasAttrSomewhere(Attribute::SwiftError)); } namespace { + struct RegPairInfo { - RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {} - unsigned Reg1; - unsigned Reg2; + unsigned Reg1 = AArch64::NoRegister; + unsigned Reg2 = AArch64::NoRegister; int FrameIdx; int Offset; bool IsGPR; + + RegPairInfo() = default; + bool isPaired() const { return Reg2 != AArch64::NoRegister; } }; + } // end anonymous namespace static void computeCalleeSaveRegisterPairs( @@ -899,7 +968,7 @@ static void computeCalleeSaveRegisterPairs( CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); - unsigned Offset = AFI->getCalleeSavedStackSize(); + int Offset = AFI->getCalleeSavedStackSize(); for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; @@ -968,6 +1037,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( SmallVector<RegPairInfo, 8> RegPairs; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; ++RPII) { @@ -999,9 +1069,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( dbgs() << ")\n"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - MBB.addLiveIn(Reg1); + if (!MRI.isReserved(Reg1)) + MBB.addLiveIn(Reg1); if (RPI.isPaired()) { - MBB.addLiveIn(Reg2); + if (!MRI.isReserved(Reg2)) + MBB.addLiveIn(Reg2); MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), @@ -1102,7 +1174,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (RegInfo->hasBasePointer(MF)) BasePointerReg = RegInfo->getBaseRegister(); - bool ExtraCSSpill = false; + unsigned ExtraCSSpill = 0; const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { @@ -1130,13 +1202,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && !RegInfo->isReservedReg(MF, PairedReg)) - ExtraCSSpill = true; + ExtraCSSpill = PairedReg; } } DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; - for (int Reg = SavedRegs.find_first(); Reg != -1; - Reg = SavedRegs.find_next(Reg)) + for (unsigned Reg : SavedRegs.set_bits()) dbgs() << ' ' << PrintReg(Reg, RegInfo); dbgs() << "\n";); @@ -1144,16 +1215,13 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned NumRegsSpilled = SavedRegs.count(); bool CanEliminateFrame = NumRegsSpilled == 0; - // FIXME: Set BigStack if any stack slot references may be out of range. - // For now, just conservatively guestimate based on unscaled indexing - // range. 
We'll end up allocating an unnecessary spill slot a lot, but - // realistically that's not a big deal at this stage of the game. // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled; DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); - bool BigStack = (CFSize >= 256); + unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); + bool BigStack = (CFSize > EstimatedStackSizeLimit); if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); @@ -1163,8 +1231,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // register scavenging. If we already spilled an extra callee-saved register // above to keep the number of spills even, we don't need to do anything else // here. - if (BigStack && !ExtraCSSpill) { - if (UnspilledCSGPR != AArch64::NoRegister) { + if (BigStack) { + if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) << " to get a scratch register.\n"); SavedRegs.set(UnspilledCSGPR); @@ -1173,15 +1241,18 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // store the pair. if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); - ExtraCSSpill = true; + ExtraCSSpill = UnspilledCSGPRPaired; NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create // an emergency spill slot. - if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &AArch64::GPR64RegClass; - int FI = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), false); + if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass &RC = AArch64::GPR64RegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + int FI = MFI.CreateStackObject(Size, Align, false); RS->addScavengingFrameIndex(FI); DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def index d472a54..8b1c974 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def +++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def @@ -16,281 +16,198 @@ #endif namespace llvm { -namespace AArch64 { - -const uint32_t GPRCoverageData[] = { - // Classes 0-31 - (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) | - (1u << AArch64::GPR32spRegClassID) | - (1u << AArch64::GPR32commonRegClassID) | - (1u << AArch64::GPR32sponlyRegClassID) | - (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) | - (1u << AArch64::GPR64spRegClassID) | - (1u << AArch64::GPR64commonRegClassID) | - (1u << AArch64::tcGPR64RegClassID) | - (1u << AArch64::GPR64sponlyRegClassID), - // Classes 32-63 - 0, - // FIXME: The entries below this point can be safely removed once this is - // tablegenerated. It's only needed because of the hardcoded register class - // limit. 
- // Classes 64-96 - 0, - // Classes 97-128 - 0, - // Classes 129-160 - 0, - // Classes 161-192 - 0, - // Classes 193-224 - 0, -}; - -const uint32_t FPRCoverageData[] = { - // Classes 0-31 - (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) | - (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) | - (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) | - (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) | - (1u << AArch64::DDDDRegClassID), - // Classes 32-63 - (1u << (AArch64::QQRegClassID - 32)) | - (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) | - (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) | - (1u - << (AArch64:: - QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - - 32)) | - (1u - << (AArch64:: - QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID - - 32)) | - (1u << (AArch64::QQQQRegClassID - 32)) | - (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | - (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | - (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | - (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) | - (1u - << (AArch64:: - QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID - - 32)) | - (1u - << (AArch64:: - QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - - 32)) | - (1u - << (AArch64:: - QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - - 32)) | - (1u - << (AArch64:: - QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID - - 32)) | - (1u - << (AArch64:: - QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - - 32)) | - (1u - << (AArch64:: - QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID - - 32)) | - (1u - << (AArch64:: - QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID - - 32)) | - (1u << (AArch64::QQQRegClassID - 32)) | - (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) | - (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) | - (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) | - (1u - << (AArch64:: - QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID - - 32)), - // FIXME: The entries below this point can be safely removed once this - // is tablegenerated. It's only needed because of the hardcoded register - // class limit. - // Classes 64-96 - 0, - // Classes 97-128 - 0, - // Classes 129-160 - 0, - // Classes 161-192 - 0, - // Classes 193-224 - 0, -}; - -const uint32_t CCRCoverageData[] = { - // Classes 0-31 - 1u << AArch64::CCRRegClassID, - // Classes 32-63 - 0, - // FIXME: The entries below this point can be safely removed once this - // is tablegenerated. It's only needed because of the hardcoded register - // class limit. - // Classes 64-96 - 0, - // Classes 97-128 - 0, - // Classes 129-160 - 0, - // Classes 161-192 - 0, - // Classes 193-224 - 0, -}; - -RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData); -RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData); -RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData); - -RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank}; - -// PartialMappings. 
-enum PartialMappingIdx { - PMI_None = -1, - PMI_GPR32 = 1, - PMI_GPR64, - PMI_FPR32, - PMI_FPR64, - PMI_FPR128, - PMI_FPR256, - PMI_FPR512, - PMI_FirstGPR = PMI_GPR32, - PMI_LastGPR = PMI_GPR64, - PMI_FirstFPR = PMI_FPR32, - PMI_LastFPR = PMI_FPR512, - PMI_Min = PMI_FirstGPR, -}; - -static unsigned getRegBankBaseIdxOffset(unsigned Size) { - assert(Size && "0-sized type!!"); - // Make anything smaller than 32 gets 32 - Size = ((Size + 31) / 32) * 32; - // 32 is 0, 64 is 1, 128 is 2, and so on. - return Log2_32(Size) - /*Log2_32(32)=*/ 5; -} - -RegisterBankInfo::PartialMapping PartMappings[] { - /* StartIdx, Length, RegBank */ - // 0: GPR 32-bit value. - {0, 32, GPRRegBank}, - // 1: GPR 64-bit value. - {0, 64, GPRRegBank}, - // 2: FPR 32-bit value. - {0, 32, FPRRegBank}, - // 3: FPR 64-bit value. - {0, 64, FPRRegBank}, - // 4: FPR 128-bit value. - {0, 128, FPRRegBank}, - // 5: FPR 256-bit value. - {0, 256, FPRRegBank}, - // 6: FPR 512-bit value. - {0, 512, FPRRegBank} -}; - -enum ValueMappingIdx { - First3OpsIdx = 0, - Last3OpsIdx = 18, - DistanceBetweenRegBanks = 3, - FirstCrossRegCpyIdx = 21, - LastCrossRegCpyIdx = 27, - DistanceBetweenCrossRegCpy = 2 +RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{ + /* StartIdx, Length, RegBank */ + // 0: FPR 32-bit value. + {0, 32, AArch64::FPRRegBank}, + // 1: FPR 64-bit value. + {0, 64, AArch64::FPRRegBank}, + // 2: FPR 128-bit value. + {0, 128, AArch64::FPRRegBank}, + // 3: FPR 256-bit value. + {0, 256, AArch64::FPRRegBank}, + // 4: FPR 512-bit value. + {0, 512, AArch64::FPRRegBank}, + // 5: GPR 32-bit value. + {0, 32, AArch64::GPRRegBank}, + // 6: GPR 64-bit value. + {0, 64, AArch64::GPRRegBank}, }; // ValueMappings. -RegisterBankInfo::ValueMapping ValMappings[]{ +RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{ /* BreakDown, NumBreakDowns */ + // 0: invalid + {nullptr, 0}, // 3-operands instructions (all binary operations should end up with one of // those mapping). - // 0: GPR 32-bit value. <-- This must match First3OpsIdx. - {&PartMappings[PMI_GPR32 - PMI_Min], 1}, - {&PartMappings[PMI_GPR32 - PMI_Min], 1}, - {&PartMappings[PMI_GPR32 - PMI_Min], 1}, - // 3: GPR 64-bit value. - {&PartMappings[PMI_GPR64 - PMI_Min], 1}, - {&PartMappings[PMI_GPR64 - PMI_Min], 1}, - {&PartMappings[PMI_GPR64 - PMI_Min], 1}, - // 6: FPR 32-bit value. - {&PartMappings[PMI_FPR32 - PMI_Min], 1}, - {&PartMappings[PMI_FPR32 - PMI_Min], 1}, - {&PartMappings[PMI_FPR32 - PMI_Min], 1}, - // 9: FPR 64-bit value. - {&PartMappings[PMI_FPR64 - PMI_Min], 1}, - {&PartMappings[PMI_FPR64 - PMI_Min], 1}, - {&PartMappings[PMI_FPR64 - PMI_Min], 1}, - // 12: FPR 128-bit value. - {&PartMappings[PMI_FPR128 - PMI_Min], 1}, - {&PartMappings[PMI_FPR128 - PMI_Min], 1}, - {&PartMappings[PMI_FPR128 - PMI_Min], 1}, - // 15: FPR 256-bit value. - {&PartMappings[PMI_FPR256 - PMI_Min], 1}, - {&PartMappings[PMI_FPR256 - PMI_Min], 1}, - {&PartMappings[PMI_FPR256 - PMI_Min], 1}, - // 18: FPR 512-bit value. <-- This must match Last3OpsIdx. - {&PartMappings[PMI_FPR512 - PMI_Min], 1}, - {&PartMappings[PMI_FPR512 - PMI_Min], 1}, - {&PartMappings[PMI_FPR512 - PMI_Min], 1}, + // 1: FPR 32-bit value. <-- This must match First3OpsIdx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + // 4: FPR 64-bit value. 
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + // 7: FPR 128-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + // 10: FPR 256-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, + // 13: FPR 512-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, + // 16: GPR 32-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + // 19: GPR 64-bit value. <-- This must match Last3OpsIdx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, // Cross register bank copies. - // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match + // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match // FirstCrossRegCpyIdx. - {&PartMappings[PMI_GPR32 - PMI_Min], 1}, - {&PartMappings[PMI_FPR32 - PMI_Min], 1}, - // 23: GPR 64-bit value to FPR 64-bit value. - {&PartMappings[PMI_GPR64 - PMI_Min], 1}, - {&PartMappings[PMI_FPR64 - PMI_Min], 1}, - // 25: FPR 32-bit value to GPR 32-bit value. - {&PartMappings[PMI_FPR32 - PMI_Min], 1}, - {&PartMappings[PMI_GPR32 - PMI_Min], 1}, - // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + // 24: FPR 64-bit value to GPR 64-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + // 26: FPR 128-bit value to GPR 128-bit value (invalid) + {nullptr, 1}, + {nullptr, 1}, + // 28: FPR 256-bit value to GPR 256-bit value (invalid) + {nullptr, 1}, + {nullptr, 1}, + // 30: FPR 512-bit value to GPR 512-bit value (invalid) + {nullptr, 1}, + {nullptr, 1}, + // 32: GPR 32-bit value to FPR 32-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match // LastCrossRegCpyIdx. - {&PartMappings[PMI_FPR64 - PMI_Min], 1}, - {&PartMappings[PMI_GPR64 - PMI_Min], 1} + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, }; -/// Get the pointer to the ValueMapping representing the RegisterBank -/// at \p RBIdx with a size of \p Size. -/// -/// The returned mapping works for instructions with the same kind of -/// operands for up to 3 operands. 
-/// -/// \pre \p RBIdx != PartialMappingIdx::None +bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx, + unsigned ValStartIdx, + unsigned ValLength, + const RegisterBank &RB) { + const PartialMapping &Map = PartMappings[Idx - PartialMappingIdx::PMI_Min]; + return Map.StartIdx == ValStartIdx && Map.Length == ValLength && + Map.RegBank == &RB; +} + +bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx, + unsigned FirstInBank, + unsigned Size, + unsigned Offset) { + unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min; + const ValueMapping &Map = + AArch64GenRegisterBankInfo::getValueMapping((PartialMappingIdx)FirstInBank, Size)[Offset]; + return Map.BreakDown == &PartMappings[PartialMapBaseIdx] && + Map.NumBreakDowns == 1; +} + +bool AArch64GenRegisterBankInfo::checkPartialMappingIdx( + PartialMappingIdx FirstAlias, PartialMappingIdx LastAlias, + ArrayRef<PartialMappingIdx> Order) { + if (Order.front() != FirstAlias) + return false; + if (Order.back() != LastAlias) + return false; + if (Order.front() > Order.back()) + return false; + + PartialMappingIdx Previous = Order.front(); + bool First = true; + for (const auto &Current : Order) { + if (First) { + First = false; + continue; + } + if (Previous + 1 != Current) + return false; + Previous = Current; + } + return true; +} + +unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx, + unsigned Size) { + if (RBIdx == PMI_FirstGPR) { + if (Size <= 32) + return 0; + if (Size <= 64) + return 1; + return -1; + } + if (RBIdx == PMI_FirstFPR) { + if (Size <= 32) + return 0; + if (Size <= 64) + return 1; + if (Size <= 128) + return 2; + if (Size <= 256) + return 3; + if (Size <= 512) + return 4; + return -1; + } + return -1; +} + const RegisterBankInfo::ValueMapping * -getValueMapping(PartialMappingIdx RBIdx, unsigned Size) { +AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx, + unsigned Size) { assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that"); - unsigned ValMappingIdx = First3OpsIdx + - (RBIdx - AArch64::PartialMappingIdx::PMI_Min + - getRegBankBaseIdxOffset(Size)) * - ValueMappingIdx::DistanceBetweenRegBanks; - assert(ValMappingIdx >= AArch64::First3OpsIdx && - ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound"); + unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size); + if (BaseIdxOffset == -1u) + return &ValMappings[InvalidIdx]; + + unsigned ValMappingIdx = + First3OpsIdx + (RBIdx - PartialMappingIdx::PMI_Min + BaseIdxOffset) * + ValueMappingIdx::DistanceBetweenRegBanks; + assert(ValMappingIdx >= First3OpsIdx && ValMappingIdx <= Last3OpsIdx && + "Mapping out of bound"); return &ValMappings[ValMappingIdx]; } -/// Get the pointer to the ValueMapping of the operands of a copy -/// instruction from a GPR or FPR register to a GPR or FPR register -/// with a size of \p Size. -/// -/// If \p DstIsGPR is true, the destination of the copy is on GPR, -/// otherwise it is on FPR. Same thing for \p SrcIsGPR. +AArch64GenRegisterBankInfo::PartialMappingIdx + AArch64GenRegisterBankInfo::BankIDToCopyMapIdx[]{ + PMI_None, // CCR + PMI_FirstFPR, // FPR + PMI_FirstGPR, // GPR + }; + const RegisterBankInfo::ValueMapping * -getCopyMapping(bool DstIsGPR, bool SrcIsGPR, unsigned Size) { - PartialMappingIdx DstRBIdx = DstIsGPR ? PMI_FirstGPR : PMI_FirstFPR; - PartialMappingIdx SrcRBIdx = SrcIsGPR ? 
PMI_FirstGPR : PMI_FirstFPR; +AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID, + unsigned SrcBankID, unsigned Size) { + assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID"); + assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID"); + PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID]; + PartialMappingIdx SrcRBIdx = BankIDToCopyMapIdx[SrcBankID]; + assert(DstRBIdx != PMI_None && "No such mapping"); + assert(SrcRBIdx != PMI_None && "No such mapping"); + if (DstRBIdx == SrcRBIdx) return getValueMapping(DstRBIdx, Size); + assert(Size <= 64 && "GPR cannot handle that size"); unsigned ValMappingIdx = FirstCrossRegCpyIdx + - (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(Size)) * + (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(DstRBIdx, Size)) * ValueMappingIdx::DistanceBetweenCrossRegCpy; - assert(ValMappingIdx >= AArch64::FirstCrossRegCpyIdx && - ValMappingIdx <= AArch64::LastCrossRegCpyIdx && - "Mapping out of bound"); + assert(ValMappingIdx >= FirstCrossRegCpyIdx && + ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound"); return &ValMappings[ValMappingIdx]; } - -} // End AArch64 namespace. } // End llvm namespace. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 3099383..06005f6 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -200,7 +201,7 @@ private: bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); - void SelectCMP_SWAP(SDNode *N); + bool SelectCMP_SWAP(SDNode *N); }; } // end anonymous namespace @@ -238,10 +239,17 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: case InlineAsm::Constraint_Q: - // Require the address to be in a register. That is safe for all AArch64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); + // We need to make sure that this one operand does not end up in XZR, thus + // require the address to be in a PointerRegClass register. + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF); + SDLoc dl(Op); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64); + SDValue NewOp = + SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op.getValueType(), + Op, RC), 0); + OutOps.push_back(NewOp); return false; } return true; @@ -328,11 +336,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { } } +/// \brief Determine whether it is worth it to fold SHL into the addressing +/// mode. +static bool isWorthFoldingSHL(SDValue V) { + assert(V.getOpcode() == ISD::SHL && "invalid opcode"); + // It is worth folding logical shift of up to three places. + auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1)); + if (!CSD) + return false; + unsigned ShiftVal = CSD->getZExtValue(); + if (ShiftVal > 3) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. 
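An illustrative aside on the shifts this isWorthFoldingSHL change targets: scaled array indexing is exactly an LSL of 0 to 3, which the register-offset addressing mode can absorb (e.g. "ldr x0, [x1, x2, lsl #3]"), and that is why the cutoff above is a shift value of 3. Function names below are illustrative only.

#include <cstdint>

uint64_t load_elem(const uint64_t *base, uint64_t i) {
  // Address is base + (i << 3); the shift can fold into the load itself.
  return base[i];
}

uint64_t load_sparse(const uint64_t *base, uint64_t i) {
  // Address is base + (i << 4); a shift of 4 exceeds the fold limit, so the
  // address has to be materialized by a separate instruction.
  return base[i * 2];
}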
+ const SDNode *Node = V.getNode(); + for (SDNode *UI : Node->uses()) + if (!isa<MemSDNode>(*UI)) + for (SDNode *UII : UI->uses()) + if (!isa<MemSDNode>(*UII)) + return false; + return true; +} + /// \brief Determine whether it is worth to fold V into an extended register. bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { - // it hurts if the value is used at least twice, unless we are optimizing - // for code size. - return ForCodeSize || V.hasOneUse(); + // Trivial if we are optimizing for code size or if there is only + // one use of the value. + if (ForCodeSize || V.hasOneUse()) + return true; + // If a subtarget has a fastpath LSL we can fold a logical shift into + // the addressing mode and save a cycle. + if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL && + isWorthFoldingSHL(V)) + return true; + if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) { + const SDValue LHS = V.getOperand(0); + const SDValue RHS = V.getOperand(1); + if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS)) + return true; + if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS)) + return true; + } + + // It hurts otherwise, since the value will be reused. + return false; } /// SelectShiftedRegister - Select a "shifted register" operand. If the value @@ -1811,20 +1860,20 @@ static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, OpUsefulBits = 1; if (MSB >= Imm) { - OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + OpUsefulBits <<= MSB - Imm + 1; --OpUsefulBits; // The interesting part will be in the lower part of the result getUsefulBits(Op, OpUsefulBits, Depth + 1); // The interesting part was starting at Imm in the argument - OpUsefulBits = OpUsefulBits.shl(Imm); + OpUsefulBits <<= Imm; } else { - OpUsefulBits = OpUsefulBits.shl(MSB + 1); + OpUsefulBits <<= MSB + 1; --OpUsefulBits; // The interesting part will be shifted in the result - OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); + OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm; getUsefulBits(Op, OpUsefulBits, Depth + 1); // The interesting part was at zero in the argument - OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); + OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm); } UsefulBits &= OpUsefulBits; @@ -1851,17 +1900,17 @@ static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) { // Shift Left uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.shl(ShiftAmt); + Mask <<= ShiftAmt; getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.lshr(ShiftAmt); + Mask.lshrInPlace(ShiftAmt); } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) { // Shift Right // We do not handle AArch64_AM::ASR, because the sign will change the // number of useful bits uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.lshr(ShiftAmt); + Mask.lshrInPlace(ShiftAmt); getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.shl(ShiftAmt); + Mask <<= ShiftAmt; } else return; @@ -1889,13 +1938,13 @@ static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, uint64_t Width = MSB - Imm + 1; uint64_t LSB = Imm; - OpUsefulBits = OpUsefulBits.shl(Width); + OpUsefulBits <<= Width; --OpUsefulBits; if (Op.getOperand(1) == Orig) { // Copy the low bits from the result to bits starting from LSB. 
Mask = ResultUsefulBits & OpUsefulBits; - Mask = Mask.shl(LSB); + Mask <<= LSB; } if (Op.getOperand(0) == Orig) @@ -1906,14 +1955,14 @@ static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, uint64_t Width = MSB + 1; uint64_t LSB = UsefulBits.getBitWidth() - Imm; - OpUsefulBits = OpUsefulBits.shl(Width); + OpUsefulBits <<= Width; --OpUsefulBits; - OpUsefulBits = OpUsefulBits.shl(LSB); + OpUsefulBits <<= LSB; if (Op.getOperand(1) == Orig) { // Copy the bits from the result to the zero bits. Mask = ResultUsefulBits & OpUsefulBits; - Mask = Mask.lshr(LSB); + Mask.lshrInPlace(LSB); } if (Op.getOperand(0) == Orig) @@ -2037,18 +2086,18 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, (void)BitWidth; assert(BitWidth == 32 || BitWidth == 64); - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(Op, KnownZero, KnownOne); + KnownBits Known; + CurDAG->computeKnownBits(Op, Known); // Non-zero in the sense that they're not provably zero, which is the key // point if we want to use this value - uint64_t NonZeroBits = (~KnownZero).getZExtValue(); + uint64_t NonZeroBits = (~Known.Zero).getZExtValue(); // Discard a constant AND mask if present. It's safe because the node will // already have been factored into the computeKnownBits calculation above. uint64_t AndImm; if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { - assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); + assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0); Op = Op.getOperand(0); } @@ -2117,15 +2166,15 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) { // Compute the Known Zero for the AND as this allows us to catch more general // cases than just looking for AND with imm. - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(And, KnownZero, KnownOne); + KnownBits Known; + CurDAG->computeKnownBits(And, Known); // Non-zero in the sense that they're not provably zero, which is the key // point if we want to use this value. - uint64_t NotKnownZero = (~KnownZero).getZExtValue(); + uint64_t NotKnownZero = (~Known.Zero).getZExtValue(); // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). - if (!isShiftedMask(KnownZero.getZExtValue(), VT)) + if (!isShiftedMask(Known.Zero.getZExtValue(), VT)) return false; // The bits being inserted must only set those bits that are known to be zero. @@ -2259,15 +2308,15 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits, // This allows to catch more general case than just looking for // AND with imm. Indeed, simplify-demanded-bits may have removed // the AND instruction because it proves it was useless. - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); + KnownBits Known; + CurDAG->computeKnownBits(OrOpd1Val, Known); // Check if there is enough room for the second operand to appear // in the first one APInt BitsToBeInserted = - APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); + APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width); - if ((BitsToBeInserted & ~KnownZero) != 0) + if ((BitsToBeInserted & ~Known.Zero) != 0) continue; // Set the first operand @@ -2524,7 +2573,7 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { // pstatefield for the MSR (immediate) instruction, we also require that an // immediate value has been provided as an argument, we know that this is // the case as it has been ensured by semantic checking. 
- auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());; + auto PMapper = AArch64PState::lookupPStateByName(RegString->getString()); if (PMapper) { assert (isa<ConstantSDNode>(N->getOperand(2)) && "Expected a constant integer expression."); @@ -2567,9 +2616,13 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { } /// We've got special pseudo-instructions for these -void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { +bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { unsigned Opcode; EVT MemTy = cast<MemSDNode>(N)->getMemoryVT(); + + // Leave IR for LSE if subtarget supports it. + if (Subtarget->hasLSE()) return false; + if (MemTy == MVT::i8) Opcode = AArch64::CMP_SWAP_8; else if (MemTy == MVT::i16) @@ -2595,6 +2648,8 @@ void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); CurDAG->RemoveDeadNode(N); + + return true; } void AArch64DAGToDAGISel::Select(SDNode *Node) { @@ -2618,8 +2673,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { break; case ISD::ATOMIC_CMP_SWAP: - SelectCMP_SWAP(Node); - return; + if (SelectCMP_SWAP(Node)) + return; + break; case ISD::READ_REGISTER: if (tryReadRegister(Node)) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 849058b..9d87988 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" -#include "AArch64ISelLowering.h" #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" @@ -22,13 +22,14 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -50,10 +51,10 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" @@ -66,6 +67,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetCallingConv.h" @@ -90,6 +92,7 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); +STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, @@ -104,6 +107,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); 
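As context for the SelectCMP_SWAP change above, which now declines when the subtarget has LSE: at the source level this is the difference between lowering a compare-exchange to a single CAS-family instruction and expanding it to an exclusive load/store retry loop. A hedged sketch in plain C++, with an illustrative function name:

#include <atomic>

// With an ARMv8.1+LSE subtarget the backend can select a CAS instruction for
// this compare-exchange; without LSE it goes through the CMP_SWAP_* pseudo,
// i.e. an LDAXR/STLXR loop.
bool try_claim(std::atomic<int> &slot) {
  int expected = 0;
  return slot.compare_exchange_strong(expected, 1);
}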
+static cl::opt<bool> +EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, + cl::desc("Enable AArch64 logical imm instruction " + "optimization"), + cl::init(true)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -372,7 +381,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); setOperationAction(ISD::FNEG, MVT::v4f16, Expand); setOperationAction(ISD::FPOW, MVT::v4f16, Expand); - setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); setOperationAction(ISD::FREM, MVT::v4f16, Expand); setOperationAction(ISD::FROUND, MVT::v4f16, Expand); setOperationAction(ISD::FRINT, MVT::v4f16, Expand); @@ -404,7 +412,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); setOperationAction(ISD::FNEG, MVT::v8f16, Expand); setOperationAction(ISD::FPOW, MVT::v8f16, Expand); - setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); setOperationAction(ISD::FREM, MVT::v8f16, Expand); setOperationAction(ISD::FROUND, MVT::v8f16, Expand); setOperationAction(ISD::FRINT, MVT::v8f16, Expand); @@ -544,7 +551,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -554,8 +560,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setSchedulingPreference(Sched::Hybrid); - // Enable TBZ/TBNZ - MaskAndBranchFoldingIsLegal = true; EnableExtLdPromotion = true; // Set required alignment. @@ -652,6 +656,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); + // Vector reductions + for (MVT VT : MVT::integer_valuetypes()) { + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + } + for (MVT VT : MVT::fp_valuetypes()) { + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + } + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled @@ -707,7 +724,6 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); @@ -751,6 +767,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); + if (!VT.isFloatingPoint()) + setOperationAction(ISD::ABS, VT, Legal); + // [SU][MIN|MAX] are available for all NEON types apart from i64. 
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) @@ -788,21 +807,157 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, + const APInt &Demanded, + TargetLowering::TargetLoweringOpt &TLO, + unsigned NewOpc) { + uint64_t OldImm = Imm, NewImm, Enc; + uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; + + // Return if the immediate is already all zeros, all ones, a bimm32 or a + // bimm64. + if (Imm == 0 || Imm == Mask || + AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) + return false; + + unsigned EltSize = Size; + uint64_t DemandedBits = Demanded.getZExtValue(); + + // Clear bits that are not demanded. + Imm &= DemandedBits; + + while (true) { + // The goal here is to set the non-demanded bits in a way that minimizes + // the number of switching between 0 and 1. In order to achieve this goal, + // we set the non-demanded bits to the value of the preceding demanded bits. + // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a + // non-demanded bit), we copy bit0 (1) to the least significant 'x', + // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. + // The final result is 0b11000011. + uint64_t NonDemandedBits = ~DemandedBits; + uint64_t InvertedImm = ~Imm & DemandedBits; + uint64_t RotatedImm = + ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & + NonDemandedBits; + uint64_t Sum = RotatedImm + NonDemandedBits; + bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); + uint64_t Ones = (Sum + Carry) & NonDemandedBits; + NewImm = (Imm | Ones) & Mask; + + // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate + // or all-ones or all-zeros, in which case we can stop searching. Otherwise, + // we halve the element size and continue the search. + if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) + break; + + // We cannot shrink the element size any further if it is 2-bits. + if (EltSize == 2) + return false; + + EltSize /= 2; + Mask >>= EltSize; + uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; + + // Return if there is mismatch in any of the demanded bits of Imm and Hi. + if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) + return false; + + // Merge the upper and lower halves of Imm and DemandedBits. + Imm |= Hi; + DemandedBits |= DemandedBitsHi; + } + + ++NumOptimizedImms; + + // Replicate the element across the register width. + while (EltSize < Size) { + NewImm |= NewImm << EltSize; + EltSize *= 2; + } + + (void)OldImm; + assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && + "demanded bits should never be altered"); + assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); + + // Create the new constant immediate node. + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue New; + + // If the new constant immediate is all-zeros or all-ones, let the target + // independent DAG combine optimize this node. + if (NewImm == 0 || NewImm == OrigMask) { + New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), + TLO.DAG.getConstant(NewImm, DL, VT)); + // Otherwise, create a machine node so that target independent DAG combine + // doesn't undo this optimization. 
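The bit manipulation in optimizeLogicalImm is easiest to trust by replaying the worked example from the comment above (0bx10xx0x1 with the x bits non-demanded) in isolation; this standalone snippet mirrors one iteration of the loop and is not part of the patch.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned EltSize = 8;                // 8-bit element for clarity
  const uint64_t DemandedBits = 0x65;        // 0b01100101: bits 0,2,5,6 demanded
  const uint64_t Imm          = 0x41;        // 0b01000001: their values 1,0,0,1
  const uint64_t Mask = (1ULL << EltSize) - 1;

  // Same sequence of operations as the loop body above: each non-demanded bit
  // is filled from the preceding demanded bit.
  uint64_t NonDemandedBits = ~DemandedBits & Mask;
  uint64_t InvertedImm = ~Imm & DemandedBits;
  uint64_t RotatedImm =
      ((InvertedImm << 1) | ((InvertedImm >> (EltSize - 1)) & 1)) &
      NonDemandedBits;
  uint64_t Sum = RotatedImm + NonDemandedBits;
  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
  uint64_t NewImm = (Imm | Ones) & Mask;

  // 0b11000011, the result quoted in the comment.
  assert(NewImm == 0xC3);
  return 0;
}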
+ } else { + Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); + SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); + New = SDValue( + TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + } + + return TLO.CombineTo(Op, New); +} + +bool AArch64TargetLowering::targetShrinkDemandedConstant( + SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { + // Delay this optimization to as late as possible. + if (!TLO.LegalOps) + return false; + + if (!EnableOptimizeLogicalImm) + return false; + + EVT VT = Op.getValueType(); + if (VT.isVector()) + return false; + + unsigned Size = VT.getSizeInBits(); + assert((Size == 32 || Size == 64) && + "i32 or i64 is expected after legalization."); + + // Exit early if we demand all bits. + if (Demanded.countPopulation() == Size) + return false; + + unsigned NewOpc; + switch (Op.getOpcode()) { + default: + return false; + case ISD::AND: + NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; + break; + case ISD::OR: + NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; + break; + case ISD::XOR: + NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; + break; + } + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!C) + return false; + uint64_t Imm = C->getZExtValue(); + return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); +} + /// computeKnownBitsForTargetNode - Determine which of the bits specified in -/// Mask are known to be either zero or one and return them in the -/// KnownZero/KnownOne bitsets. +/// Mask are known to be either zero or one and return them Known. void AArch64TargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, APInt &KnownZero, APInt &KnownOne, - const SelectionDAG &DAG, unsigned Depth) const { + const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; case AArch64ISD::CSEL: { - APInt KnownZero2, KnownOne2; - DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1); - DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1); - KnownZero &= KnownZero2; - KnownOne &= KnownOne2; + KnownBits Known2; + DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1); + DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1); + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; break; } case ISD::INTRINSIC_W_CHAIN: { @@ -812,10 +967,10 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( default: return; case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { - unsigned BitWidth = KnownOne.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } } @@ -834,15 +989,15 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( // bits larger than the element datatype. 32-bit or larget doesn't need // this as those are legal types and will be handled by isel directly. 
MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); - unsigned BitWidth = KnownZero.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); if (VT == MVT::v8i8 || VT == MVT::v16i8) { assert(BitWidth >= 8 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); - KnownZero |= Mask; + Known.Zero |= Mask; } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { assert(BitWidth >= 16 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); - KnownZero |= Mask; + Known.Zero |= Mask; } break; } break; @@ -2113,8 +2268,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, Entry.Node = Arg; Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); const char *LibcallName = @@ -2122,10 +2277,11 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2231,19 +2387,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::SIGN_EXTEND) - return true; - if (isExtendedBUILD_VECTOR(N, DAG, true)) - return true; - return false; + return N->getOpcode() == ISD::SIGN_EXTEND || + isExtendedBUILD_VECTOR(N, DAG, true); } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { - if (N->getOpcode() == ISD::ZERO_EXTEND) - return true; - if (isExtendedBUILD_VECTOR(N, DAG, false)) - return true; - return false; + return N->getOpcode() == ISD::ZERO_EXTEND || + isExtendedBUILD_VECTOR(N, DAG, false); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { @@ -2347,6 +2497,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } + case Intrinsic::aarch64_neon_abs: + return DAG.getNode(ISD::ABS, dl, Op.getValueType(), + Op.getOperand(1)); case Intrinsic::aarch64_neon_smax: return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); @@ -2465,6 +2618,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerMUL(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + return LowerVECREDUCE(Op, DAG); } } @@ -2489,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: + if (Subtarget->isTargetWindows() && IsVarArg) + return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + case CallingConv::Win64: + return IsVarArg ? 
CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; } } @@ -2507,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; @@ -2663,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { + if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. + // Win64 variadic functions also pass arguments in registers, but all float + // arguments are passed in integer registers. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } @@ -2708,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); auto PtrVT = getPointerTy(DAG.getDataLayout()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); SmallVector<SDValue, 8> MemOps; @@ -2720,7 +2889,13 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + if (IsWin64) { + GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); + if (GPRSaveSize & 15) + // The extra size here, if triggered, will always be 8. + MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); + } else + GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -2729,7 +2904,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); + IsWin64 + ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + GPRIdx, + (i - FirstVariadicGPR) * 8) + : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2738,7 +2917,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); - if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; @@ -3108,9 +3287,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... 
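// Illustrative sketch, not part of this patch: the Win64 varargs path above
// saves the unnamed GPR arguments into a fixed object just below the incoming
// stack pointer and, when an odd number of registers is saved, adds an 8-byte
// pad so the area stays 16-byte aligned. This standalone table assumes the
// eight integer argument registers X0-X7 (NumGPRArgRegs == 8); alignTo16 is an
// ad-hoc stand-in for llvm::alignTo(N, 16).
#include <cstdio>

static unsigned alignTo16(unsigned N) { return (N + 15) & ~15u; }

int main() {
  const unsigned NumGPRArgRegs = 8; // assumed: X0..X7
  for (unsigned FirstVariadicGPR = 0; FirstVariadicGPR < NumGPRArgRegs;
       ++FirstVariadicGPR) {
    unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
    unsigned Pad = (GPRSaveSize & 15) ? 16 - (GPRSaveSize & 15) : 0; // 8 if set
    std::printf("named GPR args=%u  save area=%2u bytes at SP-%-2u  pad=%u at SP-%u\n",
                FirstVariadicGPR, GPRSaveSize, GPRSaveSize, Pad,
                Pad ? alignTo16(GPRSaveSize) : 0);
  }
  return 0;
}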
// These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, - true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); @@ -3245,30 +3422,26 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. - if (getTargetMachine().getCodeModel() == CodeModel::Large && - Subtarget->isTargetMachO()) { - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + auto GV = G->getGlobal(); + if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) == + AArch64II::MO_GOT) { + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } else { const GlobalValue *GV = G->getGlobal(); - bool InternalLinkage = GV->hasInternalLinkage(); - if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); - else { - Callee = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); - Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); - } - } else if (ExternalSymbolSDNode *S = - dyn_cast<ExternalSymbolSDNode>(Callee)) { + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); + } + } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + if (getTargetMachine().getCodeModel() == CodeModel::Large && + Subtarget->isTargetMachO()) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } else { + const char *Sym = S->getSymbol(); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); - } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } // We don't usually want to end the call-sequence here because we would tidy @@ -3428,11 +3601,75 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Other Lowering Code //===----------------------------------------------------------------------===// +SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), + N->getOffset(), Flag); +} + +SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, + SelectionDAG &DAG, + unsigned Flag) const { + return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); +} + +// (loadGOT sym) +template <class NodeTy> +SDValue 
AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. + return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); +} + +// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) +template <class NodeTy> +SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG) + const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, Ty, + getTargetNode(N, Ty, DAG, AArch64II::MO_G3), + getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC), + getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC)); +} + +// (addlow (adrp %hi(sym)) %lo(sym)) +template <class NodeTy> +SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const { + DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); + SDLoc DL(N); + EVT Ty = getPointerTy(DAG.getDataLayout()); + SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE); + SDValue Lo = getTargetNode(N, Ty, DAG, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); +} + SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); - const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); + GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); @@ -3440,32 +3677,15 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && "unexpected offset in global node"); - // This also catched the large code model case for Darwin. + // This also catches the large code model case for Darwin. if ((OpFlags & AArch64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(GN, DAG); } if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(GN, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. 
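// Illustrative sketch, not part of this patch: getAddr() above emits the
// small-code-model ADRP + ADDlow pair, which splits a symbol address into a
// 4KiB page base (materialized PC-relatively by ADRP) and a 12-bit page
// offset (the :lo12: part folded into the ADD). The address value below is a
// made-up example.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Sym = 0x0000000012345ABCULL; // hypothetical symbol address
  uint64_t Page    = Sym & ~0xFFFULL;         // what ADRP materializes
  uint64_t PageOff = Sym &  0xFFFULL;         // what ADDlow adds (:lo12:)
  std::printf("page=0x%llx pageoff=0x%llx sum=0x%llx\n",
              (unsigned long long)Page, (unsigned long long)PageOff,
              (unsigned long long)(Page + PageOff));
  return 0;
}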
- SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | AArch64II::MO_PAGE); - unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(GN, DAG); } } @@ -3578,7 +3798,7 @@ SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small && + assert(Subtarget->useSmallAddressing() && "ELF TLS only supported in small memory model"); // Different choices can be made for the maximum size of the TLS area for a // module. For the small address model, the default TLS size is 16MiB and the @@ -3679,7 +3899,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); - else if (Subtarget->isTargetELF()) + if (Subtarget->isTargetELF()) return LowerELFGlobalTLSAddress(Op, DAG); llvm_unreachable("Unexpected platform trying to use TLS"); @@ -4242,90 +4462,37 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(JT, DAG); } - - SDValue Hi = - DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(JT, DAG); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. 
if (Subtarget->isTargetMachO()) { - SDValue GotAddr = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_GOT); - return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + return getGOT(CP, DAG); } - - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G3), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G2 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G1 | MO_NC), - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(CP, DAG); } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on - // ELF, the only valid one on Darwin. - SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(CP, DAG); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc DL(Op); + BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = AArch64II::MO_NC; - return DAG.getNode( - AArch64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + return getAddrLarge(BA, DAG); } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + return getAddr(BA, DAG); } } @@ -4342,6 +4509,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, MachinePointerInfo(SV)); } +SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); + + SDLoc DL(Op); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 + ? 
FuncInfo->getVarArgsGPRIndex() + : FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call @@ -4413,8 +4595,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + return LowerWin64_VASTART(Op, DAG); + else if (Subtarget->isTargetDarwin()) + return LowerDarwin_VASTART(Op, DAG); + else + return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, @@ -4422,7 +4610,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; + unsigned VaListSize = + Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); @@ -4516,7 +4705,12 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch<unsigned>(RegName) .Case("sp", AArch64::SP) + .Case("x18", AArch64::X18) + .Case("w18", AArch64::W18) .Default(0); + if ((Reg == AArch64::X18 || Reg == AArch64::W18) && + !Subtarget->isX18Reserved()) + Reg = 0; if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" @@ -4717,9 +4911,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, - &Flags); - Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); + Flags); + Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } if (!Reciprocal) { @@ -4728,7 +4922,7 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); // Correct the result if the operand is 0.0. Estimate = DAG.getNode(VT.isVector() ? 
ISD::VSELECT : ISD::SELECT, DL, VT, Eq, Operand, Estimate); @@ -4757,8 +4951,8 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, // AArch64 reciprocal iteration instruction: (2 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, - Estimate, &Flags); - Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags); + Estimate, Flags); + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } ExtraSteps = 0; @@ -6591,21 +6785,20 @@ FailedModImm: if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); - unsigned ElemSize = VT.getScalarSizeInBits(); unsigned i = 0; - // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to + + // Use SCALAR_TO_VECTOR for lane zero to // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the - // value is already in an S or D register. - // Do not do this for UNDEF/LOAD nodes because we have better patterns - // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. - if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && - (ElemSize == 32 || ElemSize == 64)) { - unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; - MachineSDNode *N = - DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, - DAG.getTargetConstant(SubIdx, dl, MVT::i32)); - Vec = SDValue(N, 0); + // value is already in an S or D register, and we're forced to emit an + // INSERT_SUBREG that we can't fold anywhere. + // + // We also allow types like i8 and i16 which are illegal scalar but legal + // vector element types. After type-legalization the inserted value is + // extended (i32) and it is safe to cast them to the vector type by ignoring + // the upper bits of the lowest lane (e.g. v8i8, v4i16). 
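// Illustrative numeric sketch, not part of this patch: getSqrtEstimate and
// getRecipEstimate (a few hunks above) refine a hardware estimate with
// FRSQRTS, described in the comments there as 0.5 * (3 - M * N), and FRECPS,
// described as (2 - M * N); each ExtraSteps iteration is one Newton-Raphson
// step. The starting values below are arbitrary stand-ins for FRSQRTE/FRECPE.
#include <cstdio>

static double frsqrts(double A, double B) { return 0.5 * (3.0 - A * B); }
static double frecps(double A, double B)  { return 2.0 - A * B; }

int main() {
  const double D = 2.0;

  double RSqrt = 1.0;                         // crude stand-in for FRSQRTE
  for (int i = 0; i < 5; ++i)
    RSqrt = RSqrt * frsqrts(D, RSqrt * RSqrt);
  std::printf("1/sqrt(2) ~= %.9f\n", RSqrt);  // approaches 0.707106781

  double Recip = 0.4;                         // crude stand-in for FRECPE
  for (int i = 0; i < 4; ++i)
    Recip = Recip * frecps(D, Recip);
  std::printf("1/2       ~= %.9f\n", Recip);  // approaches 0.5
  return 0;
}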
+ if (!Op0.isUndef()) { + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); ++i; } for (; i < NumElts; ++i) { @@ -6995,6 +7188,47 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, return Cmp; } +static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, + SelectionDAG &DAG) { + SDValue VecOp = ScalarOp.getOperand(0); + auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, + DAG.getConstant(0, DL, MVT::i64)); +} + +SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + switch (Op.getOpcode()) { + case ISD::VECREDUCE_ADD: + return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); + case ISD::VECREDUCE_SMAX: + return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); + case ISD::VECREDUCE_SMIN: + return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); + case ISD::VECREDUCE_UMAX: + return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); + case ISD::VECREDUCE_UMIN: + return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); + case ISD::VECREDUCE_FMAX: { + assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), + DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), + Op.getOperand(0)); + } + case ISD::VECREDUCE_FMIN: { + assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), + DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), + Op.getOperand(0)); + } + default: + llvm_unreachable("Unhandled reduction"); + } +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. @@ -7132,7 +7366,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { if (I->getOpcode() != Instruction::FMul) return true; - if (I->getNumUses() != 1) + if (!I->hasOneUse()) return true; Instruction *User = I->user_back(); @@ -7249,6 +7483,41 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, return NumBits == 32 || NumBits == 64; } +/// A helper function for determining the number of interleaved accesses we +/// will generate when lowering accesses of the given type. +unsigned +AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const { + return (DL.getTypeSizeInBits(VecTy) + 127) / 128; +} + +MachineMemOperand::Flags +AArch64TargetLowering::getMMOFlags(const Instruction &I) const { + if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && + I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) + return MOStridedAccess; + return MachineMemOperand::MONone; +} + +bool AArch64TargetLowering::isLegalInterleavedAccessType( + VectorType *VecTy, const DataLayout &DL) const { + + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + + // Ensure the number of vector elements is greater than 1. + if (VecTy->getNumElements() < 2) + return false; + + // Ensure the element type is legal. + if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) + return false; + + // Ensure the total vector size is 64 or a multiple of 128. Types larger than + // 128 will be split into multiple interleaved accesses. 
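// Illustrative sketch, not part of this patch: the two helpers above decide
// whether an interleaved access type is legal (64 bits, or a multiple of 128
// bits, with 8/16/32/64-bit elements) and how many ldN/stN accesses a wide
// type is split into, i.e. ceil(size-in-bits / 128). The functions below are
// ad-hoc stand-ins that take the element count and element width directly.
#include <cstdio>

static unsigned numInterleavedAccesses(unsigned VecBits) {
  return (VecBits + 127) / 128;
}

static bool isLegalInterleavedAccessType(unsigned NumElts, unsigned EltBits) {
  if (NumElts < 2)
    return false;
  if (EltBits != 8 && EltBits != 16 && EltBits != 32 && EltBits != 64)
    return false;
  unsigned VecBits = NumElts * EltBits;
  return VecBits == 64 || VecBits % 128 == 0;
}

int main() {
  // <16 x i32> is 512 bits: legal, and lowered as four interleaved accesses.
  std::printf("<16 x i32>: legal=%d accesses=%u\n",
              (int)isLegalInterleavedAccessType(16, 32),
              numInterleavedAccesses(16 * 32));
  // <2 x i32> is 64 bits: legal, a single access.
  std::printf("<2 x i32>:  legal=%d accesses=%u\n",
              (int)isLegalInterleavedAccessType(2, 32),
              numInterleavedAccesses(2 * 32));
  return 0;
}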
+ return VecSize == 64 || VecSize % 128 == 0; +} + /// \brief Lower an interleaved load into a ldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -7272,12 +7541,15 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeSizeInBits(VecTy); - // Skip if we do not have NEON and skip illegal vector types. - if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; + unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); + // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. Type *EltTy = VecTy->getVectorElementType(); @@ -7285,6 +7557,25 @@ bool AArch64TargetLowering::lowerInterleavedLoad( VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + IRBuilder<> Builder(LI); + + // The base address of the load. + Value *BaseAddr = LI->getPointerOperand(); + + if (NumLoads > 1) { + // If we're going to generate more than one load, reset the sub-vector type + // to something legal. + VecTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / NumLoads); + + // We will compute the pointer operand of each load from the original base + // address using GEPs. Cast the base address to a pointer to the scalar + // element type. + BaseAddr = Builder.CreateBitCast( + BaseAddr, VecTy->getVectorElementType()->getPointerTo( + LI->getPointerAddressSpace())); + } + Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[2] = {VecTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, @@ -7293,39 +7584,50 @@ bool AArch64TargetLowering::lowerInterleavedLoad( Function *LdNFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); - IRBuilder<> Builder(LI); - Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); + // Holds sub-vectors extracted from the load intrinsic return values. The + // sub-vectors are associated with the shufflevector instructions they will + // replace. + DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; - CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); + for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { - // Replace uses of each shufflevector with the corresponding vector loaded - // by ldN. - for (unsigned i = 0; i < Shuffles.size(); i++) { - ShuffleVectorInst *SVI = Shuffles[i]; - unsigned Index = Indices[i]; + // If we're generating more than one load, compute the base address of + // subsequent loads as an offset from the previous. + if (LoadCount > 0) + BaseAddr = Builder.CreateConstGEP1_32( + BaseAddr, VecTy->getVectorNumElements() * Factor); - Value *SubVec = Builder.CreateExtractValue(LdN, Index); + CallInst *LdN = Builder.CreateCall( + LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); - // Convert the integer vector to pointer vector if the element is pointer. - if (EltTy->isPointerTy()) - SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); + // Extract and store the sub-vectors returned by the load intrinsic. 
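// Illustrative scalar sketch, not part of this patch: the wide-vector path in
// lowerInterleavedLoad above splits one factor-2 load of <8 x i32> results
// into two ld2-style accesses (the base address advances by NumElts * Factor
// elements per access) and later concatenates the per-access sub-vectors for
// each shufflevector user. ld2x4 is an ad-hoc scalar model of one "ld2" of
// four 2 x i32 structures.
#include <array>
#include <cstdio>
#include <vector>

static std::array<std::vector<int>, 2> ld2x4(const int *Base) {
  std::array<std::vector<int>, 2> Res;
  for (int Lane = 0; Lane < 4; ++Lane)
    for (int Field = 0; Field < 2; ++Field)
      Res[Field].push_back(Base[Lane * 2 + Field]);
  return Res;
}

int main() {
  int Mem[16];
  for (int i = 0; i < 16; ++i)
    Mem[i] = i;                                    // interleaved {even, odd} pairs

  std::array<std::vector<int>, 2> Wide;            // one entry per field/shuffle
  const int NumLoads = 2, Factor = 2, NumElts = 4;
  for (int LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
    auto LdN = ld2x4(Mem + LoadCount * NumElts * Factor);
    for (int Index = 0; Index < Factor; ++Index)   // extract each sub-vector
      Wide[Index].insert(Wide[Index].end(), LdN[Index].begin(), LdN[Index].end());
  }
  for (int Index = 0; Index < Factor; ++Index) {
    std::printf("field %d:", Index);
    for (int V : Wide[Index])
      std::printf(" %d", V);
    std::printf("\n");                             // 0 2 4 ... 14 / 1 3 5 ... 15
  }
  return 0;
}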
+ for (unsigned i = 0; i < Shuffles.size(); i++) { + ShuffleVectorInst *SVI = Shuffles[i]; + unsigned Index = Indices[i]; - SVI->replaceAllUsesWith(SubVec); - } + Value *SubVec = Builder.CreateExtractValue(LdN, Index); - return true; -} + // Convert the integer vector to pointer vector if the element is pointer. + if (EltTy->isPointerTy()) + SubVec = Builder.CreateIntToPtr( + SubVec, VectorType::get(SVI->getType()->getVectorElementType(), + VecTy->getVectorNumElements())); + SubVecs[SVI].push_back(SubVec); + } + } -/// \brief Get a mask consisting of sequential integers starting from \p Start. -/// -/// I.e. <Start, Start + 1, ..., Start + NumElts - 1> -static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, - unsigned NumElts) { - SmallVector<Constant *, 16> Mask; - for (unsigned i = 0; i < NumElts; i++) - Mask.push_back(Builder.getInt32(Start + i)); + // Replace uses of the shufflevector instructions with the sub-vectors + // returned by the load intrinsic. If a shufflevector instruction is + // associated with more than one sub-vector, those sub-vectors will be + // concatenated into a single wide vector. + for (ShuffleVectorInst *SVI : Shuffles) { + auto &SubVec = SubVecs[SVI]; + auto *WideVec = + SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; + SVI->replaceAllUsesWith(WideVec); + } - return ConstantVector::get(Mask); + return true; } /// \brief Lower an interleaved store into a stN intrinsic. @@ -7369,12 +7671,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - // Skip if we do not have NEON and skip illegal vector types. - if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; + unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); + Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); @@ -7394,6 +7699,25 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, SubVecTy = VectorType::get(IntTy, LaneLen); } + // The base address of the store. + Value *BaseAddr = SI->getPointerOperand(); + + if (NumStores > 1) { + // If we're going to generate more than one store, reset the lane length + // and sub-vector type to something legal. + LaneLen /= NumStores; + SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); + + // We will compute the pointer operand of each store from the original base + // address using GEPs. Cast the base address to a pointer to the scalar + // element type. 
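// Illustrative sketch, not part of this patch: on the store side each stN
// operand is sliced out of the wide shufflevector operands with a sequential
// mask <Start, Start+1, ..., Start+LaneLen-1> (createSequentialMask in the
// code below). The helper here is a simplified stand-in that ignores the
// trailing-undef count, and the interleave mask is a made-up factor-2 example.
#include <cstdio>
#include <vector>

static std::vector<int> sequentialMask(int Start, int NumInts) {
  std::vector<int> Mask;
  for (int i = 0; i < NumInts; ++i)
    Mask.push_back(Start + i);
  return Mask;
}

int main() {
  const int Factor = 2, LaneLen = 4;
  // Re-interleave mask of a factor-2 store of two 4-element vectors.
  const int Mask[Factor * LaneLen] = {0, 4, 1, 5, 2, 6, 3, 7};
  for (int i = 0; i < Factor; ++i) {
    std::vector<int> Sub = sequentialMask(Mask[i], LaneLen);
    std::printf("st%d operand %d mask:", Factor, i);
    for (int M : Sub)
      std::printf(" %d", M);
    std::printf("\n");                             // 0 1 2 3  and  4 5 6 7
  }
  return 0;
}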
+ BaseAddr = Builder.CreateBitCast( + BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( + SI->getPointerAddressSpace())); + } + + auto Mask = SVI->getShuffleMask(); + Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); Type *Tys[2] = {SubVecTy, PtrTy}; static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, @@ -7402,34 +7726,43 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, Function *StNFunc = Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); - SmallVector<Value *, 5> Ops; + for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { - // Split the shufflevector operands into sub vectors for the new stN call. - auto Mask = SVI->getShuffleMask(); - for (unsigned i = 0; i < Factor; i++) { - if (Mask[i] >= 0) { - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen))); - } else { - unsigned StartMask = 0; - for (unsigned j = 1; j < LaneLen; j++) { - if (Mask[j*Factor + i] >= 0) { - StartMask = Mask[j*Factor + i] - j; - break; + SmallVector<Value *, 5> Ops; + + // Split the shufflevector operands into sub vectors for the new stN call. + for (unsigned i = 0; i < Factor; i++) { + unsigned IdxI = StoreCount * LaneLen * Factor + i; + if (Mask[IdxI] >= 0) { + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); + } else { + unsigned StartMask = 0; + for (unsigned j = 1; j < LaneLen; j++) { + unsigned IdxJ = StoreCount * LaneLen * Factor + j; + if (Mask[IdxJ * Factor + IdxI] >= 0) { + StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; + break; + } } + // Note: Filling undef gaps with random elements is ok, since + // those elements were being written anyway (with undefs). + // In the case of all undefs we're defaulting to using elems from 0 + // Note: StartMask cannot be negative, it's checked in + // isReInterleaveMask + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } - // Note: If all elements in a chunk are undefs, StartMask=0! - // Note: Filling undef gaps with random elements is ok, since - // those elements were being written anyway (with undefs). - // In the case of all undefs we're defaulting to using elems from 0 - // Note: StartMask cannot be negative, it's checked in isReInterleaveMask - Ops.push_back(Builder.CreateShuffleVector( - Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen))); } - } - Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); - Builder.CreateCall(StNFunc, Ops); + // If we generating more than one store, we compute the base address of + // subsequent stores as an offset from the previous. + if (StoreCount > 0) + BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); + + Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); + Builder.CreateCall(StNFunc, Ops); + } return true; } @@ -7690,7 +8023,7 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { - AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV @@ -8079,9 +8412,9 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, /// EXTR instruction extracts a contiguous chunk of bits from two existing /// registers viewed as a high/low pair. 
This function looks for the pattern: -/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an -/// EXTR. Can't quite be done in TableGen because the two immediates aren't -/// independent. +/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it +/// with an EXTR. Can't quite be done in TableGen because the two immediates +/// aren't independent. static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; @@ -8935,16 +9268,26 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, // instructions (stp). SDLoc DL(&St); SDValue BasePtr = St.getBasePtr(); + uint64_t BaseOffset = 0; + const MachinePointerInfo &PtrInfo = St.getPointerInfo(); SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, OrigAlignment, St.getMemOperand()->getFlags()); + // As this in ISel, we will not merge this add which may degrade results. + if (BasePtr->getOpcode() == ISD::ADD && + isa<ConstantSDNode>(BasePtr->getOperand(1))) { + BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue(); + BasePtr = BasePtr->getOperand(0); + } + unsigned Offset = EltOffset; while (--NumVecElts) { unsigned Alignment = MinAlign(OrigAlignment, Offset); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, DL, MVT::i64)); + SDValue OffsetPtr = + DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, PtrInfo.getWithOffset(Offset), Alignment, St.getMemOperand()->getFlags()); @@ -9072,7 +9415,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); StoreSDNode *S = cast<StoreSDNode>(N); - if (S->isVolatile()) + if (S->isVolatile() || S->isIndexed()) return SDValue(); SDValue StVal = S->getValue(); @@ -9236,17 +9579,17 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } -/// Simplify \Addr given that the top byte of it is ignored by HW during +/// Simplify ``Addr`` given that the top byte of it is ignored by HW during /// address translation. static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { APInt DemandedMask = APInt::getLowBitsSet(64, 56); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), - DCI.isBeforeLegalizeOps()); + KnownBits Known; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { + if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); return true; } @@ -9267,266 +9610,6 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } - /// This function handles the log2-shuffle pattern produced by the -/// LoopVectorizer for the across vector reduction. It consists of -/// log2(NumVectorElements) steps and, in each step, 2^(s) elements -/// are reduced, where s is an induction variable from 0 to -/// log2(NumVectorElements). 
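// Illustrative scalar sketch, not part of this patch: the combines being
// deleted here pattern-matched the log2-shuffle reduction shape described in
// the comment above. For a <4 x i32> add reduction the LoopVectorizer emits
// two shuffle+add steps (<2,3,u,u>, then <1,u,u,u>) that leave the total in
// lane 0; after this change the same result comes from lowering
// ISD::VECREDUCE_ADD directly to AArch64ISD::UADDV instead.
#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> V = {1, 2, 3, 4};
  for (int Half = 2; Half >= 1; Half /= 2) {     // shuffle widths: 2, then 1
    std::array<int, 4> Shuf = V;                 // lanes past Half stay "undef"
    for (int i = 0; i < Half; ++i)
      Shuf[i] = V[Half + i];                     // <2,3,u,u> then <1,u,u,u>
    for (int i = 0; i < 4; ++i)
      V[i] += Shuf[i];
  }
  std::printf("reduced sum in lane 0: %d\n", V[0]); // 10
  return 0;
}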
-static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, - unsigned Op, - SelectionDAG &DAG) { - EVT VTy = OpV->getOperand(0).getValueType(); - if (!VTy.isVector()) - return SDValue(); - - int NumVecElts = VTy.getVectorNumElements(); - if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { - if (NumVecElts != 4) - return SDValue(); - } else { - if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) - return SDValue(); - } - - int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); - SDValue PreOp = OpV; - // Iterate over each step of the across vector reduction. - for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { - SDValue CurOp = PreOp.getOperand(0); - SDValue Shuffle = PreOp.getOperand(1); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { - // Try to swap the 1st and 2nd operand as add and min/max instructions - // are commutative. - CurOp = PreOp.getOperand(1); - Shuffle = PreOp.getOperand(0); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); - } - - // Check if the input vector is fed by the operator we want to handle, - // except the last step; the very first input vector is not necessarily - // the same operator we are handling. - if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) - return SDValue(); - - // Check if it forms one step of the across vector reduction. - // E.g., - // %cur = add %1, %0 - // %shuffle = vector_shuffle %cur, <2, 3, u, u> - // %pre = add %cur, %shuffle - if (Shuffle.getOperand(0) != CurOp) - return SDValue(); - - int NumMaskElts = 1 << CurStep; - ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); - // Check mask values in each step. - // We expect the shuffle mask in each step follows a specific pattern - // denoted here by the <M, U> form, where M is a sequence of integers - // starting from NumMaskElts, increasing by 1, and the number integers - // in M should be NumMaskElts. U is a sequence of UNDEFs and the number - // of undef in U should be NumVecElts - NumMaskElts. - // E.g., for <8 x i16>, mask values in each step should be : - // step 0 : <1,u,u,u,u,u,u,u> - // step 1 : <2,3,u,u,u,u,u,u> - // step 2 : <4,5,6,7,u,u,u,u> - for (int i = 0; i < NumVecElts; ++i) - if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || - (i >= NumMaskElts && !(Mask[i] < 0))) - return SDValue(); - - PreOp = CurOp; - } - unsigned Opcode; - bool IsIntrinsic = false; - - switch (Op) { - default: - llvm_unreachable("Unexpected operator for across vector reduction"); - case ISD::ADD: - Opcode = AArch64ISD::UADDV; - break; - case ISD::SMAX: - Opcode = AArch64ISD::SMAXV; - break; - case ISD::UMAX: - Opcode = AArch64ISD::UMAXV; - break; - case ISD::SMIN: - Opcode = AArch64ISD::SMINV; - break; - case ISD::UMIN: - Opcode = AArch64ISD::UMINV; - break; - case ISD::FMAXNUM: - Opcode = Intrinsic::aarch64_neon_fmaxnmv; - IsIntrinsic = true; - break; - case ISD::FMINNUM: - Opcode = Intrinsic::aarch64_neon_fminnmv; - IsIntrinsic = true; - break; - } - SDLoc DL(N); - - return IsIntrinsic - ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), - DAG.getConstant(Opcode, DL, MVT::i32), PreOp) - : DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), - DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), - DAG.getConstant(0, DL, MVT::i64)); -} - -/// Target-specific DAG combine for the across vector min/max reductions. -/// This function specifically handles the final clean-up step of the vector -/// min/max reductions produced by the LoopVectorizer. 
It is the log2-shuffle -/// pattern, which narrows down and finds the final min/max value from all -/// elements of the vector. -/// For example, for a <16 x i8> vector : -/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> -/// %smax0 = smax %arr, svn0 -/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> -/// %smax1 = smax %smax0, %svn1 -/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -/// %smax2 = smax %smax1, svn2 -/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -/// %sc = setcc %smax2, %svn3, gt -/// %n0 = extract_vector_elt %sc, #0 -/// %n1 = extract_vector_elt %smax2, #0 -/// %n2 = extract_vector_elt $smax2, #1 -/// %result = select %n0, %n1, n2 -/// becomes : -/// %1 = smaxv %0 -/// %result = extract_vector_elt %1, 0 -static SDValue -performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - - // Check if the SELECT merges up the final result of the min/max - // from a vector. - if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - // Expect N0 is fed by SETCC. - SDValue SetCC = N0.getOperand(0); - EVT SetCCVT = SetCC.getValueType(); - if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || - SetCCVT.getVectorElementType() != MVT::i1) - return SDValue(); - - SDValue VectorOp = SetCC.getOperand(0); - unsigned Op = VectorOp->getOpcode(); - // Check if the input vector is fed by the operator we want to handle. - if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && - Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) - return SDValue(); - - EVT VTy = VectorOp.getValueType(); - if (!VTy.isVector()) - return SDValue(); - - if (VTy.getSizeInBits() < 64) - return SDValue(); - - EVT EltTy = VTy.getVectorElementType(); - if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { - if (EltTy != MVT::f32) - return SDValue(); - } else { - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - } - - // Check if extracting from the same vector. - // For example, - // %sc = setcc %vector, %svn1, gt - // %n0 = extract_vector_elt %sc, #0 - // %n1 = extract_vector_elt %vector, #0 - // %n2 = extract_vector_elt $vector, #1 - if (!(VectorOp == IfTrue->getOperand(0) && - VectorOp == IfFalse->getOperand(0))) - return SDValue(); - - // Check if the condition code is matched with the operator type. - ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); - if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || - (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || - (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || - (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || - (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && - CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && - CC != ISD::SETGE) || - (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && - CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && - CC != ISD::SETLE)) - return SDValue(); - - // Expect to check only lane 0 from the vector SETCC. - if (!isNullConstant(N0.getOperand(1))) - return SDValue(); - - // Expect to extract the true value from lane 0. 
- if (!isNullConstant(IfTrue.getOperand(1))) - return SDValue(); - - // Expect to extract the false value from lane 1. - if (!isOneConstant(IfFalse.getOperand(1))) - return SDValue(); - - return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); -} - -/// Target-specific DAG combine for the across vector add reduction. -/// This function specifically handles the final clean-up step of the vector -/// add reduction produced by the LoopVectorizer. It is the log2-shuffle -/// pattern, which adds all elements of a vector together. -/// For example, for a <4 x i32> vector : -/// %1 = vector_shuffle %0, <2,3,u,u> -/// %2 = add %0, %1 -/// %3 = vector_shuffle %2, <1,u,u,u> -/// %4 = add %2, %3 -/// %result = extract_vector_elt %4, 0 -/// becomes : -/// %0 = uaddv %0 -/// %result = extract_vector_elt %0, 0 -static SDValue -performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) - return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // Check if the input vector is fed by the ADD. - if (N0->getOpcode() != ISD::ADD) - return SDValue(); - - // The vector extract idx must constant zero because we only expect the final - // result of the reduction is placed in lane 0. - if (!isNullConstant(N1)) - return SDValue(); - - EVT VTy = N0.getValueType(); - if (!VTy.isVector()) - return SDValue(); - - EVT EltTy = VTy.getVectorElementType(); - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - - if (VTy.getSizeInBits() < 64) - return SDValue(); - - return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); -} /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. @@ -10205,12 +10288,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: { - SDValue RV = performSelectCombine(N, DCI); - if (!RV.getNode()) - RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); - return RV; - } + case ISD::SELECT: + return performSelectCombine(N, DCI); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::LOAD: @@ -10232,8 +10311,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); - case ISD::EXTRACT_VECTOR_ELT: - return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -10307,7 +10384,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, // call. This will cause the optimizers to attempt to move, or duplicate, // return instructions to help enable tail call optimizations for this // instruction. 
-bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { +bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return CI->isTailCall(); } @@ -10453,6 +10530,14 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); + return; + case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; @@ -10483,9 +10568,9 @@ void AArch64TargetLowering::ReplaceNodeResults( } bool AArch64TargetLowering::useLoadStackGuardNode() const { - if (!Subtarget->isTargetAndroid()) - return true; - return TargetLowering::useLoadStackGuardNode(); + if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) + return TargetLowering::useLoadStackGuardNode(); + return true; } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { @@ -10527,11 +10612,17 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; + if (Size > 128) return AtomicExpansionKind::None; + // Nand not supported in LSE. + if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; + // Leave 128 bits to LLSC. + return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { + // If subtarget has LSE, leave cmpxchg intact for codegen. + if (Subtarget->hasLSE()) return false; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a @@ -10623,36 +10714,56 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, return false; } -Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { - if (!Subtarget->isTargetAndroid()) - return TargetLowering::getIRStackGuard(IRB); - - // Android provides a fixed TLS slot for the stack cookie. See the definition - // of TLS_SLOT_STACK_GUARD in - // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - const unsigned TlsOffset = 0x28; +static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( - IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); } -Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { - if (!Subtarget->isTargetAndroid()) - return TargetLowering::getSafeStackPointerLocation(IRB); +Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + // Android provides a fixed TLS slot for the stack cookie. 
See the definition + // of TLS_SLOT_STACK_GUARD in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + if (Subtarget->isTargetAndroid()) + return UseTlsOffset(IRB, 0x28); + // Fuchsia is similar. + // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. + if (Subtarget->isTargetFuchsia()) + return UseTlsOffset(IRB, -0x10); + + return TargetLowering::getIRStackGuard(IRB); +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - const unsigned TlsOffset = 0x48; - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); - return IRB.CreatePointerCast( - IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), - Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); + if (Subtarget->isTargetAndroid()) + return UseTlsOffset(IRB, 0x48); + + // Fuchsia is similar. + // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. + if (Subtarget->isTargetFuchsia()) + return UseTlsOffset(IRB, -0x8); + + return TargetLowering::getSafeStackPointerLocation(IRB); +} + +bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( + const Instruction &AndI) const { + // Only sink 'and' mask to cmp use block if it is masking a single bit, since + // this is likely to be fold the and/cmp/br into a single tbz instruction. It + // may be beneficial to sink in other cases, but we would have to check that + // the cmp would not get folded into the br to form a cbz for these to be + // beneficial. + ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); + if (!Mask) + return false; + return Mask->getUniqueInteger().isPowerOf2(); } void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { @@ -10702,7 +10813,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( } } -bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { +bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on AArch64 is expensive. However, when aggressively // optimizing for code size, we prefer to use a div instruction, as it is // usually smaller than the alternative sequence. @@ -10711,6 +10822,14 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. 
bool OptSize = - Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } + +unsigned +AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { + if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) + return getPointerTy(DL).getSizeInBits(); + + return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 054ccc3..3b0e0f1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -250,10 +250,14 @@ public: /// Determine which of the bits specified in Mask are known to be either zero /// or one and return them in the KnownZero/KnownOne bitsets. - void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, - APInt &KnownOne, const SelectionDAG &DAG, + void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth = 0) const override; + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + TargetLoweringOpt &TLO) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; /// Returns true if the target allows unaligned memory accesses of the @@ -402,7 +406,20 @@ public: return AArch64::X1; } - bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; + + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const override { + // Do not merge to float value size (128 bytes) if no implicit + // float attribute is set. + + bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat); + + if (NoFloat) + return (MemVT.getSizeInBits() <= 64); + return true; + } bool isCheapToSpeculateCttz() const override { return true; @@ -412,6 +429,8 @@ public: return true; } + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; + bool hasAndNotCompare(SDValue) const override { // 'bics' return true; @@ -435,6 +454,22 @@ public: return true; } + /// Returns the size of the platform's va_list object. + unsigned getVaListSizeInBits(const DataLayout &DL) const override; + + /// Returns true if \p VecTy is a legal interleaved access type. This + /// function checks the vector element type and the overall width of the + /// vector. + bool isLegalInterleavedAccessType(VectorType *VecTy, + const DataLayout &DL) const; + + /// Returns the number of interleaved accesses that will be generated when + /// lowering accesses of the given type. 
+ unsigned getNumInterleavedAccesses(VectorType *VecTy, + const DataLayout &DL) const; + + MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -491,6 +526,18 @@ private: const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; + template <class NodeTy> SDValue getGOT(NodeTy *N, SelectionDAG &DAG) const; + template <class NodeTy> + SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG) const; + template <class NodeTy> SDValue getAddr(NodeTy *N, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; @@ -509,6 +556,7 @@ private: SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; @@ -536,6 +584,7 @@ private: SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; @@ -576,7 +625,7 @@ private: } bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index 867074c..eec41dd 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -14,6 +14,9 @@ //===---------------------------------- // Atomic fences //===---------------------------------- +let AddedComplexity = 15, Size = 0 in +def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering), + [(atomic_fence imm:$ordering, 0)]>, Sched<[]>; def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; @@ -402,3 +405,59 @@ def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch), (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, GPR64:$newLo, GPR64:$newHi), []>, Sched<[WriteAtomic]>; + +// v8.1 Atomic instructions: +def : Pat<(atomic_load_add_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), 
(LDADDALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_or_8 GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_xor_8 GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_max_8 GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_umax_8 GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_min_8 GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_umin_8 GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_cmp_swap_8 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALb GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALh GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALs GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>; +def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rold, GPR64:$Rnew), (CASALd GPR64:$Rold, GPR64:$Rnew, GPR64sp:$Rn)>; + +def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; +def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; + +def : 
Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cefdf51..c44daf3 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -39,6 +39,9 @@ class AArch64Inst<Format f, string cstr> : Instruction { let Constraints = cstr; } +class InstSubst<string Asm, dag Result, bit EmitPriority = 0> + : InstAlias<Asm, Result, EmitPriority>, Requires<[UseNegativeImmediates]>; + // Pseudo instructions (don't have encoding information) class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = ""> : AArch64Inst<PseudoFrm, cstr> { @@ -257,6 +260,7 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; class AsmImmRange<int Low, int High> : AsmOperandClass { let Name = "Imm" # Low # "_" # High; let DiagnosticType = "InvalidImm" # Low # "_" # High; + let PredicateMethod = "isImmInRange<" # Low # "," # High # ">"; } def Imm1_8Operand : AsmImmRange<1, 8>; @@ -264,6 +268,20 @@ def Imm1_16Operand : AsmImmRange<1, 16>; def Imm1_32Operand : AsmImmRange<1, 32>; def Imm1_64Operand : AsmImmRange<1, 64>; +class BranchTarget<int N> : AsmOperandClass { + let Name = "BranchTarget" # N; + let DiagnosticType = "InvalidLabel"; + let PredicateMethod = "isBranchTarget<" # N # ">"; +} + +class PCRelLabel<int N> : BranchTarget<N> { + let Name = "PCRelLabel" # N; +} + +def BranchTarget14Operand : BranchTarget<14>; +def BranchTarget26Operand : BranchTarget<26>; +def PCRelLabel19Operand : PCRelLabel<19>; + def MovZSymbolG3AsmOperand : AsmOperandClass { let Name = "MovZSymbolG3"; let RenderMethod = "addImmOperands"; @@ -500,7 +518,8 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{ } // imm0_255 predicate - True if the immediate is in the range [0,255]. -def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } +def Imm0_255Operand : AsmImmRange<0,255>; + def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 256; }]> { @@ -673,6 +692,14 @@ def addsub_shifted_imm64 : addsub_shifted_imm<i64>; def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>; def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>; +def gi_addsub_shifted_imm32 : + GIComplexOperandMatcher<s32, "selectArithImmed">, + GIComplexPatternEquiv<addsub_shifted_imm32>; + +def gi_addsub_shifted_imm64 : + GIComplexOperandMatcher<s64, "selectArithImmed">, + GIComplexPatternEquiv<addsub_shifted_imm64>; + class neg_addsub_shifted_imm<ValueType Ty> : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> { let PrintMethod = "printAddSubImm"; @@ -1094,10 +1121,6 @@ def inv_ccode : Operand<i32> { // Conditional branch target. 19-bit immediate. The low two bits of the target // offset are implied zero and so are not part of the immediate. 
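The comment above on the 19-bit conditional-branch target, together with the BranchTarget14/26 operands introduced earlier in this hunk, implies fixed reach limits: the encoded value is a signed word offset, scaled by 4 because instructions are 4-byte aligned. A rough sketch of the resulting forward reach; the helper name is illustrative only:

    #include <cstdint>
    // Maximum forward byte offset of a PC-relative branch whose signed word
    // immediate is `Bits` wide; the low two bits of the byte offset are
    // implied zero, hence the * 4.
    constexpr int64_t maxForwardBranchOffset(unsigned Bits) {
      return ((int64_t{1} << (Bits - 1)) - 1) * 4;
    }
    static_assert(maxForwardBranchOffset(14) == 32764,     "tbz/tbnz: ~32 KiB");
    static_assert(maxForwardBranchOffset(19) == 1048572,   "b.cond/cbz: ~1 MiB");
    static_assert(maxForwardBranchOffset(26) == 134217724, "b/bl: ~128 MiB");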
-def PCRelLabel19Operand : AsmOperandClass { - let Name = "PCRelLabel19"; - let DiagnosticType = "InvalidLabel"; -} def am_brcond : Operand<OtherVT> { let EncoderMethod = "getCondBranchTargetOpValue"; let DecoderMethod = "DecodePCRelLabel19"; @@ -1154,9 +1177,6 @@ multiclass CmpBranch<bit op, string asm, SDNode node> { //--- // Test-and-branch target. 14-bit sign-extended immediate. The low two bits of // the target offset are implied zero and so are not part of the immediate. -def BranchTarget14Operand : AsmOperandClass { - let Name = "BranchTarget14"; -} def am_tbrcond : Operand<OtherVT> { let EncoderMethod = "getTestBranchTargetOpValue"; let PrintMethod = "printAlignedLabel"; @@ -1166,11 +1186,12 @@ def am_tbrcond : Operand<OtherVT> { // AsmOperand classes to emit (or not) special diagnostics def TBZImm0_31Operand : AsmOperandClass { let Name = "TBZImm0_31"; - let PredicateMethod = "isImm0_31"; + let PredicateMethod = "isImmInRange<0,31>"; let RenderMethod = "addImm0_31Operands"; } def TBZImm32_63Operand : AsmOperandClass { let Name = "Imm32_63"; + let PredicateMethod = "isImmInRange<32,63>"; let DiagnosticType = "InvalidImm0_63"; } @@ -1232,10 +1253,6 @@ multiclass TestBranch<bit op, string asm, SDNode node> { //--- // Unconditional branch (immediate) instructions. //--- -def BranchTarget26Operand : AsmOperandClass { - let Name = "BranchTarget26"; - let DiagnosticType = "InvalidLabel"; -} def am_b_target : Operand<OtherVT> { let EncoderMethod = "getBranchTargetOpValue"; let PrintMethod = "printAlignedLabel"; @@ -1784,10 +1801,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias, } // add Rd, Rb, -imm -> sub Rd, Rn, imm - def : InstAlias<alias#"\t$Rd, $Rn, $imm", + def : InstSubst<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#"\t$Rd, $Rn, $imm", + def : InstSubst<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; @@ -1859,10 +1876,10 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, } // Defs = [NZCV] // Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm - def : InstAlias<alias#"\t$Rd, $Rn, $imm", + def : InstSubst<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#"\t$Rd, $Rn, $imm", + def : InstSubst<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; @@ -1883,9 +1900,9 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; // Support negative immediates, e.g. 
cmp Rn, -imm -> cmn Rn, imm - def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>; // Compare shorthands @@ -2100,10 +2117,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, let Inst{31} = 1; } - def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + def : InstSubst<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + def : InstSubst<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2122,10 +2139,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode, } } // end Defs = [NZCV] - def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + def : InstSubst<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + def : InstSubst<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2454,7 +2471,7 @@ class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat> // Load literal address: 19-bit immediate. The low two bits of the target // offset are implied zero and so are not part of the immediate. -def am_ldrlit : Operand<OtherVT> { +def am_ldrlit : Operand<iPTR> { let EncoderMethod = "getLoadLiteralOpValue"; let DecoderMethod = "DecodePCRelLabel19"; let PrintMethod = "printAlignedLabel"; @@ -9060,7 +9077,7 @@ multiclass SIMDLdSt4SingleAliases<string asm> { // AdvSIMD v8.1 Rounding Double Multiply Add/Subtract //---------------------------------------------------------------------------- -let Predicates = [HasNEON, HasV8_1a] in { +let Predicates = [HasNEON, HasRDM] in { class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, RegisterOperand regtype, string asm, @@ -9221,7 +9238,7 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm, let Inst{21} = idx{0}; } } -} // let Predicates = [HasNeon, HasV8_1a] +} // let Predicates = [HasNeon, HasRDM] //---------------------------------------------------------------------------- // Crypto extensions diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 4c78992..c0c6055 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,12 +12,13 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -51,9 +52,6 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" -static const MachineMemOperand::Flags MOSuppressPair = - 
MachineMemOperand::MOTargetFlag1; - static cl::opt<unsigned> TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); @@ -369,7 +367,7 @@ void AArch64InstrInfo::instantiateCondBranch( // Folded compare-and-branch // Note that we use addOperand instead of addReg to keep the flags. const MachineInstrBuilder MIB = - BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]); + BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); if (Cond.size() > 3) MIB.addImm(Cond[3].getImm()); MIB.addMBB(TBB); @@ -762,6 +760,128 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { llvm_unreachable("Unknown opcode to check as cheap as a move!"); } +bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + return false; + + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + if (ShiftVal == 0) + return true; + return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; + } + + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: { + unsigned Imm = MI.getOperand(3).getImm(); + switch (AArch64_AM::getArithExtendType(Imm)) { + default: + return false; + case AArch64_AM::UXTB: + case AArch64_AM::UXTH: + case AArch64_AM::UXTW: + case AArch64_AM::UXTX: + return AArch64_AM::getArithShiftValue(Imm) <= 4; + } + } + + case AArch64::SUBWrs: + case AArch64::SUBSWrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + return ShiftVal == 0 || + (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); + } + + case AArch64::SUBXrs: + case AArch64::SUBSXrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + return ShiftVal == 0 || + (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); + } + + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: { + unsigned Imm = MI.getOperand(3).getImm(); + switch (AArch64_AM::getArithExtendType(Imm)) { + default: + return false; + case AArch64_AM::UXTB: + case AArch64_AM::UXTH: + case AArch64_AM::UXTW: + case AArch64_AM::UXTX: + return AArch64_AM::getArithShiftValue(Imm) == 0; + } + } + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::PRFMroW: + case AArch64::PRFMroX: + case AArch64::STRBBroW: + case AArch64::STRBBroX: + case AArch64::STRBroW: + case AArch64::STRBroX: + case AArch64::STRDroW: + case AArch64::STRDroX: + case AArch64::STRHHroW: + case AArch64::STRHHroX: + case 
AArch64::STRHroW: + case AArch64::STRHroX: + case AArch64::STRQroW: + case AArch64::STRQroX: + case AArch64::STRSroW: + case AArch64::STRSroX: + case AArch64::STRWroW: + case AArch64::STRWroX: + case AArch64::STRXroW: + case AArch64::STRXroX: { + unsigned IsSigned = MI.getOperand(3).getImm(); + return !IsSigned; + } + } +} + bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const { @@ -913,7 +1033,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { +static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; @@ -1022,7 +1142,7 @@ bool AArch64InstrInfo::optimizeCompareInstr( return true; } unsigned Opc = CmpInstr.getOpcode(); - unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); + unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); @@ -1159,6 +1279,7 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { case AArch64CC::HI: // Z clear and C set case AArch64CC::LS: // Z set or C clear UsedFlags.Z = true; + LLVM_FALLTHROUGH; case AArch64CC::HS: // C set case AArch64CC::LO: // C clear UsedFlags.C = true; @@ -1177,6 +1298,7 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { case AArch64CC::GT: // Z clear, N and V the same case AArch64CC::LE: // Z set, N and V differ UsedFlags.Z = true; + LLVM_FALLTHROUGH; case AArch64CC::GE: // N and V the same case AArch64CC::LT: // N and V differ UsedFlags.N = true; @@ -1299,16 +1421,16 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) - .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); + .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32); + .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16); + .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32); BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); + .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addImm(0) @@ -1345,14 +1467,6 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const { case AArch64::BICSXrs: case AArch64::BICWrs: case AArch64::BICXrs: - case AArch64::CRC32Brr: - case AArch64::CRC32CBrr: - case AArch64::CRC32CHrr: - case AArch64::CRC32CWrr: - case AArch64::CRC32CXrr: - case AArch64::CRC32Hrr: - case AArch64::CRC32Wrr: - case AArch64::CRC32Xrr: case AArch64::EONWrs: case AArch64::EONXrs: case AArch64::EORWrs: @@ -1598,6 +1712,13 @@ void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { (*MI.memoperands_begin())->setFlags(MOSuppressPair); } +/// Check all 
MachineMemOperands for a hint that the load/store is strided. +bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOStridedAccess; + }); +} + bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { switch (Opc) { default: @@ -1691,16 +1812,59 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( } else return false; - // Offset is calculated as the immediate operand multiplied by the scaling factor. - // Unscaled instructions have scaling factor set to 1. + // Get the scaling factor for the instruction and set the width for the + // instruction. unsigned Scale = 0; - switch (LdSt.getOpcode()) { + int64_t Dummy1, Dummy2; + + // If this returns false, then it's an instruction we don't want to handle. + if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) + return false; + + // Compute the offset. Offset is calculated as the immediate operand + // multiplied by the scaling factor. Unscaled instructions have scaling factor + // set to 1. + if (LdSt.getNumExplicitOperands() == 3) { + BaseReg = LdSt.getOperand(1).getReg(); + Offset = LdSt.getOperand(2).getImm() * Scale; + } else { + assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); + BaseReg = LdSt.getOperand(2).getReg(); + Offset = LdSt.getOperand(3).getImm() * Scale; + } + return true; +} + +MachineOperand& +AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { + assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); + MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands()-1); + assert(OfsOp.isImm() && "Offset operand wasn't immediate."); + return OfsOp; +} + +bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, + unsigned &Width, int64_t &MinOffset, + int64_t &MaxOffset) const { + switch (Opcode) { + // Not a memory operation or something we want to handle. 
default: + Scale = Width = 0; + MinOffset = MaxOffset = 0; return false; + case AArch64::STRWpost: + case AArch64::LDRWpost: + Width = 32; + Scale = 4; + MinOffset = -256; + MaxOffset = 255; + break; case AArch64::LDURQi: case AArch64::STURQi: Width = 16; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDURXi: case AArch64::LDURDi: @@ -1708,6 +1872,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STURDi: Width = 8; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDURWi: case AArch64::LDURSi: @@ -1716,6 +1882,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STURSi: Width = 4; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDURHi: case AArch64::LDURHHi: @@ -1725,6 +1893,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STURHHi: Width = 2; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDURBi: case AArch64::LDURBBi: @@ -1734,6 +1904,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STURBBi: Width = 1; Scale = 1; + MinOffset = -256; + MaxOffset = 255; break; case AArch64::LDPQi: case AArch64::LDNPQi: @@ -1741,10 +1913,14 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STNPQi: Scale = 16; Width = 32; + MinOffset = -64; + MaxOffset = 63; break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; + MinOffset = 0; + MaxOffset = 4095; break; case AArch64::LDPXi: case AArch64::LDPDi: @@ -1756,12 +1932,16 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STNPDi: Scale = 8; Width = 16; + MinOffset = -64; + MaxOffset = 63; break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: Scale = Width = 8; + MinOffset = 0; + MaxOffset = 4095; break; case AArch64::LDPWi: case AArch64::LDPSi: @@ -1773,6 +1953,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STNPSi: Scale = 4; Width = 8; + MinOffset = -64; + MaxOffset = 63; break; case AArch64::LDRWui: case AArch64::LDRSui: @@ -1780,29 +1962,27 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STRWui: case AArch64::STRSui: Scale = Width = 4; + MinOffset = 0; + MaxOffset = 4095; break; case AArch64::LDRHui: case AArch64::LDRHHui: case AArch64::STRHui: case AArch64::STRHHui: Scale = Width = 2; + MinOffset = 0; + MaxOffset = 4095; break; case AArch64::LDRBui: case AArch64::LDRBBui: case AArch64::STRBui: case AArch64::STRBBui: Scale = Width = 1; + MinOffset = 0; + MaxOffset = 4095; break; } - if (LdSt.getNumExplicitOperands() == 3) { - BaseReg = LdSt.getOperand(1).getReg(); - Offset = LdSt.getOperand(2).getImm() * Scale; - } else { - assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); - BaseReg = LdSt.getOperand(2).getReg(); - Offset = LdSt.getOperand(3).getImm() * Scale; - } return true; } @@ -1903,88 +2083,6 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, return Offset1 + 1 == Offset2; } -bool AArch64InstrInfo::shouldScheduleAdjacent( - const MachineInstr &First, const MachineInstr &Second) const { - if (Subtarget.hasArithmeticBccFusion()) { - // Fuse CMN, CMP, TST followed by Bcc. 
- unsigned SecondOpcode = Second.getOpcode(); - if (SecondOpcode == AArch64::Bcc) { - switch (First.getOpcode()) { - default: - return false; - case AArch64::ADDSWri: - case AArch64::ADDSWrr: - case AArch64::ADDSXri: - case AArch64::ADDSXrr: - case AArch64::ANDSWri: - case AArch64::ANDSWrr: - case AArch64::ANDSXri: - case AArch64::ANDSXrr: - case AArch64::SUBSWri: - case AArch64::SUBSWrr: - case AArch64::SUBSXri: - case AArch64::SUBSXrr: - case AArch64::BICSWrr: - case AArch64::BICSXrr: - return true; - case AArch64::ADDSWrs: - case AArch64::ADDSXrs: - case AArch64::ANDSWrs: - case AArch64::ANDSXrs: - case AArch64::SUBSWrs: - case AArch64::SUBSXrs: - case AArch64::BICSWrs: - case AArch64::BICSXrs: - // Shift value can be 0 making these behave like the "rr" variant... - return !hasShiftedReg(Second); - } - } - } - if (Subtarget.hasArithmeticCbzFusion()) { - // Fuse ALU operations followed by CBZ/CBNZ. - unsigned SecondOpcode = Second.getOpcode(); - if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || - SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { - switch (First.getOpcode()) { - default: - return false; - case AArch64::ADDWri: - case AArch64::ADDWrr: - case AArch64::ADDXri: - case AArch64::ADDXrr: - case AArch64::ANDWri: - case AArch64::ANDWrr: - case AArch64::ANDXri: - case AArch64::ANDXrr: - case AArch64::EORWri: - case AArch64::EORWrr: - case AArch64::EORXri: - case AArch64::EORXrr: - case AArch64::ORRWri: - case AArch64::ORRWrr: - case AArch64::ORRXri: - case AArch64::ORRXrr: - case AArch64::SUBWri: - case AArch64::SUBWrr: - case AArch64::SUBXri: - case AArch64::SUBXrr: - return true; - case AArch64::ADDWrs: - case AArch64::ADDXrs: - case AArch64::ANDWrs: - case AArch64::ANDXrs: - case AArch64::SUBWrs: - case AArch64::SUBXrs: - case AArch64::BICWrs: - case AArch64::BICXrs: - // Shift value can be 0 making these behave like the "rr" variant... 
- return !hasShiftedReg(Second); - } - } - } - return false; -} - MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, const MDNode *Expr, const DebugLoc &DL) const { @@ -2339,7 +2437,7 @@ void AArch64InstrInfo::storeRegToStackSlot( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; bool Offset = true; - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::STRBui; @@ -2443,7 +2541,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( unsigned Opc = 0; bool Offset = true; - switch (RC->getSize()) { + switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRBui; @@ -2668,7 +2766,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( }; if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { - assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() && + assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == + TRI.getRegSizeInBits(*getRegClass(SrcReg)) && "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, @@ -2754,7 +2853,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( } if (FillRC) { - assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() && + assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == + TRI.getRegSizeInBits(*FillRC) && "Mismatched regclass size on folded subreg COPY"); loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); MachineInstr &LoadMI = *--InsertPt; @@ -3044,7 +3144,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, return false; } -void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { +void AArch64InstrInfo::getNoop(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } @@ -3224,7 +3324,7 @@ static bool getMaddPatterns(MachineInstr &Root, // When NZCV is live bail out. if (Cmp_NZCV == -1) return false; - unsigned NewOpc = convertFlagSettingOpcode(Root); + unsigned NewOpc = convertToNonFlagSettingOpc(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? 
if (NewOpc == Opc) @@ -3444,6 +3544,10 @@ static bool getFMAPatterns(MachineInstr &Root, Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); Found = true; } + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1); + Found = true; + } break; case AArch64::FSUBDrr: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { @@ -3458,6 +3562,10 @@ static bool getFMAPatterns(MachineInstr &Root, Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); Found = true; } + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1); + Found = true; + } break; case AArch64::FSUBv2f32: if (canCombineWithFMUL(MBB, Root.getOperand(2), @@ -3512,6 +3620,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { case MachineCombinerPattern::FMULADDD_OP2: case MachineCombinerPattern::FMULSUBD_OP1: case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FNMULSUBS_OP1: + case MachineCombinerPattern::FNMULSUBD_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP2: case MachineCombinerPattern::FMLAv1i64_indexed_OP1: @@ -3565,12 +3675,17 @@ enum class FMAInstKind { Default, Indexed, Accumulator }; /// F|MUL I=A,B,0 /// F|ADD R,I,C /// ==> F|MADD R,A,B,C +/// \param MF Containing MachineFunction +/// \param MRI Register information +/// \param TII Target information /// \param Root is the F|ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of /// the F|MUL. In the example above IdxMulOpd is 1. /// \param MaddOpc the opcode fo the f|madd instruction +/// \param RC Register class of operands +/// \param kind of fma instruction (addressing mode) to be generated static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, @@ -3629,6 +3744,9 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, /// ADD R,I,Imm /// ==> ORR V, ZR, Imm /// ==> MADD R,A,B,V +/// \param MF Containing MachineFunction +/// \param MRI Register information +/// \param TII Target information /// \param Root is the ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction @@ -3637,6 +3755,7 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, /// \param MaddOpc the opcode fo the madd instruction /// \param VR is a virtual register that holds the value of an ADD operand /// (V in the example above). 
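As a quick arithmetic check of the FNMULSUBS/FNMULSUBD combiner patterns added above (an FNMUL whose result feeds an FSUB is rewritten to a single FNMADD computing -(A*B) - C), here is a plain-double model; it ignores the single-rounding behaviour of the real fused instruction, and the lambdas are illustrative stand-ins rather than the target operations:

    #include <cassert>
    // fsub(fnmul(a, b), c) and fnmadd(a, b, c) compute the same value.
    int main() {
      auto fnmul  = [](double a, double b) { return -(a * b); };
      auto fnmadd = [](double a, double b, double c) { return -(a * b) - c; };
      double a = 2.0, b = 3.0, c = 4.0;
      assert(fnmul(a, b) - c == fnmadd(a, b, c)); // both evaluate to -10.0
      return 0;
    }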
+/// \param RC Register class of operands static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, @@ -3793,7 +3912,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MachineInstrBuilder MIB1 = BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) .addReg(ZeroReg) - .addOperand(Root.getOperand(2)); + .add(Root.getOperand(2)); InsInstrs.push_back(MIB1); InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); @@ -4013,6 +4132,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; } + + case MachineCombinerPattern::FNMULSUBS_OP1: + case MachineCombinerPattern::FNMULSUBD_OP1: { + // FNMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMADD R,A,B,C // = -A*B - C + // --- Create(FNMADD); + if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) { + Opc = AArch64::FNMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: case MachineCombinerPattern::FMULSUBD_OP2: { // FMUL I=A,B,0 @@ -4028,6 +4165,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; + } case MachineCombinerPattern::FMLSv1i32_indexed_OP2: Opc = AArch64::FMLSv1i32_indexed; @@ -4084,7 +4222,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Accumulator); } break; - } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); @@ -4094,26 +4231,36 @@ void AArch64InstrInfo::genAlternativeCodeSequence( /// \brief Replace csincr-branch sequence by simple conditional branch /// /// Examples: -/// 1. +/// 1. \code /// csinc w9, wzr, wzr, <condition code> /// tbnz w9, #0, 0x44 +/// \endcode /// to +/// \code /// b.<inverted condition code> +/// \endcode /// -/// 2. +/// 2. \code /// csinc w9, wzr, wzr, <condition code> /// tbz w9, #0, 0x44 +/// \endcode /// to +/// \code /// b.<condition code> +/// \endcode /// /// Replace compare and branch sequence by TBZ/TBNZ instruction when the /// compare's constant operand is power of 2. /// /// Examples: +/// \code /// and w8, w8, #0x400 /// cbnz w8, L1 +/// \endcode /// to +/// \code /// tbnz w8, #10, L1 +/// \endcode /// /// \param MI Conditional Branch /// \return True when the simple conditional branch is generated @@ -4286,3 +4433,207 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { {MO_TLS, "aarch64-tls"}}; return makeArrayRef(TargetFlags); } + +ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> +AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { + static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = + {{MOSuppressPair, "aarch64-suppress-pair"}, + {MOStridedAccess, "aarch64-strided-access"}}; + return makeArrayRef(TargetFlags); +} + +unsigned AArch64InstrInfo::getOutliningBenefit(size_t SequenceSize, + size_t Occurrences, + bool CanBeTailCall) const { + unsigned NotOutlinedSize = SequenceSize * Occurrences; + unsigned OutlinedSize; + + // Is this candidate something we can outline as a tail call? + if (CanBeTailCall) { + // If yes, then we just outline the sequence and replace each of its + // occurrences with a branch instruction. 
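The outlining cost model introduced at this point is easy to restate: leaving the code in place costs SequenceSize * Occurrences instructions, while outlining costs the body plus either one branch per occurrence (tail call) or a return plus a save/call/restore triple per occurrence. A compact restatement with worked numbers; the free-function form and its name are illustrative:

    // Same arithmetic as getOutliningBenefit in this patch, as a free function.
    unsigned outliningBenefit(unsigned SequenceSize, unsigned Occurrences,
                              bool CanBeTailCall) {
      unsigned NotOutlined = SequenceSize * Occurrences;
      unsigned Outlined = CanBeTailCall
                              ? SequenceSize + Occurrences            // body + branch per use
                              : (SequenceSize + 1) + 3 * Occurrences; // body + ret; save/call/restore per use
      return NotOutlined > Outlined ? NotOutlined - Outlined : 0;
    }
    // A 5-instruction sequence repeated 4 times: 20 vs. 18 as calls (benefit 2),
    // or 20 vs. 9 as tail calls (benefit 11).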
+ OutlinedSize = SequenceSize + Occurrences; + } else { + // If no, then we outline the sequence (SequenceSize), add a return (+1), + // and replace each occurrence with a save/restore to LR and a call + // (3 * Occurrences) + OutlinedSize = (SequenceSize + 1) + (3 * Occurrences); + } + + // Return the number of instructions saved by outlining this sequence. + return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0; +} + +bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const { + return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); +} + +AArch64GenInstrInfo::MachineOutlinerInstrType +AArch64InstrInfo::getOutliningType(MachineInstr &MI) const { + + MachineFunction *MF = MI.getParent()->getParent(); + AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + + // Don't outline LOHs. + if (FuncInfo->getLOHRelated().count(&MI)) + return MachineOutlinerInstrType::Illegal; + + // Don't allow debug values to impact outlining type. + if (MI.isDebugValue() || MI.isIndirectDebugValue()) + return MachineOutlinerInstrType::Invisible; + + // Is this a terminator for a basic block? + if (MI.isTerminator()) { + + // Is this the end of a function? + if (MI.getParent()->succ_empty()) + return MachineOutlinerInstrType::Legal; + + // It's not, so don't outline it. + return MachineOutlinerInstrType::Illegal; + } + + // Don't outline positions. + if (MI.isPosition()) + return MachineOutlinerInstrType::Illegal; + + // Make sure none of the operands are un-outlinable. + for (const MachineOperand &MOP : MI.operands()) + if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || + MOP.isTargetIndex()) + return MachineOutlinerInstrType::Illegal; + + // Don't outline anything that uses the link register. + if (MI.modifiesRegister(AArch64::LR, &RI) || + MI.readsRegister(AArch64::LR, &RI)) + return MachineOutlinerInstrType::Illegal; + + // Does this use the stack? + if (MI.modifiesRegister(AArch64::SP, &RI) || + MI.readsRegister(AArch64::SP, &RI)) { + + // Is it a memory operation? + if (MI.mayLoadOrStore()) { + unsigned Base; // Filled with the base regiser of MI. + int64_t Offset; // Filled with the offset of MI. + unsigned DummyWidth; + + // Does it allow us to offset the base register and is the base SP? + if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) || + Base != AArch64::SP) + return MachineOutlinerInstrType::Illegal; + + // Find the minimum/maximum offset for this instruction and check if + // fixing it up would be in range. + int64_t MinOffset, MaxOffset; + unsigned DummyScale; + getMemOpInfo(MI.getOpcode(), DummyScale, DummyWidth, MinOffset, + MaxOffset); + + // TODO: We should really test what happens if an instruction overflows. + // This is tricky to test with IR tests, but when the outliner is moved + // to a MIR test, it really ought to be checked. + if (Offset + 16 < MinOffset || Offset + 16 > MaxOffset) + return MachineOutlinerInstrType::Illegal; + + // It's in range, so we can outline it. + return MachineOutlinerInstrType::Legal; + } + + // We can't fix it up, so don't outline it. + return MachineOutlinerInstrType::Illegal; + } + + return MachineOutlinerInstrType::Legal; +} + +void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { + for (MachineInstr &MI : MBB) { + unsigned Base, Width; + int64_t Offset; + + // Is this a load or store with an immediate offset with SP as the base? 
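The fixup that follows exists because insertOutlinedCall spills LR with a 16-byte SP pre-decrement, so every SP-relative immediate inside an outlined body must grow by 16 bytes and then be rescaled by the instruction's scale. A small model of that rewrite; the helper name is illustrative:

    #include <cassert>
    // NewImm = (old byte offset + 16) / Scale, as in fixupPostOutline below.
    long long fixupSPImm(long long EncodedImm, unsigned Scale) {
      long long ByteOffset = EncodedImm * Scale;
      return (ByteOffset + 16) / Scale;
    }
    int main() {
      assert(fixupSPImm(1, 8) == 3);  // ldr x0, [sp, #8]  -> ldr x0, [sp, #24]
      assert(fixupSPImm(0, 1) == 16); // ldur w0, [sp]     -> ldur w0, [sp, #16]
      return 0;
    }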
+ if (!MI.mayLoadOrStore() || + !getMemOpBaseRegImmOfsWidth(MI, Base, Offset, Width, &RI) || + Base != AArch64::SP) + continue; + + // It is, so we have to fix it up. + unsigned Scale; + int64_t Dummy1, Dummy2; + + MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); + assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); + getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); + assert(Scale != 0 && "Unexpected opcode!"); + + // We've pushed the return address to the stack, so add 16 to the offset. + // This is safe, since we already checked if it would overflow when we + // checked if this instruction was legal to outline. + int64_t NewImm = (Offset + 16)/Scale; + StackOffsetOperand.setImm(NewImm); + } +} + +void AArch64InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const { + + // If this is a tail call outlined function, then there's already a return. + if (IsTailCall) + return; + + // It's not a tail call, so we have to insert the return ourselves. + MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) + .addReg(AArch64::LR, RegState::Undef); + MBB.insert(MBB.end(), ret); + + // Walk over the basic block and fix up all the stack accesses. + fixupPostOutline(MBB); +} + +void AArch64InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const {} + +MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( + Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, + MachineFunction &MF, bool IsTailCall) const { + + // Are we tail calling? + if (IsTailCall) { + // If yes, then we can just branch to the label. + It = MBB.insert(It, + BuildMI(MF, DebugLoc(), get(AArch64::B)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); + return It; + } + + // We're not tail calling, so we have to save LR before the call and restore + // it after. + MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::SP) + .addImm(-16); + It = MBB.insert(It, STRXpre); + It++; + + // Insert the call. + It = MBB.insert(It, + BuildMI(MF, DebugLoc(), get(AArch64::BL)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); + + It++; + + // Restore the link register. + MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) + .addReg(AArch64::SP, RegState::Define) + .addReg(AArch64::LR) + .addReg(AArch64::SP) + .addImm(16); + It = MBB.insert(It, LDRXpost); + + return It; +} + diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 5037866..1765a02 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -27,6 +27,13 @@ namespace llvm { class AArch64Subtarget; class AArch64TargetMachine; +static const MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; +static const MachineMemOperand::Flags MOStridedAccess = + MachineMemOperand::MOTargetFlag2; + +#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access" + class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -81,6 +88,9 @@ public: /// unprofitable. bool isLdStPairSuppressed(const MachineInstr &MI) const; + /// Return true if the given load or store is a strided memory access. 
+ bool isStridedAccess(const MachineInstr &MI) const; + /// Return true if this is an unscaled load/store. bool isUnscaledLdSt(unsigned Opc) const; @@ -119,6 +129,44 @@ public: } } + /// \brief Return the opcode that set flags when possible. The caller is + /// responsible for ensuring the opc has a flag setting equivalent. + static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no flag setting equivalent!"); + // 32-bit cases: + case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri; + case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr; + case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs; + case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx; + case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri; + case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr; + case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs; + case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr; + case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs; + case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri; + case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr; + case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs; + case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx; + // 64-bit cases: + case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri; + case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr; + case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs; + case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx; + case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri; + case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr; + case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs; + case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr; + case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs; + case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri; + case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr; + case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs; + case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx; + } + } + + /// Return true if this is a load/store that can be potentially paired/merged. bool isCandidateToMergeOrPair(MachineInstr &MI) const; @@ -133,12 +181,19 @@ public: int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const; + /// Return the immediate offset of the base register in a load/store \p LdSt. + MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const; + + /// \brief Returns true if opcode \p Opc is a memory operation. If it is, set + /// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly. + /// + /// For unscaled instructions, \p Scale is set to 1. 
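To make the scaled/unscaled distinction above concrete: the per-opcode table behind getMemOpInfo in this patch gives, for example, Scale 8 with offsets 0..4095 for LDRXui and Scale 1 with -256..255 for LDURXi, and the byte offset is the encoded immediate times the scale. A small illustration, assuming the table's min/max values are in encoded-immediate units as those numbers suggest; the struct and names are not from the patch:

    // Two rows of the getMemOpInfo table and the byte reach they imply.
    struct MemOpRow { unsigned Scale, Width; long long MinImm, MaxImm; };
    constexpr MemOpRow LdrXui = {8, 8, 0, 4095};   // scaled 64-bit load
    constexpr MemOpRow LdurXi = {1, 8, -256, 255}; // unscaled 64-bit load
    constexpr long long maxByteOffset(MemOpRow R) { return R.MaxImm * R.Scale; }
    static_assert(maxByteOffset(LdrXui) == 32760, "scaled reach");
    static_assert(maxByteOffset(LdurXi) == 255,   "unscaled reach");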
+ bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width, + int64_t &MinOffset, int64_t &MaxOffset) const; + bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, unsigned NumLoads) const override; - bool shouldScheduleAdjacent(const MachineInstr &First, - const MachineInstr &Second) const override; - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, const MDNode *Expr, @@ -198,7 +253,7 @@ public: const DebugLoc &DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; + void getNoop(MCInst &NopInst) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. @@ -218,8 +273,8 @@ public: /// \param Pattern - combiner pattern bool isThroughputPattern(MachineCombinerPattern Pattern) const override; /// Return true when there is potentially a faster code sequence - /// for an instruction chain ending in <Root>. All potential patterns are - /// listed in the <Patterns> array. + /// for an instruction chain ending in ``Root``. All potential patterns are + /// listed in the ``Patterns`` array. bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; @@ -244,8 +299,36 @@ public: getSerializableDirectMachineOperandTargetFlags() const override; ArrayRef<std::pair<unsigned, const char *>> getSerializableBitmaskMachineOperandTargetFlags() const override; - + ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> + getSerializableMachineMemOperandTargetFlags() const override; + + bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override; + unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences, + bool CanBeTailCall) const override; + AArch64GenInstrInfo::MachineOutlinerInstrType + getOutliningType(MachineInstr &MI) const override; + void insertOutlinerEpilogue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const override; + void insertOutlinerPrologue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool isTailCall) const override; + MachineBasicBlock::iterator + insertOutlinedCall(Module &M, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &It, + MachineFunction &MF, + bool IsTailCall) const override; + /// Returns true if the instruction has a shift by immediate that can be + /// executed in one cycle less. + bool isFalkorShiftExtFast(const MachineInstr &MI) const; private: + + /// \brief Sets the offsets on outlined instructions in \p MBB which use SP + /// so that they will be valid post-outlining. + /// + /// \param MBB A \p MachineBasicBlock in an outlined function. + void fixupPostOutline(MachineBasicBlock &MBB) const; + void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const; @@ -283,7 +366,7 @@ enum AArch64FrameOffsetStatus { /// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to /// use an offset.eq /// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. +/// rewritten in @p MI. /// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the /// amount that is off the limit of the legal offset. 
/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 2244baa..5049a39 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -30,17 +30,29 @@ def HasLSE : Predicate<"Subtarget->hasLSE()">, AssemblerPredicate<"FeatureLSE", "lse">; def HasRAS : Predicate<"Subtarget->hasRAS()">, AssemblerPredicate<"FeatureRAS", "ras">; +def HasRDM : Predicate<"Subtarget->hasRDM()">, + AssemblerPredicate<"FeatureRDM", "rdm">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; def HasSPE : Predicate<"Subtarget->hasSPE()">, AssemblerPredicate<"FeatureSPE", "spe">; +def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, + AssemblerPredicate<"FeatureFuseAES", + "fuse-aes">; +def HasSVE : Predicate<"Subtarget->hasSVE()">, + AssemblerPredicate<"FeatureSVE", "sve">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def UseAlternateSExtLoadCVTF32 : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; +def UseNegativeImmediates + : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates", + "NegativeImmediates">; + + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. // @@ -149,7 +161,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, [SDNPHasChain, SDNPOutGlue]>; def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDCallSeqEnd<[ SDTCisVT<0, i32>, @@ -305,10 +318,13 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; //===----------------------------------------------------------------------===// // AArch64 Instruction Predicate Definitions. -def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; -def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; -def ForCodeSize : Predicate<"ForCodeSize">; -def NotForCodeSize : Predicate<"!ForCodeSize">; +// We could compute these on a per-module basis but doing so requires accessing +// the Function object through the <Target>Subtarget and objections were raised +// to that (see post-commit review comments for r301750). +let RecomputePerFunction = 1 in { + def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">; + def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">; +} include "AArch64InstrFormats.td" @@ -321,8 +337,9 @@ include "AArch64InstrFormats.td" let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. 
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(AArch64callseq_start timm:$amt)]>, Sched<[]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_start timm:$amt1, timm:$amt2)]>, + Sched<[]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; @@ -424,8 +441,10 @@ def MSRpstateImm1 : MSRpstateImm0_1; def MSRpstateImm4 : MSRpstateImm0_15; // The thread pointer (on Linux, at least, where this has been implemented) is -// TPIDR_EL0. -def : Pat<(AArch64threadpointer), (MRS 0xde82)>; +// TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects. +let hasSideEffects = 0 in +def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), + [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>; // The cycle counter PMC register is PMCCNTR_EL0. let Predicates = [HasPerfMon] in @@ -574,31 +593,31 @@ def : Pat<(f64 fpimm:$in), // sequences. def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, tglobaladdr:$g1, tglobaladdr:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), - tglobaladdr:$g2, 32), - tglobaladdr:$g1, 16), - tglobaladdr:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0), + tglobaladdr:$g1, 16), + tglobaladdr:$g2, 32), + tglobaladdr:$g3, 48)>; def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, tblockaddress:$g1, tblockaddress:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), - tblockaddress:$g2, 32), - tblockaddress:$g1, 16), - tblockaddress:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0), + tblockaddress:$g1, 16), + tblockaddress:$g2, 32), + tblockaddress:$g3, 48)>; def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, tconstpool:$g1, tconstpool:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), - tconstpool:$g2, 32), - tconstpool:$g1, 16), - tconstpool:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0), + tconstpool:$g1, 16), + tconstpool:$g2, 32), + tconstpool:$g3, 48)>; def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, tjumptable:$g1, tjumptable:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), - tjumptable:$g2, 32), - tjumptable:$g1, 16), - tjumptable:$g0, 0)>; + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0), + tjumptable:$g1, 16), + tjumptable:$g2, 32), + tjumptable:$g3, 48)>; //===----------------------------------------------------------------------===// @@ -697,10 +716,10 @@ def : InstAlias<"negs $dst, $src$shift", defm UDIV : Div<0, "udiv", udiv>; defm SDIV : Div<1, "sdiv", sdiv>; -def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>; -def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>; -def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>; -def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>; +def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>; // Variable shift defm ASRV : Shift<0b10, "asr", sra>; @@ -718,7 +737,7 @@ def : ShiftAlias<"rorv", RORVWr, GPR32>; def : ShiftAlias<"rorv", RORVXr, GPR64>; // Multiply-add -let AddedComplexity = 7 in { +let AddedComplexity = 5 in { defm MADD : MulAccum<0, "madd", add>; defm MSUB : MulAccum<1, "msub", sub>; @@ 
-735,7 +754,7 @@ def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)), (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; -} // AddedComplexity = 7 +} // AddedComplexity = 5 let AddedComplexity = 5 in { def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; @@ -2577,6 +2596,11 @@ def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, Sched<[WriteF]>; } +// Similarly add aliases +def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>, + Requires<[HasFullFP16]>; +def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>; +def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>; //===----------------------------------------------------------------------===// // Floating point conversion instruction. @@ -2720,60 +2744,36 @@ defm FMOV : FPMoveImmediate<"fmov">; defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", int_aarch64_neon_uabd>; // Match UABDL in log2-shuffle patterns. +def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), + (zext (v8i8 V64:$opB))))), + (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), (v8i16 (add (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; +def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)), + (zext (extract_high_v16i8 V128:$opB))))), + (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), (zext (extract_high_v16i8 V128:$opB))), (AArch64vashr v8i16:$src, (i32 15))))), (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; -def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), - (v4i32 (add (sub (zext (v4i16 V64:$opA)), - (zext (v4i16 V64:$opB))), - (AArch64vashr v4i32:$src, (i32 31))))), +def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), + (zext (v4i16 V64:$opB))))), (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; -def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), - (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)), - (zext (extract_high_v8i16 V128:$opB))), - (AArch64vashr v4i32:$src, (i32 31))))), +def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)), + (zext (extract_high_v8i16 V128:$opB))))), (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; -def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), - (v2i64 (add (sub (zext (v2i32 V64:$opA)), - (zext (v2i32 V64:$opB))), - (AArch64vashr v2i64:$src, (i32 63))))), +def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)), + (zext (v2i32 V64:$opB))))), (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; -def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), - (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)), - (zext (extract_high_v4i32 V128:$opB))), - (AArch64vashr v2i64:$src, (i32 63))))), +def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)), + (zext (extract_high_v4i32 V128:$opB))))), (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; -defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; -def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), - (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), - (ABSv8i8 V64:$src)>; -def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))), - (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))), - (ABSv4i16 V64:$src)>; -def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))), - (v2i32 (add V64:$src, 
(AArch64vashr V64:$src, (i32 31))))), - (ABSv2i32 V64:$src)>; -def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))), - (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))), - (ABSv16i8 V128:$src)>; -def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))), - (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))), - (ABSv8i16 V128:$src)>; -def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))), - (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))), - (ABSv4i32 V128:$src)>; -def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))), - (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))), - (ABSv2i64 V128:$src)>; - +defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>; defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; @@ -3284,7 +3284,7 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl> defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; -let Predicates = [HasV8_1a] in { +let Predicates = [HasRDM] in { defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; def : Pat<(i32 (int_aarch64_neon_sqadd @@ -3345,7 +3345,7 @@ def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd), // Advanced SIMD two scalar instructions. //===----------------------------------------------------------------------===// -defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>; +defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", abs>; defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>; defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; @@ -5029,7 +5029,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST> 0), dsub)))>, Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; - + def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext), @@ -5307,6 +5307,31 @@ def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>; def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>; def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>; +// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required +// for AES fusion on some CPUs. +let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in { +def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">, + Sched<[WriteV]>; +def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">, + Sched<[WriteV]>; +} + +// Only use constrained versions of AES(I)MC instructions if they are paired with +// AESE/AESD. 
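+// As a sketch of the intent: for an encryption round written as
+//   aese  v0.16b, v1.16b
+//   aesmc v0.16b, v0.16b
+// the tied $Rn = $Rd constraint keeps the AESMC reading the register the AESE
+// just wrote, which is the arrangement fusion-capable cores look for; the
+// scheduling half of this lives in AArch64MacroFusion.cpp (HasFuseAES).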
+def : Pat<(v16i8 (int_aarch64_crypto_aesmc + (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1), + (v16i8 V128:$src2))))), + (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1), + (v16i8 V128:$src2)))))>, + Requires<[HasFuseAES]>; + +def : Pat<(v16i8 (int_aarch64_crypto_aesimc + (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1), + (v16i8 V128:$src2))))), + (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1), + (v16i8 V128:$src2)))))>, + Requires<[HasFuseAES]>; + def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>; def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>; def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp index b514735..7e275e4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -12,17 +12,20 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// -#include "AArch64InstructionSelector.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterBankInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" @@ -30,19 +33,79 @@ #define DEBUG_TYPE "aarch64-isel" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" + using namespace llvm; #ifndef LLVM_BUILD_GLOBAL_ISEL #error "You shouldn't build this" #endif +namespace { + +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + +class AArch64InstructionSelector : public InstructionSelector { +public: + AArch64InstructionSelector(const AArch64TargetMachine &TM, + const AArch64Subtarget &STI, + const AArch64RegisterBankInfo &RBI); + + bool select(MachineInstr &I) const override; + +private: + /// tblgen-erated 'select' implementation, used as the initial selector for + /// the patterns that don't require complex C++. + bool selectImpl(MachineInstr &I) const; + + bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + + bool selectCompareBranch(MachineInstr &I, MachineFunction &MF, + MachineRegisterInfo &MRI) const; + + ComplexRendererFn selectArithImmed(MachineOperand &Root) const; + + const AArch64TargetMachine &TM; + const AArch64Subtarget &STI; + const AArch64InstrInfo &TII; + const AArch64RegisterInfo &TRI; + const AArch64RegisterBankInfo &RBI; + +#define GET_GLOBALISEL_PREDICATES_DECL #include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL + +// We declare the temporaries used by selectImpl() in the class to minimize the +// cost of constructing placeholder values. 
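+// (Each GET_GLOBALISEL_* guard pulls a different slice out of the TableGen-erated
+// AArch64GenGlobalISel.inc: the predicate bitset near the top of the class, these
+// declarations, their initializers in the constructor, and finally the body of
+// selectImpl() under GET_GLOBALISEL_IMPL. That generated file is also why the
+// hand-written AArch64InstructionSelector.h is deleted further down.)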
+#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL AArch64InstructionSelector::AArch64InstructionSelector( const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI) - : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI) {} + : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), +#define GET_GLOBALISEL_PREDICATES_INIT +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "AArch64GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} // FIXME: This should be target-independent, inferred from the types declared // for each class in the bank. @@ -119,71 +182,39 @@ static bool unsupportedBinOp(const MachineInstr &I, } /// Select the AArch64 opcode for the basic binary operation \p GenericOpc -/// (such as G_OR or G_ADD), appropriate for the register bank \p RegBankID +/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID /// and of size \p OpSize. /// \returns \p GenericOpc if the combination is unsupported. static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, unsigned OpSize) { switch (RegBankID) { case AArch64::GPRRegBankID: - if (OpSize <= 32) { - assert((OpSize == 32 || (GenericOpc != TargetOpcode::G_SDIV && - GenericOpc != TargetOpcode::G_UDIV && - GenericOpc != TargetOpcode::G_LSHR && - GenericOpc != TargetOpcode::G_ASHR)) && - "operation should have been legalized before now"); - + if (OpSize == 32) { switch (GenericOpc) { - case TargetOpcode::G_OR: - return AArch64::ORRWrr; - case TargetOpcode::G_XOR: - return AArch64::EORWrr; - case TargetOpcode::G_AND: - return AArch64::ANDWrr; - case TargetOpcode::G_ADD: - assert(OpSize != 32 && "s32 G_ADD should have been selected"); - return AArch64::ADDWrr; - case TargetOpcode::G_SUB: - return AArch64::SUBWrr; case TargetOpcode::G_SHL: return AArch64::LSLVWr; case TargetOpcode::G_LSHR: return AArch64::LSRVWr; case TargetOpcode::G_ASHR: return AArch64::ASRVWr; - case TargetOpcode::G_SDIV: - return AArch64::SDIVWr; - case TargetOpcode::G_UDIV: - return AArch64::UDIVWr; default: return GenericOpc; } } else if (OpSize == 64) { switch (GenericOpc) { - case TargetOpcode::G_OR: - return AArch64::ORRXrr; - case TargetOpcode::G_XOR: - return AArch64::EORXrr; - case TargetOpcode::G_AND: - return AArch64::ANDXrr; case TargetOpcode::G_GEP: return AArch64::ADDXrr; - case TargetOpcode::G_SUB: - return AArch64::SUBXrr; case TargetOpcode::G_SHL: return AArch64::LSLVXr; case TargetOpcode::G_LSHR: return AArch64::LSRVXr; case TargetOpcode::G_ASHR: return AArch64::ASRVXr; - case TargetOpcode::G_SDIV: - return AArch64::SDIVXr; - case TargetOpcode::G_UDIV: - return AArch64::UDIVXr; default: return GenericOpc; } } + break; case AArch64::FPRRegBankID: switch (OpSize) { case 32: @@ -215,7 +246,8 @@ static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, return GenericOpc; } } - }; + break; + } return GenericOpc; } @@ -239,6 +271,7 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, case 64: return isStore ? 
AArch64::STRXui : AArch64::LDRXui; } + break; case AArch64::FPRRegBankID: switch (OpSize) { case 8: @@ -250,7 +283,8 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, case 64: return isStore ? AArch64::STRDui : AArch64::LDRDui; } - }; + break; + } return GenericOpc; } @@ -473,6 +507,82 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P, } } +bool AArch64InstructionSelector::selectCompareBranch( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + + const unsigned CondReg = I.getOperand(0).getReg(); + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + MachineInstr *CCMI = MRI.getVRegDef(CondReg); + if (CCMI->getOpcode() != TargetOpcode::G_ICMP) + return false; + + unsigned LHS = CCMI->getOperand(2).getReg(); + unsigned RHS = CCMI->getOperand(3).getReg(); + if (!getConstantVRegVal(RHS, MRI)) + std::swap(RHS, LHS); + + const auto RHSImm = getConstantVRegVal(RHS, MRI); + if (!RHSImm || *RHSImm != 0) + return false; + + const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI); + if (RB.getID() != AArch64::GPRRegBankID) + return false; + + const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate(); + if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ) + return false; + + const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits(); + unsigned CBOpc = 0; + if (CmpWidth <= 32) + CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW); + else if (CmpWidth == 64) + CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX); + else + return false; + + auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc)) + .addUse(LHS) + .addMBB(DestMBB); + + constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +bool AArch64InstructionSelector::selectVaStartAAPCS( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + return false; +} + +bool AArch64InstructionSelector::selectVaStartDarwin( + MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + unsigned ListReg = I.getOperand(0).getReg(); + + unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + + auto MIB = + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) + .addDef(ArgsAddrReg) + .addFrameIndex(FuncInfo->getVarArgsStackIndex()) + .addImm(0) + .addImm(0); + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + + MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) + .addUse(ArgsAddrReg) + .addUse(ListReg) + .addImm(0) + .addMemOperand(*I.memoperands_begin()); + + constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + bool AArch64InstructionSelector::select(MachineInstr &I) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -549,6 +659,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { const unsigned CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + if (selectCompareBranch(I, MF, MRI)) + return true; + auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) .addUse(CondReg) .addImm(/*bit offset=*/0) @@ -558,6 +671,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); } + case 
TargetOpcode::G_BRINDIRECT: { + I.setDesc(TII.get(AArch64::BR)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_CONSTANT: { const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; @@ -629,9 +747,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { // FIXME: Is going through int64_t always correct? ImmOp.ChangeToImmediate( ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); - } else { + } else if (I.getOperand(1).isCImm()) { uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); I.getOperand(1).ChangeToImmediate(Val); + } else if (I.getOperand(1).isImm()) { + uint64_t Val = I.getOperand(1).getImm(); + I.getOperand(1).ChangeToImmediate(Val); } constrainSelectedInstRegOperands(I, TII, TRI, RBI); @@ -686,10 +807,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return false; } -#ifndef NDEBUG - // Sanity-check the pointer register. + auto &MemOp = **I.memoperands_begin(); + if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { + DEBUG(dbgs() << "Atomic load/store not supported yet\n"); + return false; + } + const unsigned PtrReg = I.getOperand(1).getReg(); +#ifndef NDEBUG const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); + // Sanity-check the pointer register. assert(PtrRB.getID() == AArch64::GPRRegBankID && "Load/Store pointer operand isn't a GPR"); assert(MRI.getType(PtrReg).isPointer() && @@ -706,11 +833,46 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { I.setDesc(TII.get(NewOpc)); - I.addOperand(MachineOperand::CreateImm(0)); + uint64_t Offset = 0; + auto *PtrMI = MRI.getVRegDef(PtrReg); + + // Try to fold a GEP into our unsigned immediate addressing mode. + if (PtrMI->getOpcode() == TargetOpcode::G_GEP) { + if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) { + int64_t Imm = *COff; + const unsigned Size = MemTy.getSizeInBits() / 8; + const unsigned Scale = Log2_32(Size); + if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { + unsigned Ptr2Reg = PtrMI->getOperand(1).getReg(); + I.getOperand(1).setReg(Ptr2Reg); + PtrMI = MRI.getVRegDef(Ptr2Reg); + Offset = Imm / Size; + } + } + } + + // If we haven't folded anything into our addressing mode yet, try to fold + // a frame index into the base+offset. + if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX) + I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex()); + + I.addOperand(MachineOperand::CreateImm(Offset)); + + // If we're storing a 0, use WZR/XZR. + if (auto CVal = getConstantVRegVal(ValReg, MRI)) { + if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) { + if (I.getOpcode() == AArch64::STRWui) + I.getOperand(0).setReg(AArch64::WZR); + else if (I.getOpcode() == AArch64::STRXui) + I.getOperand(0).setReg(AArch64::XZR); + } + } + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - case TargetOpcode::G_MUL: { + case TargetOpcode::G_SMULH: + case TargetOpcode::G_UMULH: { // Reject the various things we don't support yet. 
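    // (G_SMULH/G_UMULH give the high half of a widening multiply; they are
    // typically introduced when the legalizer lowers s64 G_SMULO/G_UMULO, and
    // only the s64 form is selected here, onto SMULHrr/UMULHrr.)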
if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; @@ -719,48 +881,33 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) { - DEBUG(dbgs() << "G_MUL on bank: " << RB << ", expected: GPR\n"); + DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); return false; } - unsigned ZeroReg; - unsigned NewOpc; - if (Ty.isScalar() && Ty.getSizeInBits() <= 32) { - NewOpc = AArch64::MADDWrrr; - ZeroReg = AArch64::WZR; - } else if (Ty == LLT::scalar(64)) { - NewOpc = AArch64::MADDXrrr; - ZeroReg = AArch64::XZR; - } else { - DEBUG(dbgs() << "G_MUL has type: " << Ty << ", expected: " - << LLT::scalar(32) << " or " << LLT::scalar(64) << '\n'); + if (Ty != LLT::scalar(64)) { + DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty + << ", expected: " << LLT::scalar(64) << '\n'); return false; } + unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr + : AArch64::UMULHrr; I.setDesc(TII.get(NewOpc)); - I.addOperand(MachineOperand::CreateReg(ZeroReg, /*isDef=*/false)); - // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: case TargetOpcode::G_OR: - case TargetOpcode::G_XOR: - case TargetOpcode::G_AND: case TargetOpcode::G_SHL: case TargetOpcode::G_LSHR: case TargetOpcode::G_ASHR: - case TargetOpcode::G_SDIV: - case TargetOpcode::G_UDIV: - case TargetOpcode::G_ADD: - case TargetOpcode::G_SUB: case TargetOpcode::G_GEP: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) @@ -783,6 +930,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } + case TargetOpcode::G_PTR_MASK: { + uint64_t Align = I.getOperand(2).getImm(); + if (Align >= 64 || Align == 0) + return false; + + uint64_t Mask = ~((1ULL << Align) - 1); + I.setDesc(TII.get(AArch64::ANDXri)); + I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64)); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } case TargetOpcode::G_PTRTOINT: case TargetOpcode::G_TRUNC: { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); @@ -795,7 +953,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); if (DstRB.getID() != SrcRB.getID()) { - DEBUG(dbgs() << "G_TRUNC input/output on different banks\n"); + DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); return false; } @@ -812,16 +970,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { - DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); return false; } if (DstRC == SrcRC) { // Nothing to be done + } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && + SrcTy == LLT::scalar(64)) { + llvm_unreachable("TableGen can import this case"); + return false; } else if (DstRC == &AArch64::GPR32RegClass && SrcRC == &AArch64::GPR64RegClass) { I.getOperand(1).setSubReg(AArch64::sub_32); } else { + DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); return false; } @@ -1026,7 +1189,7 @@ bool 
AArch64InstructionSelector::select(MachineInstr &I) const { if (Ty == LLT::scalar(32)) { CSelOpc = AArch64::CSELWr; - } else if (Ty == LLT::scalar(64)) { + } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) { CSelOpc = AArch64::CSELXr; } else { return false; @@ -1134,7 +1297,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { .addDef(Def1Reg) .addUse(AArch64::WZR) .addUse(AArch64::WZR) - .addImm(CC1); + .addImm(getInvertedCondCode(CC1)); if (CC2 != AArch64CC::AL) { unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); @@ -1143,7 +1306,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { .addDef(Def2Reg) .addUse(AArch64::WZR) .addUse(AArch64::WZR) - .addImm(CC2); + .addImm(getInvertedCondCode(CC2)); MachineInstr &OrMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr)) .addDef(DefReg) @@ -1159,7 +1322,67 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { I.eraseFromParent(); return true; } + case TargetOpcode::G_VASTART: + return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI) + : selectVaStartAAPCS(I, MF, MRI); + case TargetOpcode::G_IMPLICIT_DEF: + I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + return true; } return false; } + +/// SelectArithImmed - Select an immediate value that can be represented as +/// a 12-bit value shifted left by either 0 or 12. If so, return true with +/// Val set to the 12-bit value and Shift set to the shifter operand. +InstructionSelector::ComplexRendererFn +AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { + MachineInstr &MI = *Root.getParent(); + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcode it's interested in, however + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. 
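+  // For example, #4095 is usable as-is (shift 0), #0x5000 becomes #5 with an
+  // LSL #12, and a value such as #0x1001 fits neither form, so nullptr is
+  // returned below and the immediate form of the pattern simply does not match.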
+ uint64_t Immed; + if (Root.isImm()) + Immed = Root.getImm(); + else if (Root.isCImm()) + Immed = Root.getCImm()->getZExtValue(); + else if (Root.isReg()) { + MachineInstr *Def = MRI.getVRegDef(Root.getReg()); + if (Def->getOpcode() != TargetOpcode::G_CONSTANT) + return nullptr; + MachineOperand &Op1 = Def->getOperand(1); + if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64) + return nullptr; + Immed = Op1.getCImm()->getZExtValue(); + } else + return nullptr; + + unsigned ShiftAmt; + + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return nullptr; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + return [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed).addImm(ShVal); }; +} + +namespace llvm { +InstructionSelector * +createAArch64InstructionSelector(const AArch64TargetMachine &TM, + AArch64Subtarget &Subtarget, + AArch64RegisterBankInfo &RBI) { + return new AArch64InstructionSelector(TM, Subtarget, RBI); +} +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h deleted file mode 100644 index 2c6e5a9..0000000 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h +++ /dev/null @@ -1,49 +0,0 @@ -//===- AArch64InstructionSelector --------------------------------*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file declares the targeting of the InstructionSelector class for -/// AArch64. -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H - -#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" - -namespace llvm { - -class AArch64InstrInfo; -class AArch64RegisterBankInfo; -class AArch64RegisterInfo; -class AArch64Subtarget; -class AArch64TargetMachine; - -class AArch64InstructionSelector : public InstructionSelector { -public: - AArch64InstructionSelector(const AArch64TargetMachine &TM, - const AArch64Subtarget &STI, - const AArch64RegisterBankInfo &RBI); - - bool select(MachineInstr &I) const override; - -private: - /// tblgen-erated 'select' implementation, used as the initial selector for - /// the patterns that don't require complex C++. 
- bool selectImpl(MachineInstr &I) const; - - const AArch64TargetMachine &TM; - const AArch64Subtarget &STI; - const AArch64InstrInfo &TII; - const AArch64RegisterInfo &TRI; - const AArch64RegisterBankInfo &RBI; -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 83f276a..ffb2783 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -13,9 +13,12 @@ //===----------------------------------------------------------------------===// #include "AArch64LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/Type.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Type.h" #include "llvm/Target/TargetOpcodes.h" using namespace llvm; @@ -36,11 +39,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); - for (auto BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) { + for (auto Ty : {p0, s1, s8, s16, s32, s64}) + setAction({G_IMPLICIT_DEF, Ty}, Legal); + + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) { // These operations naturally get the right answer when used on // GPR32, even if the actual type is narrower. - for (auto Ty : {s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) + for (auto Ty : {s32, s64, v2s32, v4s32, v2s64}) setAction({BinOp, Ty}, Legal); + + for (auto Ty : {s1, s8, s16}) + setAction({BinOp, Ty}, WidenScalar); } setAction({G_GEP, p0}, Legal); @@ -49,7 +58,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { for (auto Ty : {s1, s8, s16, s32}) setAction({G_GEP, 1, Ty}, WidenScalar); - for (auto BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) { + setAction({G_PTR_MASK, p0}, Legal); + + for (unsigned BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) { for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); @@ -57,25 +68,47 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({BinOp, Ty}, WidenScalar); } - for (auto BinOp : { G_SREM, G_UREM }) + for (unsigned BinOp : {G_SREM, G_UREM}) for (auto Ty : { s1, s8, s16, s32, s64 }) setAction({BinOp, Ty}, Lower); - for (auto Op : { G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULO, G_UMULO }) { + for (unsigned Op : {G_SMULO, G_UMULO}) + setAction({Op, s64}, Lower); + + for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) { for (auto Ty : { s32, s64 }) setAction({Op, Ty}, Legal); setAction({Op, 1, s1}, Legal); } - for (auto BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV}) for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); - setAction({G_FREM, s32}, Libcall); - setAction({G_FREM, s64}, Libcall); + for (unsigned BinOp : {G_FREM, G_FPOW}) { + setAction({BinOp, s32}, Libcall); + setAction({BinOp, s64}, Libcall); + } + + for (auto Ty : {s32, s64, p0}) { + setAction({G_INSERT, Ty}, Legal); + setAction({G_INSERT, 1, Ty}, Legal); + } + for (auto Ty : {s1, s8, s16}) { + setAction({G_INSERT, Ty}, WidenScalar); + setAction({G_INSERT, 1, Ty}, Legal); + // FIXME: Can't widen the sources because that violates the constraints on + // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap). 
+ } + + for (auto Ty : {s1, s8, s16, s32, s64, p0}) + setAction({G_EXTRACT, Ty}, Legal); + + for (auto Ty : {s32, s64}) + setAction({G_EXTRACT, 1, Ty}, Legal); - for (auto MemOp : {G_LOAD, G_STORE}) { + for (unsigned MemOp : {G_LOAD, G_STORE}) { for (auto Ty : {s8, s16, s32, s64, p0, v2s32}) setAction({MemOp, Ty}, Legal); @@ -141,12 +174,18 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_TRUNC, 1, Ty}, Legal); // Conversions - for (auto Ty : { s1, s8, s16, s32, s64 }) { + for (auto Ty : { s32, s64 }) { setAction({G_FPTOSI, 0, Ty}, Legal); setAction({G_FPTOUI, 0, Ty}, Legal); setAction({G_SITOFP, 1, Ty}, Legal); setAction({G_UITOFP, 1, Ty}, Legal); } + for (auto Ty : { s1, s8, s16 }) { + setAction({G_FPTOSI, 0, Ty}, WidenScalar); + setAction({G_FPTOUI, 0, Ty}, WidenScalar); + setAction({G_SITOFP, 1, Ty}, WidenScalar); + setAction({G_UITOFP, 1, Ty}, WidenScalar); + } for (auto Ty : { s32, s64 }) { setAction({G_FPTOSI, 1, Ty}, Legal); @@ -158,9 +197,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { // Control-flow for (auto Ty : {s1, s8, s16, s32}) setAction({G_BRCOND, Ty}, Legal); + setAction({G_BRINDIRECT, p0}, Legal); // Select - for (auto Ty : {s1, s8, s16, s32, s64}) + for (auto Ty : {s1, s8, s16}) + setAction({G_SELECT, Ty}, WidenScalar); + + for (auto Ty : {s32, s64, p0}) setAction({G_SELECT, Ty}, Legal); setAction({G_SELECT, 1, s1}, Legal); @@ -200,5 +243,81 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal); } + setAction({G_VASTART, p0}, Legal); + + // va_list must be a pointer, but most sized types are pretty easy to handle + // as the destination. + setAction({G_VAARG, 1, p0}, Legal); + + for (auto Ty : {s8, s16, s32, s64, p0}) + setAction({G_VAARG, Ty}, Custom); + computeTables(); } + +bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + switch (MI.getOpcode()) { + default: + // No idea what to do. + return false; + case TargetOpcode::G_VAARG: + return legalizeVaArg(MI, MRI, MIRBuilder); + } + + llvm_unreachable("expected switch to return"); +} + +bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + MIRBuilder.setInstr(MI); + MachineFunction &MF = MIRBuilder.getMF(); + unsigned Align = MI.getOperand(2).getImm(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned ListPtr = MI.getOperand(1).getReg(); + + LLT PtrTy = MRI.getType(ListPtr); + LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); + + const unsigned PtrSize = PtrTy.getSizeInBits() / 8; + unsigned List = MRI.createGenericVirtualRegister(PtrTy); + MIRBuilder.buildLoad( + List, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, + PtrSize, /* Align = */ PtrSize)); + + unsigned DstPtr; + if (Align > PtrSize) { + // Realign the list to the actual required alignment. 
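+    // i.e. DstPtr = (List + Align - 1) & ~(Align - 1), built below as a G_GEP
+    // by (Align - 1) followed by a G_PTR_MASK clearing the low log2(Align) bits.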
+ auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1); + + unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy); + MIRBuilder.buildGEP(ListTmp, List, AlignMinus1->getOperand(0).getReg()); + + DstPtr = MRI.createGenericVirtualRegister(PtrTy); + MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align)); + } else + DstPtr = List; + + uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8; + MIRBuilder.buildLoad( + Dst, DstPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad, + ValSize, std::max(Align, PtrSize))); + + unsigned SizeReg = MRI.createGenericVirtualRegister(IntPtrTy); + MIRBuilder.buildConstant(SizeReg, alignTo(ValSize, PtrSize)); + + unsigned NewList = MRI.createGenericVirtualRegister(PtrTy); + MIRBuilder.buildGEP(NewList, DstPtr, SizeReg); + + MIRBuilder.buildStore( + NewList, ListPtr, + *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore, + PtrSize, /* Align = */ PtrSize)); + + MI.eraseFromParent(); + return true; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h index feacbef..42d4ac1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h @@ -25,6 +25,13 @@ class LLVMContext; class AArch64LegalizerInfo : public LegalizerInfo { public: AArch64LegalizerInfo(); + + bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + +private: + bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const; }; } // End llvm namespace. #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 8e312dc..9a7f45b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -18,17 +18,27 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <limits> + using namespace llvm; #define DEBUG_TYPE "aarch64-ldst-opt" @@ -58,15 +68,15 @@ typedef struct LdStPairFlags { // If a matching instruction is found, MergeForward is set to true if the // merge is to remove the first instruction and replace the second with // a pair-wise insn, and false if the reverse is true. - bool MergeForward; + bool MergeForward = false; // SExtIdx gives the index of the result of the load pair that must be // extended. The value of SExtIdx assumes that the paired load produces the // value in this order: (I, returned iterator), i.e., -1 means no value has // to be extended, 0 means I, and 1 means the returned iterator. 
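  // (This situation arises when, e.g., an LDRSWui is paired with an LDRWui:
  // the two are merged into a single 32-bit load pair and the sign extension
  // has to be re-applied afterwards to whichever result needed it.)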
- int SExtIdx; + int SExtIdx = -1; - LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} + LdStPairFlags() = default; void setMergeForward(bool V = true) { MergeForward = V; } bool getMergeForward() const { return MergeForward; } @@ -78,10 +88,12 @@ typedef struct LdStPairFlags { struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; + AArch64LoadStoreOpt() : MachineFunctionPass(ID) { initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); } + AliasAnalysis *AA; const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; const AArch64Subtarget *Subtarget; @@ -89,6 +101,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Track which registers have been modified and used. BitVector ModifiedRegs, UsedRegs; + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). @@ -162,8 +179,10 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } }; + char AArch64LoadStoreOpt::ID = 0; -} // namespace + +} // end anonymous namespace INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", AARCH64_LOAD_STORE_OPT_NAME, false, false) @@ -246,7 +265,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, default: if (IsValidLdStrOpc) *IsValidLdStrOpc = false; - return UINT_MAX; + return std::numeric_limits<unsigned>::max(); case AArch64::STRDui: case AArch64::STURDi: case AArch64::STRQui: @@ -369,6 +388,10 @@ static unsigned isMatchingStore(MachineInstr &LoadInst, } static unsigned getPreIndexedOpcode(unsigned Opc) { + // FIXME: We don't currently support creating pre-indexed loads/stores when + // the load or store is the unscaled version. If we decide to perform such an + // optimization in the future the cases for the unscaled loads/stores will + // need to be added here. switch (Opc) { default: llvm_unreachable("Opcode has no pre-indexed equivalent!"); @@ -432,32 +455,42 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { default: llvm_unreachable("Opcode has no post-indexed wise equivalent!"); case AArch64::STRSui: + case AArch64::STURSi: return AArch64::STRSpost; case AArch64::STRDui: + case AArch64::STURDi: return AArch64::STRDpost; case AArch64::STRQui: + case AArch64::STURQi: return AArch64::STRQpost; case AArch64::STRBBui: return AArch64::STRBBpost; case AArch64::STRHHui: return AArch64::STRHHpost; case AArch64::STRWui: + case AArch64::STURWi: return AArch64::STRWpost; case AArch64::STRXui: + case AArch64::STURXi: return AArch64::STRXpost; case AArch64::LDRSui: + case AArch64::LDURSi: return AArch64::LDRSpost; case AArch64::LDRDui: + case AArch64::LDURDi: return AArch64::LDRDpost; case AArch64::LDRQui: + case AArch64::LDURQi: return AArch64::LDRQpost; case AArch64::LDRBBui: return AArch64::LDRBBpost; case AArch64::LDRHHui: return AArch64::LDRHHpost; case AArch64::LDRWui: + case AArch64::LDURWi: return AArch64::LDRWpost; case AArch64::LDRXui: + case AArch64::LDURXi: return AArch64::LDRXpost; case AArch64::LDRSWui: return AArch64::LDRSWpost; @@ -595,7 +628,7 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, MachineInstrBuilder MIB; MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) .addReg(isNarrowStore(Opc) ? 
AArch64::WZR : AArch64::XZR) - .addOperand(BaseRegOp) + .add(BaseRegOp) .addImm(OffsetImm) .setMemRefs(I->mergeMemRefsWith(*MergeMI)); (void)MIB; @@ -709,9 +742,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, } } MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) - .addOperand(RegOp0) - .addOperand(RegOp1) - .addOperand(BaseRegOp) + .add(RegOp0) + .add(RegOp1) + .add(BaseRegOp) .addImm(OffsetImm) .setMemRefs(I->mergeMemRefsWith(*Paired)); @@ -776,6 +809,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, int LoadSize = getMemScale(*LoadI); int StoreSize = getMemScale(*StoreI); unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + const MachineOperand &StMO = getLdStRegOp(*StoreI); unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); @@ -788,7 +822,13 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // Remove the load, if the destination register of the loads is the same // register for stored value. if (StRt == LdRt && LoadSize == 8) { - StoreI->clearRegisterKills(StRt, TRI); + for (MachineInstr &MI : make_range(StoreI->getIterator(), + LoadI->getIterator())) { + if (MI.killsRegister(StRt, TRI)) { + MI.clearRegisterKills(StRt, TRI); + break; + } + } DEBUG(dbgs() << "Remove load instruction:\n "); DEBUG(LoadI->print(dbgs())); DEBUG(dbgs() << "\n"); @@ -800,7 +840,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) - .addReg(StRt) + .add(StMO) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } else { // FIXME: Currently we disable this transformation in big-endian targets as @@ -841,14 +881,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), DestReg) - .addReg(StRt) + .add(StMO) .addImm(AndMaskEncoded); } else { BitExtMI = BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), DestReg) - .addReg(StRt) + .add(StMO) .addImm(Immr) .addImm(Imms); } @@ -857,7 +897,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // Clear kill flags between store and load. for (MachineInstr &MI : make_range(StoreI->getIterator(), BitExtMI->getIterator())) - MI.clearRegisterKills(StRt, TRI); + if (MI.killsRegister(StRt, TRI)) { + MI.clearRegisterKills(StRt, TRI); + break; + } DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); @@ -923,7 +966,7 @@ static int alignTo(int Num, int PowOf2) { } static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, - const AArch64InstrInfo *TII) { + AliasAnalysis *AA) { // One of the instructions must modify memory. 
if (!MIa.mayStore() && !MIb.mayStore()) return false; @@ -932,14 +975,14 @@ static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) return false; - return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); + return MIa.mayAlias(AA, MIb, /*UseTBAA*/false); } static bool mayAlias(MachineInstr &MIa, SmallVectorImpl<MachineInstr *> &MemInsns, - const AArch64InstrInfo *TII) { + AliasAnalysis *AA) { for (MachineInstr *MIb : MemInsns) - if (mayAlias(MIa, *MIb, TII)) + if (mayAlias(MIa, *MIb, AA)) return true; return false; @@ -997,7 +1040,7 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. - if (MI.mayStore() && mayAlias(LoadMI, MI, TII)) + if (MI.mayStore() && mayAlias(LoadMI, MI, AA)) return false; } while (MBBI != B && Count < Limit); return false; @@ -1167,7 +1210,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // first. if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && - !mayAlias(MI, MemInsns, TII)) { + !mayAlias(MI, MemInsns, AA)) { Flags.setMergeForward(false); return MBBI; } @@ -1178,7 +1221,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // into the second. if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && - !mayAlias(FirstMI, MemInsns, TII)) { + !mayAlias(FirstMI, MemInsns, AA)) { Flags.setMergeForward(true); return MBBI; } @@ -1233,19 +1276,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(*Update)) - .addOperand(getLdStRegOp(*I)) - .addOperand(getLdStBaseOp(*I)) + .add(getLdStRegOp(*Update)) + .add(getLdStRegOp(*I)) + .add(getLdStBaseOp(*I)) .addImm(Value) .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } else { // Paired instruction. int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(*Update)) - .addOperand(getLdStRegOp(*I, 0)) - .addOperand(getLdStRegOp(*I, 1)) - .addOperand(getLdStBaseOp(*I)) + .add(getLdStRegOp(*Update)) + .add(getLdStRegOp(*I, 0)) + .add(getLdStRegOp(*I, 1)) + .add(getLdStBaseOp(*I)) .addImm(Value / Scale) .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } @@ -1545,7 +1588,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, case AArch64::LDURBBi: case AArch64::LDURHHi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: if (tryToPromoteLoadFromStore(MBBI)) { Modified = true; break; @@ -1553,7 +1596,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - } } // 2) Merge adjacent zero stores into a wider store. // e.g., @@ -1666,8 +1708,9 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++NumPostFolded; break; } - // Don't know how to handle pre/post-index versions, so move to the next - // instruction. + + // Don't know how to handle unscaled pre/post-index versions below, so + // move to the next instruction. 
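+    // (The unscaled LDUR*/STUR* forms take a signed 9-bit byte offset rather
+    // than the scaled unsigned 12-bit offset; see also the FIXME on
+    // getPreIndexedOpcode earlier in this file.)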
if (TII->isUnscaledLdSt(Opc)) { ++MBBI; break; @@ -1722,6 +1765,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); // Resize the modified and used register bitfield trackers. We do this once // per function and then clear the bitfield each time we optimize a load or diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 45083df..f82b9db 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -151,13 +151,24 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, return MCOperand::createExpr(Expr); } +MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO, + MCSymbol *Sym) const { + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + return MCOperand::createExpr(Expr); +} + MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { if (Printer.TM.getTargetTriple().isOSDarwin()) return lowerSymbolOperandDarwin(MO, Sym); + if (Printer.TM.getTargetTriple().isOSBinFormatCOFF()) + return lowerSymbolOperandCOFF(MO, Sym); - assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && - "Expect Darwin or ELF target"); + assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && "Invalid target"); return lowerSymbolOperandELF(MO, Sym); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h index 1e29b80..aa30fe1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h @@ -42,6 +42,8 @@ public: MCSymbol *Sym) const; MCOperand lowerSymbolOperandELF(const MachineOperand &MO, MCSymbol *Sym) const; + MCOperand lowerSymbolOperandCOFF(const MachineOperand &MO, + MCSymbol *Sym) const; MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp new file mode 100644 index 0000000..963cfad --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -0,0 +1,166 @@ +//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AArch64 implementation of the DAG scheduling +/// mutation to pair instructions back to back. 
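+/// The mutation adds cluster edges to the scheduling DAG so that pairs a core
+/// can fuse, such as CMP followed by Bcc or ADRP followed by ADD, end up
+/// adjacent in the final schedule. Which pairs qualify is gated on subtarget
+/// features (hasArithmeticBccFusion, hasFuseAES, hasFuseLiterals, ...) below.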
+// +//===----------------------------------------------------------------------===// + +#include "AArch64MacroFusion.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +namespace { + +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII); + const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI); + + // Assume wildcards for unspecified instrs. + unsigned FirstOpcode = + FirstMI ? FirstMI->getOpcode() + : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END); + unsigned SecondOpcode = SecondMI.getOpcode(); + + if (ST.hasArithmeticBccFusion()) + // Fuse CMN, CMP, TST followed by Bcc. + if (SecondOpcode == AArch64::Bcc) + switch (FirstOpcode) { + default: + return false; + case AArch64::ADDSWri: + case AArch64::ADDSWrr: + case AArch64::ADDSXri: + case AArch64::ADDSXrr: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::SUBSWri: + case AArch64::SUBSWrr: + case AArch64::SUBSXri: + case AArch64::SUBSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + return true; + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + // Shift value can be 0 making these behave like the "rr" variant... + return !II.hasShiftedReg(*FirstMI); + case AArch64::INSTRUCTION_LIST_END: + return true; + } + + if (ST.hasArithmeticCbzFusion()) + // Fuse ALU operations followed by CBZ/CBNZ. + if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || + SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) + switch (FirstOpcode) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDWrr: + case AArch64::ADDXri: + case AArch64::ADDXrr: + case AArch64::ANDWri: + case AArch64::ANDWrr: + case AArch64::ANDXri: + case AArch64::ANDXrr: + case AArch64::EORWri: + case AArch64::EORWrr: + case AArch64::EORXri: + case AArch64::EORXrr: + case AArch64::ORRWri: + case AArch64::ORRWrr: + case AArch64::ORRXri: + case AArch64::ORRXrr: + case AArch64::SUBWri: + case AArch64::SUBWrr: + case AArch64::SUBXri: + case AArch64::SUBXrr: + return true; + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + // Shift value can be 0 making these behave like the "rr" variant... + return !II.hasShiftedReg(*FirstMI); + case AArch64::INSTRUCTION_LIST_END: + return true; + } + + if (ST.hasFuseAES()) + // Fuse AES crypto operations. + switch(SecondOpcode) { + // AES encode. + case AArch64::AESMCrr: + case AArch64::AESMCrrTied: + return FirstOpcode == AArch64::AESErr || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; + // AES decode. + case AArch64::AESIMCrr: + case AArch64::AESIMCrrTied: + return FirstOpcode == AArch64::AESDrr || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; + } + + if (ST.hasFuseLiterals()) + // Fuse literal generation operations. 
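+    // e.g. an ADRP/ADD pair forming a PC-relative address, or the two halves
+    // of a 32-bit immediate:
+    //   movz w0, #0x1234
+    //   movk w0, #0x5678, lsl #16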
+ switch (SecondOpcode) { + // PC relative address. + case AArch64::ADDXri: + return FirstOpcode == AArch64::ADRP || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; + // 32 bit immediate. + case AArch64::MOVKWi: + return (FirstOpcode == AArch64::MOVZWi && + SecondMI.getOperand(3).getImm() == 16) || + FirstOpcode == AArch64::INSTRUCTION_LIST_END; + // Lower and upper half of 64 bit immediate. + case AArch64::MOVKXi: + return FirstOpcode == AArch64::INSTRUCTION_LIST_END || + (FirstOpcode == AArch64::MOVZXi && + SecondMI.getOperand(3).getImm() == 16) || + (FirstOpcode == AArch64::MOVKXi && + FirstMI->getOperand(3).getImm() == 32 && + SecondMI.getOperand(3).getImm() == 48); + } + + return false; +} + +} // end namespace + + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation () { + return createMacroFusionDAGMutation(shouldScheduleAdjacent); +} + +} // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h new file mode 100644 index 0000000..32d90d4 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h @@ -0,0 +1,24 @@ +//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AArch64 definition of the DAG scheduling +/// mutation to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +/// Note that you have to add: +/// DAG.addMutation(createAArch64MacroFusionDAGMutation()); +/// to AArch64PassConfig::createMachineScheduler() to have an effect. 
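+///
+/// A minimal sketch of that hook (assuming the usual createGenericSchedLive
+/// setup) looks like:
+///   ScheduleDAGInstrs *createMachineScheduler(MachineSchedContext *C) const {
+///     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+///     DAG->addMutation(createAArch64MacroFusionDAGMutation());
+///     return DAG;
+///   }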
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation(); + +} // llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index 038162c..fe4ef4b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -17,8 +17,8 @@ #define DEBUG_TYPE "aarch64-pbqp" -#include "AArch64.h" #include "AArch64PBQPRegAlloc.h" +#include "AArch64.h" #include "AArch64RegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h index 4f656f9..b99c1d1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h @@ -1,4 +1,4 @@ -//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===// +//==- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -15,6 +15,8 @@ namespace llvm { +class TargetRegisterInfo; + /// Add the accumulator chaining constraint to a PBQP graph class A57ChainingConstraint : public PBQPRAConstraint { public: @@ -33,6 +35,7 @@ private: // Add constraints between existing chains void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra); }; -} + +} // end namespace llvm #endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp index 8f45e6a..4e65c0a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -12,13 +12,14 @@ // CBZW %W0, <BB#2> // BB#2: // %W0 = COPY %WZR -// This pass should be run after register allocation. +// Similarly, this pass also handles non-zero copies. +// BB#0: +// cmp x0, #1 +// b.eq .LBB0_1 +// .LBB0_1: +// orr x0, xzr, #0x1 // -// FIXME: This should be extended to handle any constant other than zero. E.g., -// cmp w0, #1 -// b.eq .BB1 -// BB1: -// mov w0, #1 +// This pass should be run after register allocation. // // FIXME: This could also be extended to check the whole dominance subtree below // the comparison if the compile time regression is acceptable. 
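// In the non-zero example above, x0 is known to equal 1 on entry to .LBB0_1
// (the b.eq was taken), so the 'orr x0, xzr, #0x1' (i.e. mov x0, #1) simply
// re-materializes a value the register already holds and can be removed.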
@@ -26,6 +27,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -43,6 +45,7 @@ namespace { class AArch64RedundantCopyElimination : public MachineFunctionPass { const MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; + BitVector ClobberedRegs; public: static char ID; @@ -50,6 +53,16 @@ public: initializeAArch64RedundantCopyEliminationPass( *PassRegistry::getPassRegistry()); } + + struct RegImm { + MCPhysReg Reg; + int32_t Imm; + RegImm(MCPhysReg Reg, int32_t Imm) : Reg(Reg), Imm(Imm) {} + }; + + Optional<RegImm> knownRegValInBlock(MachineInstr &CondBr, + MachineBasicBlock *MBB, + MachineBasicBlock::iterator &FirstUse); bool optimizeCopy(MachineBasicBlock *MBB); bool runOnMachineFunction(MachineFunction &MF) override; MachineFunctionProperties getRequiredProperties() const override { @@ -66,18 +79,121 @@ char AArch64RedundantCopyElimination::ID = 0; INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim", "AArch64 redundant copy elimination pass", false, false) -static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) { - unsigned Opc = MI.getOpcode(); +/// Remember what registers the specified instruction modifies. +static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs, + const TargetRegisterInfo *TRI) { + for (const MachineOperand &MO : MI.operands()) { + if (MO.isRegMask()) { + ClobberedRegs.setBitsNotInMask(MO.getRegMask()); + continue; + } + + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + if (!MO.isDef()) + continue; + + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + ClobberedRegs.set(*AI); + } +} + +/// It's possible to determine the value of a register based on a dominating +/// condition. To do so, this function checks to see if the basic block \p MBB +/// is the target to which a conditional branch \p CondBr jumps and whose +/// equality comparison is against a constant. If so, return a known physical +/// register and constant value pair. Otherwise, return None. +Optional<AArch64RedundantCopyElimination::RegImm> +AArch64RedundantCopyElimination::knownRegValInBlock( + MachineInstr &CondBr, MachineBasicBlock *MBB, + MachineBasicBlock::iterator &FirstUse) { + unsigned Opc = CondBr.getOpcode(); + // Check if the current basic block is the target block to which the // CBZ/CBNZ instruction jumps when its Wt/Xt is zero. - if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) && - MBB == MI.getOperand(1).getMBB()) - return true; - else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) && - MBB != MI.getOperand(1).getMBB()) - return true; - - return false; + if (((Opc == AArch64::CBZW || Opc == AArch64::CBZX) && + MBB == CondBr.getOperand(1).getMBB()) || + ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) && + MBB != CondBr.getOperand(1).getMBB())) { + FirstUse = CondBr; + return RegImm(CondBr.getOperand(0).getReg(), 0); + } + + // Otherwise, must be a conditional branch. + if (Opc != AArch64::Bcc) + return None; + + // Must be an equality check (i.e., == or !=). 
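+  // Only B.EQ/B.NE pin the compared register to an exact value on one of the
+  // edges; other condition codes merely bound it, which is not enough to prove
+  // a later constant materialization redundant.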
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)CondBr.getOperand(0).getImm(); + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return None; + + MachineBasicBlock *BrTarget = CondBr.getOperand(1).getMBB(); + if ((CC == AArch64CC::EQ && BrTarget != MBB) || + (CC == AArch64CC::NE && BrTarget == MBB)) + return None; + + // Stop if we get to the beginning of PredMBB. + MachineBasicBlock *PredMBB = *MBB->pred_begin(); + assert(PredMBB == CondBr.getParent() && + "Conditional branch not in predecessor block!"); + if (CondBr == PredMBB->begin()) + return None; + + // Registers clobbered in PredMBB between CondBr instruction and current + // instruction being checked in loop. + ClobberedRegs.reset(); + + // Find compare instruction that sets NZCV used by CondBr. + MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator(); + for (MachineInstr &PredI : make_range(std::next(RIt), PredMBB->rend())) { + + // Track clobbered registers. + trackRegDefs(PredI, ClobberedRegs, TRI); + + bool IsCMN = false; + switch (PredI.getOpcode()) { + default: + break; + + // CMN is an alias for ADDS with a dead destination register. + case AArch64::ADDSWri: + case AArch64::ADDSXri: + IsCMN = true; + LLVM_FALLTHROUGH; + // CMP is an alias for SUBS with a dead destination register. + case AArch64::SUBSWri: + case AArch64::SUBSXri: { + MCPhysReg SrcReg = PredI.getOperand(1).getReg(); + + // Must not be a symbolic immediate. + if (!PredI.getOperand(2).isImm()) + return None; + + // The src register must not be modified between the cmp and conditional + // branch. This includes a self-clobbering compare. + if (ClobberedRegs[SrcReg]) + return None; + + // We've found the Cmp that sets NZCV. + int32_t KnownImm = PredI.getOperand(2).getImm(); + int32_t Shift = PredI.getOperand(3).getImm(); + KnownImm <<= Shift; + if (IsCMN) + KnownImm = -KnownImm; + FirstUse = PredI; + return RegImm(SrcReg, KnownImm); + } + } + + // Bail if we see an instruction that defines NZCV that we don't handle. + if (PredI.definesRegister(AArch64::NZCV)) + return None; + } + return None; } bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { @@ -85,79 +201,187 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { if (MBB->pred_size() != 1) return false; + // Check if the predecessor has two successors, implying the block ends in a + // conditional branch. MachineBasicBlock *PredMBB = *MBB->pred_begin(); - MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr(); - if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2) + if (PredMBB->succ_size() != 2) + return false; + + MachineBasicBlock::iterator CondBr = PredMBB->getLastNonDebugInstr(); + if (CondBr == PredMBB->end()) return false; - ++CompBr; + // Keep track of the earliest point in the PredMBB block where kill markers + // need to be removed if a COPY is removed. + MachineBasicBlock::iterator FirstUse; + // After calling knownRegValInBlock, FirstUse will either point to a CBZ/CBNZ + // or a compare (i.e., SUBS). In the latter case, we must take care when + // updating FirstUse when scanning for COPY instructions. In particular, if + // there's a COPY in between the compare and branch the COPY should not + // update FirstUse. + bool SeenFirstUse = false; + // Registers that contain a known value at the start of MBB. 
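Note: the compare handling above recovers the constant from the SUBS/ADDS immediate form: the encoded immediate is shifted into place, and because CMN is ADDS with a dead destination, equality after CMN means the register holds the negated immediate. The same arithmetic in isolation:

#include <cstdint>

// Value the register must hold for "b.eq" to be taken after
// "cmp reg, #Imm, lsl #Shift" (or "cmn reg, #Imm, lsl #Shift" when IsCMN).
static int32_t comparedConstant(int32_t Imm, unsigned Shift, bool IsCMN) {
  int32_t Val = Imm << Shift;
  return IsCMN ? -Val : Val;
}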
+ SmallVector<RegImm, 4> KnownRegs; + + MachineBasicBlock::iterator Itr = std::next(CondBr); do { - --CompBr; - if (guaranteesZeroRegInBlock(*CompBr, MBB)) - break; - } while (CompBr != PredMBB->begin() && CompBr->isTerminator()); + --Itr; - // We've not found a CBZ/CBNZ, time to bail out. - if (!guaranteesZeroRegInBlock(*CompBr, MBB)) - return false; + Optional<RegImm> KnownRegImm = knownRegValInBlock(*Itr, MBB, FirstUse); + if (KnownRegImm == None) + continue; - unsigned TargetReg = CompBr->getOperand(0).getReg(); - if (!TargetReg) - return false; - assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) && - "Expect physical register"); + KnownRegs.push_back(*KnownRegImm); + + // Reset the clobber list, which is used by knownRegValInBlock. + ClobberedRegs.reset(); + + // Look backward in PredMBB for COPYs from the known reg to find other + // registers that are known to be a constant value. + for (auto PredI = Itr;; --PredI) { + if (FirstUse == PredI) + SeenFirstUse = true; + + if (PredI->isCopy()) { + MCPhysReg CopyDstReg = PredI->getOperand(0).getReg(); + MCPhysReg CopySrcReg = PredI->getOperand(1).getReg(); + for (auto &KnownReg : KnownRegs) { + if (ClobberedRegs[KnownReg.Reg]) + continue; + // If we have X = COPY Y, and Y is known to be zero, then now X is + // known to be zero. + if (CopySrcReg == KnownReg.Reg && !ClobberedRegs[CopyDstReg]) { + KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm)); + if (SeenFirstUse) + FirstUse = PredI; + break; + } + // If we have X = COPY Y, and X is known to be zero, then now Y is + // known to be zero. + if (CopyDstReg == KnownReg.Reg && !ClobberedRegs[CopySrcReg]) { + KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm)); + if (SeenFirstUse) + FirstUse = PredI; + break; + } + } + } + + // Stop if we get to the beginning of PredMBB. + if (PredI == PredMBB->begin()) + break; + + trackRegDefs(*PredI, ClobberedRegs, TRI); + // Stop if all of the known-zero regs have been clobbered. + if (all_of(KnownRegs, [&](RegImm KnownReg) { + return ClobberedRegs[KnownReg.Reg]; + })) + break; + } + break; + + } while (Itr != PredMBB->begin() && Itr->isTerminator()); - // Remember all registers aliasing with TargetReg. - SmallSetVector<unsigned, 8> TargetRegs; - for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI) - TargetRegs.insert(*AI); + // We've not found a registers with a known value, time to bail out. + if (KnownRegs.empty()) + return false; bool Changed = false; + // UsedKnownRegs is the set of KnownRegs that have had uses added to MBB. + SmallSetVector<unsigned, 4> UsedKnownRegs; MachineBasicBlock::iterator LastChange = MBB->begin(); - unsigned SmallestDef = TargetReg; - // Remove redundant Copy instructions unless TargetReg is modified. + // Remove redundant Copy instructions unless KnownReg is modified. for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { MachineInstr *MI = &*I; ++I; - if (MI->isCopy() && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg()) { - - unsigned DefReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); - - if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) && - !MRI->isReserved(DefReg) && - (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) { - DEBUG(dbgs() << "Remove redundant Copy : "); - DEBUG((MI)->print(dbgs())); - - MI->eraseFromParent(); - Changed = true; - LastChange = I; - NumCopiesRemoved++; - SmallestDef = - TRI->isSubRegister(SmallestDef, DefReg) ? 
DefReg : SmallestDef; - continue; + bool RemovedMI = false; + bool IsCopy = MI->isCopy(); + bool IsMoveImm = MI->isMoveImmediate(); + if (IsCopy || IsMoveImm) { + MCPhysReg DefReg = MI->getOperand(0).getReg(); + MCPhysReg SrcReg = IsCopy ? MI->getOperand(1).getReg() : 0; + int64_t SrcImm = IsMoveImm ? MI->getOperand(1).getImm() : 0; + if (!MRI->isReserved(DefReg) && + ((IsCopy && (SrcReg == AArch64::XZR || SrcReg == AArch64::WZR)) || + IsMoveImm)) { + for (RegImm &KnownReg : KnownRegs) { + if (KnownReg.Reg != DefReg && + !TRI->isSuperRegister(DefReg, KnownReg.Reg)) + continue; + + // For a copy, the known value must be a zero. + if (IsCopy && KnownReg.Imm != 0) + continue; + + if (IsMoveImm) { + // For a move immediate, the known immediate must match the source + // immediate. + if (KnownReg.Imm != SrcImm) + continue; + + // Don't remove a move immediate that implicitly defines the upper + // bits when only the lower 32 bits are known. + MCPhysReg CmpReg = KnownReg.Reg; + if (any_of(MI->implicit_operands(), [CmpReg](MachineOperand &O) { + return !O.isDead() && O.isReg() && O.isDef() && + O.getReg() != CmpReg; + })) + continue; + } + + if (IsCopy) + DEBUG(dbgs() << "Remove redundant Copy : " << *MI); + else + DEBUG(dbgs() << "Remove redundant Move : " << *MI); + + MI->eraseFromParent(); + Changed = true; + LastChange = I; + NumCopiesRemoved++; + UsedKnownRegs.insert(KnownReg.Reg); + RemovedMI = true; + break; + } } } - if (MI->modifiesRegister(TargetReg, TRI)) + // Skip to the next instruction if we removed the COPY/MovImm. + if (RemovedMI) + continue; + + // Remove any regs the MI clobbers from the KnownConstRegs set. + for (unsigned RI = 0; RI < KnownRegs.size();) + if (MI->modifiesRegister(KnownRegs[RI].Reg, TRI)) { + std::swap(KnownRegs[RI], KnownRegs[KnownRegs.size() - 1]); + KnownRegs.pop_back(); + // Don't increment RI since we need to now check the swapped-in + // KnownRegs[RI]. + } else { + ++RI; + } + + // Continue until the KnownRegs set is empty. + if (KnownRegs.empty()) break; } if (!Changed) return false; - // Otherwise, we have to fixup the use-def chain, starting with the - // CBZ/CBNZ. Conservatively mark as much as we can live. - CompBr->clearRegisterKills(SmallestDef, TRI); + // Add newly used regs to the block's live-in list if they aren't there + // already. + for (MCPhysReg KnownReg : UsedKnownRegs) + if (!MBB->isLiveIn(KnownReg)) + MBB->addLiveIn(KnownReg); - if (none_of(TargetRegs, [&](unsigned Reg) { return MBB->isLiveIn(Reg); })) - MBB->addLiveIn(TargetReg); - - // Clear any kills of TargetReg between CompBr and the last removed COPY. + // Clear kills in the range where changes were made. This is conservative, + // but should be okay since kill markers are being phased out. + DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse + << "\tLastChange: " << *LastChange); + for (MachineInstr &MMI : make_range(FirstUse, PredMBB->end())) + MMI.clearKillInfo(); for (MachineInstr &MMI : make_range(MBB->begin(), LastChange)) - MMI.clearRegisterKills(SmallestDef, TRI); + MMI.clearKillInfo(); return true; } @@ -168,6 +392,11 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction( return false; TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); + + // Resize the clobber register bitfield tracker. We do this once per + // function and then clear the bitfield each time we optimize a copy. 
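Note: while scanning MBB forward, the pass drops any tracked register that the current instruction clobbers using the swap-with-back-and-pop idiom, since the order of KnownRegs does not matter and erasing from the middle would shift the vector. A self-contained sketch of that removal loop (toy types, not the pass itself):

#include <cstddef>
#include <utility>
#include <vector>

// Drop every tracked register the current instruction clobbers. The element
// swapped into position I is re-examined on the next iteration, so nothing
// is skipped.
static void dropClobbered(std::vector<unsigned> &TrackedRegs,
                          const std::vector<bool> &Clobbered) {
  for (std::size_t I = 0; I < TrackedRegs.size();) {
    if (Clobbered[TrackedRegs[I]]) {
      std::swap(TrackedRegs[I], TrackedRegs.back());
      TrackedRegs.pop_back();
    } else {
      ++I;
    }
  }
}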
+ ClobberedRegs.resize(TRI->getNumRegs()); + bool Changed = false; for (MachineBasicBlock &MBB : MF) Changed |= optimizeCopy(&MBB); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index b292c9c..69124db 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -1,4 +1,4 @@ -//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==// +//===- AArch64RegisterBankInfo.cpp ----------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,13 +13,24 @@ //===----------------------------------------------------------------------===// #include "AArch64RegisterBankInfo.h" -#include "AArch64InstrInfo.h" // For XXXRegClassID. -#include "llvm/CodeGen/LowLevelType.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "AArch64InstrInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetOpcodes.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <cassert> + +#define GET_TARGET_REGBANK_IMPL +#include "AArch64GenRegisterBank.inc" // This file will be TableGen'ed at some point. #include "AArch64GenRegisterBankInfo.def" @@ -31,7 +42,7 @@ using namespace llvm; #endif AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) - : RegisterBankInfo(AArch64::RegBanks, AArch64::NumRegisterBanks) { + : AArch64GenRegisterBankInfo() { static bool AlreadyInit = false; // We have only one set of register banks, whatever the subtarget // is. Therefore, the initialization of the RegBanks table should be @@ -78,44 +89,21 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) // Check that the TableGen'ed like file is in sync we our expectations. // First, the Idx. 
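Note: the hand-written assertion chain that follows is replaced by helpers such as checkPartialMappingIdx, which verify that the index aliases (PMI_FirstGPR..PMI_LastGPR, PMI_FirstFPR..PMI_LastFPR) name a contiguous, correctly ordered range. A standalone sketch of that kind of check (toy helper, not the TableGen'ed code):

#include <initializer_list>

// Verify that Order is exactly the contiguous range [First, Last].
static bool isContiguousRange(int First, int Last,
                              std::initializer_list<int> Order) {
  int Expected = First;
  for (int Idx : Order)
    if (Idx != Expected++)
      return false;
  return Expected - 1 == Last;
}

// e.g. isContiguousRange(PMI_FirstGPR, PMI_LastGPR, {PMI_GPR32, PMI_GPR64})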
- assert(AArch64::PartialMappingIdx::PMI_GPR32 == - AArch64::PartialMappingIdx::PMI_FirstGPR && - "GPR32 index not first in the GPR list"); - assert(AArch64::PartialMappingIdx::PMI_GPR64 == - AArch64::PartialMappingIdx::PMI_LastGPR && - "GPR64 index not last in the GPR list"); - assert(AArch64::PartialMappingIdx::PMI_FirstGPR <= - AArch64::PartialMappingIdx::PMI_LastGPR && - "GPR list is backward"); - assert(AArch64::PartialMappingIdx::PMI_FPR32 == - AArch64::PartialMappingIdx::PMI_FirstFPR && - "FPR32 index not first in the FPR list"); - assert(AArch64::PartialMappingIdx::PMI_FPR512 == - AArch64::PartialMappingIdx::PMI_LastFPR && - "FPR512 index not last in the FPR list"); - assert(AArch64::PartialMappingIdx::PMI_FirstFPR <= - AArch64::PartialMappingIdx::PMI_LastFPR && - "FPR list is backward"); - assert(AArch64::PartialMappingIdx::PMI_FPR32 + 1 == - AArch64::PartialMappingIdx::PMI_FPR64 && - AArch64::PartialMappingIdx::PMI_FPR64 + 1 == - AArch64::PartialMappingIdx::PMI_FPR128 && - AArch64::PartialMappingIdx::PMI_FPR128 + 1 == - AArch64::PartialMappingIdx::PMI_FPR256 && - AArch64::PartialMappingIdx::PMI_FPR256 + 1 == - AArch64::PartialMappingIdx::PMI_FPR512 && - "FPR indices not properly ordered"); + assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR, + {PMI_GPR32, PMI_GPR64}) && + "PartialMappingIdx's are incorrectly ordered"); + assert(checkPartialMappingIdx( + PMI_FirstFPR, PMI_LastFPR, + {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) && + "PartialMappingIdx's are incorrectly ordered"); // Now, the content. // Check partial mapping. #define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \ do { \ - const PartialMapping &Map = \ - AArch64::PartMappings[AArch64::PartialMappingIdx::Idx - \ - AArch64::PartialMappingIdx::PMI_Min]; \ - (void)Map; \ - assert(Map.StartIdx == ValStartIdx && Map.Length == ValLength && \ - Map.RegBank == &RB && #Idx " is incorrectly initialized"); \ - } while (0) + assert( \ + checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \ + #Idx " is incorrectly initialized"); \ + } while (false) CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR); CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR); @@ -128,17 +116,11 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) // Check value mapping. 
#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \ do { \ - unsigned PartialMapBaseIdx = \ - AArch64::PartialMappingIdx::PMI_##RBName##Size - \ - AArch64::PartialMappingIdx::PMI_Min; \ - (void)PartialMapBaseIdx; \ - const ValueMapping &Map = AArch64::getValueMapping( \ - AArch64::PartialMappingIdx::PMI_First##RBName, Size)[Offset]; \ - (void)Map; \ - assert(Map.BreakDown == &AArch64::PartMappings[PartialMapBaseIdx] && \ - Map.NumBreakDowns == 1 && #RBName #Size \ - " " #Offset " is incorrectly initialized"); \ - } while (0) + assert(checkValueMapImpl(PartialMappingIdx::PMI_##RBName##Size, \ + PartialMappingIdx::PMI_First##RBName, Size, \ + Offset) && \ + #RBName #Size " " #Offset " is incorrectly initialized"); \ + } while (false) #define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0) @@ -157,7 +139,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) CHECK_VALUEMAP_IMPL(RBName, Size, 0); \ CHECK_VALUEMAP_IMPL(RBName, Size, 1); \ CHECK_VALUEMAP_IMPL(RBName, Size, 2); \ - } while (0) + } while (false) CHECK_VALUEMAP_3OPS(GPR, 32); CHECK_VALUEMAP_3OPS(GPR, 64); @@ -169,24 +151,23 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) #define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \ do { \ - unsigned PartialMapDstIdx = \ - AArch64::PMI_##RBNameDst##Size - AArch64::PMI_Min; \ - unsigned PartialMapSrcIdx = \ - AArch64::PMI_##RBNameSrc##Size - AArch64::PMI_Min; \ - (void) PartialMapDstIdx; \ - (void) PartialMapSrcIdx; \ - const ValueMapping *Map = AArch64::getCopyMapping( \ - AArch64::PMI_First##RBNameDst == AArch64::PMI_FirstGPR, \ - AArch64::PMI_First##RBNameSrc == AArch64::PMI_FirstGPR, Size); \ - (void) Map; \ - assert(Map[0].BreakDown == &AArch64::PartMappings[PartialMapDstIdx] && \ + unsigned PartialMapDstIdx = PMI_##RBNameDst##Size - PMI_Min; \ + unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \ + (void)PartialMapDstIdx; \ + (void)PartialMapSrcIdx; \ + const ValueMapping *Map = getCopyMapping( \ + AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \ + (void)Map; \ + assert(Map[0].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \ Map[0].NumBreakDowns == 1 && #RBNameDst #Size \ " Dst is incorrectly initialized"); \ - assert(Map[1].BreakDown == &AArch64::PartMappings[PartialMapSrcIdx] && \ + assert(Map[1].BreakDown == \ + &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \ Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \ " Src is incorrectly initialized"); \ \ - } while (0) + } while (false) CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32); CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32); @@ -279,17 +260,15 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( if (MI.getNumOperands() != 3) break; InstructionMappings AltMappings; - InstructionMapping GPRMapping( - /*ID*/ 1, /*Cost*/ 1, - AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size), + const InstructionMapping &GPRMapping = getInstructionMapping( + /*ID*/ 1, /*Cost*/ 1, getValueMapping(PMI_FirstGPR, Size), /*NumOperands*/ 3); - InstructionMapping FPRMapping( - /*ID*/ 2, /*Cost*/ 1, - AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size), + const InstructionMapping &FPRMapping = getInstructionMapping( + /*ID*/ 2, /*Cost*/ 1, getValueMapping(PMI_FirstFPR, Size), /*NumOperands*/ 3); - AltMappings.emplace_back(std::move(GPRMapping)); - AltMappings.emplace_back(std::move(FPRMapping)); + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); return AltMappings; } 
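Note: getInstrAlternativeMappings now fills its result with pointers to mappings owned and uniqued by RegisterBankInfo (obtained through getInstructionMapping) instead of moving locally built objects into the list. A toy sketch of that interning pattern, with hypothetical names:

#include <map>
#include <utility>
#include <vector>

struct Mapping {
  unsigned ID;
  unsigned Cost;
};

// Owns one canonical Mapping per (ID, Cost); node-based map storage keeps
// the returned references stable.
class MappingPool {
  std::map<std::pair<unsigned, unsigned>, Mapping> Pool;

public:
  const Mapping &get(unsigned ID, unsigned Cost) {
    return Pool.try_emplace({ID, Cost}, Mapping{ID, Cost}).first->second;
  }
};

// Alternatives become cheap pointers into the pool rather than copies.
static std::vector<const Mapping *> alternatives(MappingPool &Pool) {
  return {&Pool.get(/*ID=*/1, /*Cost=*/1),   // e.g. the all-GPR mapping
          &Pool.get(/*ID=*/2, /*Cost=*/1)};  // e.g. the all-FPR mapping
}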
case TargetOpcode::G_BITCAST: { @@ -303,29 +282,29 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( break; InstructionMappings AltMappings; - InstructionMapping GPRMapping( + const InstructionMapping &GPRMapping = getInstructionMapping( /*ID*/ 1, /*Cost*/ 1, - AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ true, Size), + getCopyMapping(AArch64::GPRRegBankID, AArch64::GPRRegBankID, Size), /*NumOperands*/ 2); - InstructionMapping FPRMapping( + const InstructionMapping &FPRMapping = getInstructionMapping( /*ID*/ 2, /*Cost*/ 1, - AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ false, Size), + getCopyMapping(AArch64::FPRRegBankID, AArch64::FPRRegBankID, Size), /*NumOperands*/ 2); - InstructionMapping GPRToFPRMapping( + const InstructionMapping &GPRToFPRMapping = getInstructionMapping( /*ID*/ 3, /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), - AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ true, Size), + getCopyMapping(AArch64::FPRRegBankID, AArch64::GPRRegBankID, Size), /*NumOperands*/ 2); - InstructionMapping FPRToGPRMapping( + const InstructionMapping &FPRToGPRMapping = getInstructionMapping( /*ID*/ 3, /*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size), - AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ false, Size), + getCopyMapping(AArch64::GPRRegBankID, AArch64::FPRRegBankID, Size), /*NumOperands*/ 2); - AltMappings.emplace_back(std::move(GPRMapping)); - AltMappings.emplace_back(std::move(FPRMapping)); - AltMappings.emplace_back(std::move(GPRToFPRMapping)); - AltMappings.emplace_back(std::move(FPRToGPRMapping)); + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); + AltMappings.push_back(&GPRToFPRMapping); + AltMappings.push_back(&FPRToGPRMapping); return AltMappings; } case TargetOpcode::G_LOAD: { @@ -339,23 +318,21 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings( break; InstructionMappings AltMappings; - InstructionMapping GPRMapping( + const InstructionMapping &GPRMapping = getInstructionMapping( /*ID*/ 1, /*Cost*/ 1, - getOperandsMapping( - {AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size), - // Addresses are GPR 64-bit. - AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}), + getOperandsMapping({getValueMapping(PMI_FirstGPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, 64)}), /*NumOperands*/ 2); - InstructionMapping FPRMapping( + const InstructionMapping &FPRMapping = getInstructionMapping( /*ID*/ 2, /*Cost*/ 1, - getOperandsMapping( - {AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size), - // Addresses are GPR 64-bit. - AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}), + getOperandsMapping({getValueMapping(PMI_FirstFPR, Size), + // Addresses are GPR 64-bit. + getValueMapping(PMI_FirstGPR, 64)}), /*NumOperands*/ 2); - AltMappings.emplace_back(std::move(GPRMapping)); - AltMappings.emplace_back(std::move(FPRMapping)); + AltMappings.push_back(&GPRMapping); + AltMappings.push_back(&FPRMapping); return AltMappings; } default: @@ -369,13 +346,12 @@ void AArch64RegisterBankInfo::applyMappingImpl( switch (OpdMapper.getMI().getOpcode()) { case TargetOpcode::G_OR: case TargetOpcode::G_BITCAST: - case TargetOpcode::G_LOAD: { + case TargetOpcode::G_LOAD: // Those ID must match getInstrAlternativeMappings. 
assert((OpdMapper.getInstrMapping().getID() >= 1 && OpdMapper.getInstrMapping().getID() <= 4) && "Don't know how to handle that ID"); return applyDefaultMapping(OpdMapper); - } default: llvm_unreachable("Don't know how to handle that operation"); } @@ -397,8 +373,9 @@ static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { return false; } -RegisterBankInfo::InstructionMapping -AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) { +const RegisterBankInfo::InstructionMapping & +AArch64RegisterBankInfo::getSameKindOfOperandsMapping( + const MachineInstr &MI) const { const unsigned Opc = MI.getOpcode(); const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -411,6 +388,8 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) { unsigned Size = Ty.getSizeInBits(); bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc); + PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR; + #ifndef NDEBUG // Make sure all the operands are using similar size and type. // Should probably be checked by the machine verifier. @@ -422,23 +401,22 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) { // for each types. for (unsigned Idx = 1; Idx != NumOperands; ++Idx) { LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg()); - assert(AArch64::getRegBankBaseIdxOffset(OpTy.getSizeInBits()) == - AArch64::getRegBankBaseIdxOffset(Size) && - "Operand has incompatible size"); + assert( + AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset( + RBIdx, OpTy.getSizeInBits()) == + AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(RBIdx, Size) && + "Operand has incompatible size"); bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc); (void)OpIsFPR; assert(IsFPR == OpIsFPR && "Operand has incompatible type"); } #endif // End NDEBUG. - AArch64::PartialMappingIdx RBIdx = - IsFPR ? AArch64::PMI_FirstFPR : AArch64::PMI_FirstGPR; - - return InstructionMapping{DefaultMappingID, 1, - AArch64::getValueMapping(RBIdx, Size), NumOperands}; + return getInstructionMapping(DefaultMappingID, 1, + getValueMapping(RBIdx, Size), NumOperands); } -RegisterBankInfo::InstructionMapping +const RegisterBankInfo::InstructionMapping & AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const unsigned Opc = MI.getOpcode(); const MachineFunction &MF = *MI.getParent()->getParent(); @@ -447,7 +425,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Try the default logic for non-generic instructions that are either copies // or already have some operands assigned to banks. if (!isPreISelGenericOpcode(Opc)) { - RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI); + const RegisterBankInfo::InstructionMapping &Mapping = + getInstrMappingImpl(MI); if (Mapping.isValid()) return Mapping; } @@ -485,14 +464,11 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank; const RegisterBank &SrcRB = SrcIsGPR ? 
AArch64::GPRRegBank : AArch64::FPRRegBank; - return InstructionMapping{DefaultMappingID, copyCost(DstRB, SrcRB, Size), - AArch64::getCopyMapping(DstIsGPR, SrcIsGPR, Size), - /*NumOperands*/ 2}; + return getInstructionMapping( + DefaultMappingID, copyCost(DstRB, SrcRB, Size), + getCopyMapping(DstRB.getID(), SrcRB.getID(), Size), + /*NumOperands*/ 2); } - case TargetOpcode::G_SEQUENCE: - // FIXME: support this, but the generic code is really not going to do - // anything sane. - return InstructionMapping(); default: break; } @@ -501,10 +477,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Track the size and bank of each register. We don't do partial mappings. SmallVector<unsigned, 4> OpSize(NumOperands); - SmallVector<AArch64::PartialMappingIdx, 4> OpRegBankIdx(NumOperands); + SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands); for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { auto &MO = MI.getOperand(Idx); - if (!MO.isReg()) + if (!MO.isReg() || !MO.getReg()) continue; LLT Ty = MRI.getType(MO.getReg()); @@ -513,9 +489,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs. // For floating-point instructions, scalars go in FPRs. if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc)) - OpRegBankIdx[Idx] = AArch64::PMI_FirstFPR; + OpRegBankIdx[Idx] = PMI_FirstFPR; else - OpRegBankIdx[Idx] = AArch64::PMI_FirstGPR; + OpRegBankIdx[Idx] = PMI_FirstGPR; } unsigned Cost = 1; @@ -523,50 +499,74 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // fine-tune the computed mapping. switch (Opc) { case TargetOpcode::G_SITOFP: - case TargetOpcode::G_UITOFP: { - OpRegBankIdx = {AArch64::PMI_FirstFPR, AArch64::PMI_FirstGPR}; + case TargetOpcode::G_UITOFP: + OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR}; break; - } case TargetOpcode::G_FPTOSI: - case TargetOpcode::G_FPTOUI: { - OpRegBankIdx = {AArch64::PMI_FirstGPR, AArch64::PMI_FirstFPR}; + case TargetOpcode::G_FPTOUI: + OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; break; - } - case TargetOpcode::G_FCMP: { - OpRegBankIdx = {AArch64::PMI_FirstGPR, - /* Predicate */ AArch64::PMI_None, AArch64::PMI_FirstFPR, - AArch64::PMI_FirstFPR}; + case TargetOpcode::G_FCMP: + OpRegBankIdx = {PMI_FirstGPR, + /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR}; break; - } - case TargetOpcode::G_BITCAST: { + case TargetOpcode::G_BITCAST: // This is going to be a cross register bank copy and this is expensive. if (OpRegBankIdx[0] != OpRegBankIdx[1]) - Cost = - copyCost(*AArch64::PartMappings[OpRegBankIdx[0]].RegBank, - *AArch64::PartMappings[OpRegBankIdx[1]].RegBank, OpSize[0]); + Cost = copyCost( + *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[0]].RegBank, + *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank, + OpSize[0]); break; - } - case TargetOpcode::G_LOAD: { + case TargetOpcode::G_LOAD: // Loading in vector unit is slightly more expensive. // This is actually only true for the LD1R and co instructions, // but anyway for the fast mode this number does not matter and // for the greedy mode the cost of the cross bank copy will // offset this number. // FIXME: Should be derived from the scheduling model. - if (OpRegBankIdx[0] >= AArch64::PMI_FirstFPR) + if (OpRegBankIdx[0] != PMI_FirstGPR) Cost = 2; - } + else + // Check if that load feeds fp instructions. + // In that case, we want the default mapping to be on FPR + // instead of blind map every scalar to GPR. 
+ for (const MachineInstr &UseMI : + MRI.use_instructions(MI.getOperand(0).getReg())) + // If we have at least one direct use in a FP instruction, + // assume this was a floating point load in the IR. + // If it was not, we would have had a bitcast before + // reaching that instruction. + if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) { + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } + break; + case TargetOpcode::G_STORE: + // Check if that store is fed by fp instructions. + if (OpRegBankIdx[0] == PMI_FirstGPR) { + unsigned VReg = MI.getOperand(0).getReg(); + if (!VReg) + break; + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode())) + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } } // Finally construct the computed mapping. - RegisterBankInfo::InstructionMapping Mapping = - InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands}; SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands); - for (unsigned Idx = 0; Idx < NumOperands; ++Idx) - if (MI.getOperand(Idx).isReg()) - OpdsMapping[Idx] = - AArch64::getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + if (MI.getOperand(Idx).isReg() && MI.getOperand(Idx).getReg()) { + auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]); + if (!Mapping->isValid()) + return getInvalidInstructionMapping(); + + OpdsMapping[Idx] = Mapping; + } + } - Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping)); - return Mapping; + return getInstructionMapping(DefaultMappingID, Cost, + getOperandsMapping(OpdsMapping), NumOperands); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h index f763235..6d74a47 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -16,25 +16,78 @@ #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#define GET_REGBANK_DECLARATIONS +#include "AArch64GenRegisterBank.inc" + namespace llvm { class TargetRegisterInfo; -namespace AArch64 { -enum { - GPRRegBankID = 0, /// General Purpose Registers: W, X. - FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q. - CCRRegBankID = 2, /// Conditional register: NZCV. - NumRegisterBanks -}; +class AArch64GenRegisterBankInfo : public RegisterBankInfo { +protected: + + enum PartialMappingIdx { + PMI_None = -1, + PMI_FPR32 = 1, + PMI_FPR64, + PMI_FPR128, + PMI_FPR256, + PMI_FPR512, + PMI_GPR32, + PMI_GPR64, + PMI_FirstGPR = PMI_GPR32, + PMI_LastGPR = PMI_GPR64, + PMI_FirstFPR = PMI_FPR32, + PMI_LastFPR = PMI_FPR512, + PMI_Min = PMI_FirstFPR, + }; + + static RegisterBankInfo::PartialMapping PartMappings[]; + static RegisterBankInfo::ValueMapping ValMappings[]; + static PartialMappingIdx BankIDToCopyMapIdx[]; + + enum ValueMappingIdx { + InvalidIdx = 0, + First3OpsIdx = 1, + Last3OpsIdx = 19, + DistanceBetweenRegBanks = 3, + FirstCrossRegCpyIdx = 22, + LastCrossRegCpyIdx = 34, + DistanceBetweenCrossRegCpy = 2 + }; + + static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx, + unsigned ValLength, const RegisterBank &RB); + static bool checkValueMapImpl(unsigned Idx, unsigned FirstInBank, + unsigned Size, unsigned Offset); + static bool checkPartialMappingIdx(PartialMappingIdx FirstAlias, + PartialMappingIdx LastAlias, + ArrayRef<PartialMappingIdx> Order); -extern RegisterBank GPRRegBank; -extern RegisterBank FPRRegBank; -extern RegisterBank CCRRegBank; -} // End AArch64 namespace. 
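Note: the G_LOAD/G_STORE special cases above map a scalar load to the FPR bank when any direct user is a floating-point operation (and a store to FPR when its value is produced by one), which avoids a GPR-to-FPR cross-bank copy later. A standalone restatement of that decision, under simplified assumptions:

#include <vector>

enum class RegBank { GPR, FPR };

// If at least one direct use of the loaded value is a floating-point
// instruction, assume the load was a floating-point load in the IR and map
// it to FPR; otherwise keep the default GPR mapping.
static RegBank bankForScalarLoad(const std::vector<bool> &UseIsFloatingPoint) {
  for (bool IsFP : UseIsFloatingPoint)
    if (IsFP)
      return RegBank::FPR;
  return RegBank::GPR;
}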
+ static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size); + + /// Get the pointer to the ValueMapping representing the RegisterBank + /// at \p RBIdx with a size of \p Size. + /// + /// The returned mapping works for instructions with the same kind of + /// operands for up to 3 operands. + /// + /// \pre \p RBIdx != PartialMappingIdx::None + static const RegisterBankInfo::ValueMapping * + getValueMapping(PartialMappingIdx RBIdx, unsigned Size); + + /// Get the pointer to the ValueMapping of the operands of a copy + /// instruction from the \p SrcBankID register bank to the \p DstBankID + /// register bank with a size of \p Size. + static const RegisterBankInfo::ValueMapping * + getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size); + +#define GET_TARGET_REGBANK_CLASS +#include "AArch64GenRegisterBank.inc" +}; /// This class provides the information for the target register banks. -class AArch64RegisterBankInfo final : public RegisterBankInfo { +class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo { /// See RegisterBankInfo::applyMapping. void applyMappingImpl(const OperandsMapper &OpdMapper) const override; @@ -45,8 +98,8 @@ class AArch64RegisterBankInfo final : public RegisterBankInfo { /// /// \return An InstructionMappings with a statically allocated /// OperandsMapping. - static InstructionMapping - getSameKindOfOperandsMapping(const MachineInstr &MI); + const InstructionMapping & + getSameKindOfOperandsMapping(const MachineInstr &MI) const; public: AArch64RegisterBankInfo(const TargetRegisterInfo &TRI); @@ -60,7 +113,8 @@ public: InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const override; - InstructionMapping getInstrMapping(const MachineInstr &MI) const override; + const InstructionMapping & + getInstrMapping(const MachineInstr &MI) const override; }; } // End llvm namespace. #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td new file mode 100644 index 0000000..c2b6c0b --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td @@ -0,0 +1,20 @@ +//=- AArch64RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +/// General Purpose Registers: W, X. +def GPRRegBank : RegisterBank<"GPR", [GPR64all]>; + +/// Floating Point/Vector Registers: B, H, S, D, Q. +def FPRRegBank : RegisterBank<"FPR", [QQQQ]>; + +/// Conditional register: NZCV. 
+def CCRRegBank : RegisterBank<"CCR", [CCR]>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 98fad71..9f7dcb3 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -74,7 +74,7 @@ const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; @@ -94,7 +94,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { if (TT.isOSDarwin()) return CSR_AArch64_TLS_Darwin_RegMask; - assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS"); + assert(TT.isOSBinFormatELF() && "Invalid target"); return CSR_AArch64_TLS_ELF_RegMask; } @@ -118,25 +118,17 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // FIXME: avoid re-calculating this every time. BitVector Reserved(getNumRegs()); - markSuperRegs(Reserved, AArch64::SP); - markSuperRegs(Reserved, AArch64::XZR); markSuperRegs(Reserved, AArch64::WSP); markSuperRegs(Reserved, AArch64::WZR); - if (TFI->hasFP(MF) || TT.isOSDarwin()) { - markSuperRegs(Reserved, AArch64::FP); + if (TFI->hasFP(MF) || TT.isOSDarwin()) markSuperRegs(Reserved, AArch64::W29); - } - if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) { - markSuperRegs(Reserved, AArch64::X18); // Platform register - markSuperRegs(Reserved, AArch64::W18); - } + if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) + markSuperRegs(Reserved, AArch64::W18); // Platform register - if (hasBasePointer(MF)) { - markSuperRegs(Reserved, AArch64::X19); + if (hasBasePointer(MF)) markSuperRegs(Reserved, AArch64::W19); - } assert(checkAllSuperRegsMarked(Reserved)); return Reserved; @@ -175,7 +167,7 @@ bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const { const TargetRegisterClass * AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { - return &AArch64::GPR64RegClass; + return &AArch64::GPR64spRegClass; } const TargetRegisterClass * diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td index 93ca079..18d000a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -13,7 +13,7 @@ // ===---------------------------------------------------------------------===// // The following definitions describe the simpler per-operand machine model. -// This works with MachineScheduler. See MCSchedModel.h for details. +// This works with MachineScheduler. See MCSchedule.h for details. // Cortex-A53 machine model for scheduling and other instruction cost heuristics. 
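Note: the getReservedRegs change above leans on markSuperRegs, which reserves a register together with every register that contains it, so marking the 32-bit names (WSP, WZR, W29, W18, W19) already covers SP, XZR, FP/X29, X18 and X19 and the explicit 64-bit entries become redundant. A toy sketch of that marking, with a hypothetical alias table:

#include <bitset>
#include <vector>

constexpr unsigned NumToyRegs = 8;

// SuperRegsOf[R] lists the registers that fully contain R
// (for example W29 -> {X29} on AArch64).
static void markWithSuperRegs(
    std::bitset<NumToyRegs> &Reserved, unsigned Reg,
    const std::vector<std::vector<unsigned>> &SuperRegsOf) {
  Reserved.set(Reg);
  for (unsigned Super : SuperRegsOf[Reg])
    Reserved.set(Super);
}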
def CortexA53Model : SchedMachineModel { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td index 99c48d0..5d1608e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// The Cortex-A57 is a traditional superscaler microprocessor with a +// The Cortex-A57 is a traditional superscalar microprocessor with a // conservative 3-wide in-order stage for decode and dispatch. Combined with the // much wider out-of-order issue stage, this produced a need to carefully // schedule micro-ops so that all three decoded each cycle are successfully @@ -162,7 +162,9 @@ def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>; // Cryptography Extensions // ----------------------------------------------------------------------------- -def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>; +def A57ReadAES : SchedReadAdvance<3, [A57Write_3cyc_1W]>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^AES[DE]")>; +def : InstRW<[A57Write_3cyc_1W, A57ReadAES], (instregex "^AESI?MC")>; def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>; def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>; def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td index 19a6d6f..44fd94f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td @@ -17,10 +17,98 @@ // instruction cost model. def FalkorModel : SchedMachineModel { - let IssueWidth = 4; // 4-wide issue for expanded uops. + let IssueWidth = 8; // 8 uops are dispatched per cycle. let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer. let LoopMicroOpBufferSize = 16; let LoadLatency = 3; // Optimistic load latency. let MispredictPenalty = 11; // Minimum branch misprediction penalty. - let CompleteModel = 0; + let CompleteModel = 1; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Falkor. + +let SchedModel = FalkorModel in { + + def FalkorUnitB : ProcResource<1>; // Branch + def FalkorUnitLD : ProcResource<1>; // Load pipe + def FalkorUnitSD : ProcResource<1>; // Store data + def FalkorUnitST : ProcResource<1>; // Store pipe + def FalkorUnitX : ProcResource<1>; // Complex arithmetic + def FalkorUnitY : ProcResource<1>; // Simple arithmetic + def FalkorUnitZ : ProcResource<1>; // Simple arithmetic + + def FalkorUnitVSD : ProcResource<1>; // Vector store data + def FalkorUnitVX : ProcResource<1>; // Vector X-pipe + def FalkorUnitVY : ProcResource<1>; // Vector Y-pipe + + def FalkorUnitGTOV : ProcResource<1>; // Scalar to Vector + def FalkorUnitVTOG : ProcResource<1>; // Vector to Scalar + + // Define the resource groups. 
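Note: the resource groups defined next (XY, XYZ, XYZB, ZB, VXVY) combine pipes so a micro-op can issue to whichever member is free. The SchedReadAdvance entries used for the A57 AES pairing above, and later for Falkor's multiply-accumulate and load/store-increment forwarding, shorten the latency a dependent read observes; as a rough model only, not the exact MC scheduling computation:

#include <algorithm>

// Approximate operand latency seen by a consumer that benefits from a
// ReadAdvance: the producer's write latency minus the advance, not allowed
// to go negative (a sketch of the idea, not LLVM's precise formula).
static int effectiveOperandLatency(int WriteLatency, int ReadAdvanceCycles) {
  return std::max(0, WriteLatency - ReadAdvanceCycles);
}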
+ def FalkorUnitXY : ProcResGroup<[FalkorUnitX, FalkorUnitY]>; + def FalkorUnitXYZ : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ]>; + def FalkorUnitXYZB : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ, + FalkorUnitB]>; + def FalkorUnitZB : ProcResGroup<[FalkorUnitZ, FalkorUnitB]>; + def FalkorUnitVXVY : ProcResGroup<[FalkorUnitVX, FalkorUnitVY]>; + +} + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latency for +// Falkor. + +let SchedModel = FalkorModel in { + +// These WriteRes entries are not used in the Falkor sched model. +def : WriteRes<WriteImm, []> { let Unsupported = 1; } +def : WriteRes<WriteI, []> { let Unsupported = 1; } +def : WriteRes<WriteISReg, []> { let Unsupported = 1; } +def : WriteRes<WriteIEReg, []> { let Unsupported = 1; } +def : WriteRes<WriteExtr, []> { let Unsupported = 1; } +def : WriteRes<WriteIS, []> { let Unsupported = 1; } +def : WriteRes<WriteID32, []> { let Unsupported = 1; } +def : WriteRes<WriteID64, []> { let Unsupported = 1; } +def : WriteRes<WriteIM32, []> { let Unsupported = 1; } +def : WriteRes<WriteIM64, []> { let Unsupported = 1; } +def : WriteRes<WriteBr, []> { let Unsupported = 1; } +def : WriteRes<WriteBrReg, []> { let Unsupported = 1; } +def : WriteRes<WriteLD, []> { let Unsupported = 1; } +def : WriteRes<WriteST, []> { let Unsupported = 1; } +def : WriteRes<WriteSTP, []> { let Unsupported = 1; } +def : WriteRes<WriteAdr, []> { let Unsupported = 1; } +def : WriteRes<WriteLDIdx, []> { let Unsupported = 1; } +def : WriteRes<WriteSTIdx, []> { let Unsupported = 1; } +def : WriteRes<WriteF, []> { let Unsupported = 1; } +def : WriteRes<WriteFCmp, []> { let Unsupported = 1; } +def : WriteRes<WriteFCvt, []> { let Unsupported = 1; } +def : WriteRes<WriteFCopy, []> { let Unsupported = 1; } +def : WriteRes<WriteFImm, []> { let Unsupported = 1; } +def : WriteRes<WriteFMul, []> { let Unsupported = 1; } +def : WriteRes<WriteFDiv, []> { let Unsupported = 1; } +def : WriteRes<WriteV, []> { let Unsupported = 1; } +def : WriteRes<WriteVLD, []> { let Unsupported = 1; } +def : WriteRes<WriteVST, []> { let Unsupported = 1; } +def : WriteRes<WriteSys, []> { let Unsupported = 1; } +def : WriteRes<WriteBarrier, []> { let Unsupported = 1; } +def : WriteRes<WriteHint, []> { let Unsupported = 1; } +def : WriteRes<WriteLDHi, []> { let Unsupported = 1; } +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// These ReadAdvance entries are not used in the Falkor sched model. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +// Detailed Refinements +// ----------------------------------------------------------------------------- +include "AArch64SchedFalkorDetails.td" + } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td new file mode 100644 index 0000000..0aeb1f3 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -0,0 +1,1288 @@ +//==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the uop and latency details for the machine model for the +// Qualcomm Falkor subtarget. +// +//===----------------------------------------------------------------------===// + +// Contains all of the Falkor specific SchedWriteRes types. The approach +// below is to define a generic SchedWriteRes for every combination of +// latency and microOps. The naming conventions is to use a prefix, one field +// for latency, and one or more microOp count/type designators. +// Prefix: FalkorWr +// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD) +// Latency: #cyc +// +// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued +// down one Z pipe, six SD pipes, four VX pipes and the total latency is +// six cycles. +// +// Contains all of the Falkor specific ReadAdvance types for forwarding logic. +// +// Contains all of the Falkor specific WriteVariant types for immediate zero +// and LSLFast. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define 0 micro-op types +def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> { + let Latency = 3; + let NumMicroOps = 0; +} +def FalkorWr_none_3cyc : SchedWriteRes<[]> { + let Latency = 3; + let NumMicroOps = 0; +} +def FalkorWr_none_4cyc : SchedWriteRes<[]> { + let Latency = 4; + let NumMicroOps = 0; +} + +//===----------------------------------------------------------------------===// +// Define 1 micro-op types + +def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } +def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } +def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } +def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } +def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } +def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } +def FalkorWr_1XYZ_0cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 0; } +def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } +def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } +def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } +def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } +def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } + +def FalkorWr_1VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 0; } +def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } +def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } +def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } +def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } +def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } + +def FalkorWr_1LD_0cyc : 
SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } +def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } +def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } + +def FalkorWr_1GTOV_0cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 0; } +def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } +def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } +def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Define 2 micro-op types + +def FalkorWr_2VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 0; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 1; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} +def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_12cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 12; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_14cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 14; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_21cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 21; + let NumMicroOps = 2; +} + +def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { 
+ let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2, 8]; +} + +def FalkorWr_1X_1Z_11cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [2, 11]; +} + +def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitVSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define 3 micro-op types + +def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 0; + let NumMicroOps = 3; +} + +def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_1XYZ_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 3; +} +def FalkorWr_1XYZ_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitVSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 3; +} +//===----------------------------------------------------------------------===// +// Define 4 micro-op types + +def FalkorWr_2VX_2VY_14cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, + FalkorUnitVX, FalkorUnitVY]> { + let Latency = 14; + let NumMicroOps = 4; +} + +def FalkorWr_2VX_2VY_20cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, + FalkorUnitVX, FalkorUnitVY]> { + let Latency = 20; + let NumMicroOps = 4; +} + +def FalkorWr_2VX_2VY_21cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, + FalkorUnitVX, FalkorUnitVY]> { + let Latency = 21; + let NumMicroOps = 4; +} + +def FalkorWr_2VX_2VY_24cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, + FalkorUnitVX, FalkorUnitVY]> { + let Latency = 24; + let NumMicroOps = 4; +} + +def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, 
FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST, + FalkorUnitSD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_2VSD_2ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define 5 micro-op types + +def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 7; + let NumMicroOps = 5; +} +def FalkorWr_1XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitST, + FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 5; +} +def FalkorWr_1VXVY_2ST_2VSD_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitST, + FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 5; +} +//===----------------------------------------------------------------------===// +// Define 6 micro-op types + +def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitXYZ, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +def FalkorWr_2VXVY_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +def FalkorWr_3VSD_3ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +//===----------------------------------------------------------------------===// +// Define 8 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 8; +} + +def FalkorWr_4VSD_4ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + 
FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 8; +} + +//===----------------------------------------------------------------------===// +// Define 9 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitLD, + FalkorUnitLD, FalkorUnitXYZ, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitXYZ, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +//===----------------------------------------------------------------------===// +// Define 10 micro-op types + +def FalkorWr_2VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 10; +} + +//===----------------------------------------------------------------------===// +// Define 12 micro-op types + +def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 12; +} + +// Forwarding logic is modeled for multiply add/accumulate and +// load/store base register increment. +// ----------------------------------------------------------------------------- +def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; +def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; +def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; +def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; +def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; + +def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>; +def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>; + +// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast +// ----------------------------------------------------------------------------- +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).isImm() && + MI->getOperand(1).getImm() == 0}]>; +def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || + + MI->getOperand(1).getReg() == AArch64::XZR}]>; +def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>; + +def FalkorWr_FMOV : SchedWriteVariant<[ + SchedVar<FalkorOp1ZrReg, [FalkorWr_1none_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>; + +def FalkorWr_MOVZ : SchedWriteVariant<[ + SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZB_0cyc]>]>; // imm fwd + + +def FalkorWr_ADDSUBsx : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_1cyc]>, + SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>; + +def FalkorWr_LDRro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_3cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>; + +def FalkorWr_LDRSro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_4cyc]>, + 
SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>; + +def FalkorWr_ORRi : SchedWriteVariant<[ + SchedVar<FalkorOp1ZrReg, [FalkorWr_1XYZ_0cyc]>, // imm fwd + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1cyc]>]>; + +def FalkorWr_PRFMro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1ST_3cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>; + +def FalkorWr_STRVro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1VSD_1ST_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1VSD_1ST_0cyc]>]>; + +def FalkorWr_STRQro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_2ST_2VSD_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_2XYZ_2ST_2VSD_0cyc]>]>; + +def FalkorWr_STRro : SchedWriteVariant<[ + SchedVar<FalkorShiftExtFastPred, [FalkorWr_1SD_1ST_0cyc]>, + SchedVar<NoSchedPred, [FalkorWr_1XYZ_1SD_1ST_0cyc]>]>; + +//===----------------------------------------------------------------------===// +// Specialize the coarse model by associating instruction groups with the +// subtarget-defined types. As the modeled is refined, this will override most +// of the earlier mappings. + +// Miscellaneous +// ----------------------------------------------------------------------------- + +// FIXME: This could be better modeled by looking at the regclasses of the operands. +def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs COPY)>; + +// SIMD Floating-point Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)v2f32$")>; + +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v2i32p)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(32|64|v2f32|v2i32)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64|v2i32)rz$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)v2f32$")>; + +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?Vv4i32v$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)v2f32$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i32p|v2i64p|v2f32)$")>; + +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v2f32)$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)v2i32(_shift)?$")>; + +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instregex "^(FMUL|FMULX)(v2f32|(v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instrs FMULX32)>; + +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instregex "^(FMUL|FMULX)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instrs FMULX64)>; + +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32)$")>; + +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v2i64p)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs FCVTLv4i16, FCVTLv2i32)>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)$")>; + +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instrs FDIVv2f32)>; +def : InstRW<[FalkorWr_1VX_1VY_12cyc],(instrs FSQRTv2f32)>; + +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32)$")>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex 
"^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32)$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs FCVTLv8i16, FCVTLv4i32)>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32)(_shift)?$")>; + +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], + (instregex "^(FMUL|FMULX)(v2f64|v4f32|v4i32_indexed)$")>; + +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], + (instregex "^(FMUL|FMULX)v2i64_indexed$")>; + +def : InstRW<[FalkorWr_3VXVY_4cyc], (instrs FCVTNv4i16, FCVTNv2i32, FCVTXNv2f32)>; +def : InstRW<[FalkorWr_3VXVY_5cyc], (instrs FCVTNv8i16, FCVTNv4i32, FCVTXNv4f32)>; + +def : InstRW<[FalkorWr_2VX_2VY_14cyc],(instrs FDIVv2f64)>; +def : InstRW<[FalkorWr_2VX_2VY_20cyc],(instrs FDIVv4f32)>; +def : InstRW<[FalkorWr_2VX_2VY_21cyc],(instrs FSQRTv2f64)>; +def : InstRW<[FalkorWr_2VX_2VY_24cyc],(instrs FSQRTv4f32)>; + +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; + +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], + (instregex "^FML(A|S)(v2f32|(v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], + (instregex "^FML(A|S)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], + (instregex "^FML(A|S)(v4f32|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], + (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; + +// SIMD Integer Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs ADDPv2i64p)>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIC|ORR)(v2i32|v4i16)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^NEG(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$")>; + +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHRd$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHLd$")>; + +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(ABD|ADALP)(v8i8|v4i16|v2i32)(_v.*)?$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)ADDLVv4i16v$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex 
"^(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(s|h|b)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHRd$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^R?SHRN(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQABS(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)$")>; + +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^SQDMULL(i16|i32)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; + +def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>; + +def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs ADDVv4i32v)>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs ADDVv8i16v)>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(ADD|SUB)HNv.*$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)ABA(v2i32|v4i16|v8i8)$")>; + +def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs ADDVv16i8v)>; + +def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32)_shift?$")>; +def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^R(ADD|SUB)HNv.*$")>; + +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^ADD(v16i8|v8i16|v4i32|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs ADDPv2i64)>; // sz==11 +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIC|ORR)(v8i16|v4i32)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(NEG|SUB)(v16i8|v8i16|v4i32|v2i64)$")>; + +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)ADDLv.*$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v16i8|v2i64|v4i32|v8i16)(_v.*)?$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHR(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SUBLv.*$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS)(v16i8|v2i64|v4i32|v8i16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; // 
sz!=11 +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL(v8i8|v16i8)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; + +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABD(v16i8|v8i16|v4i32|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABDLv.*$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16|v4i32|v2i64)(_v.*)?$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^R?SHRN(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL(v1i64|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>; + +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^SQDMULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; + +def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>; + +def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(S|U)ADDLVv8i16v$")>; + +def : InstRW<[FalkorWr_3VXVY_6cyc], (instregex "^(S|U)ADDLVv16i8v$")>; + +def : InstRW<[FalkorWr_4VXVY_2cyc], (instregex "^(S|U)(ADD|SUB)Wv.*$")>; + +def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>; + +def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>; + +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^SQD(MLAL|MLSL)v[248].*$")>; + +// SIMD Load Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instrs LD2i64_POST)>; + +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>; 
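Editorial note, not part of the diff: every post-indexed (_POST) SIMD load mapping in this hunk follows the same shape, restated below as a sketch that uses only types already defined in this file. The first write models the base-register writeback (FalkorWr_LdStInc_none_3cyc), the second models the loaded value, and FalkorReadIncLd (defined above as SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>) applies to the base-address read, so an address produced by a preceding post-increment becomes available two cycles earlier than the writeback's nominal 3-cycle latency.

// Illustration only: a duplicate of one mapping from this hunk, annotated.
def : InstRW<[FalkorWr_LdStInc_none_3cyc, // writeback of the incremented base
              FalkorWr_1LD_3cyc,          // the loaded vector value
              FalkorReadIncLd],           // forwarded read of the base address
             (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

The non-_POST forms carry the same FalkorReadIncLd on their address operand but omit the writeback write.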
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD1i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instrs LD3i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd], + (instrs LD4i64_POST)>; + +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD2i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], + (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instrs LD3Threev2d_POST)>; +def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd], + (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], + (instregex 
"^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], + (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instrs LD4Fourv2d_POST)>; +def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd], + (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4i(8|16|32)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(8b|4h|2s)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(8b|4h|2s)_POST$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD3Threev(16b|8h|4s)_POST$")>; + +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; + +// Arithmetic and Logical Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADC(S)?(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADD(S)?(W|X)r(r|i)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^AND(S)?(W|X)r(i|r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>; +def : InstRW<[FalkorWr_ORRi], (instregex "^ORR(W|X)ri$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(r|s)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>; +def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>; +def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>; + +// SIMD Miscellaneous Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], 
(instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; +def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>; + +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v2i32|v4i16|v8i8)$")>; + +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>; + +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; + +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instrs FRECPS64, FRSQRTS64)>; + +def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], + (instregex "^INSv(i32|i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; +def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>; + +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>; + +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], + (instrs FRECPSv4f32, FRSQRTSv4f32)>; + +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], + (instrs FRECPSv2f64, FRSQRTSv2f64)>; + +def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>; +def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>; + +def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBL(v8i8Four|v16i8Three)$")>; +def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBX(v8i8Three|v16i8Three)$")>; + +def : InstRW<[FalkorWr_5VXVY_7cyc], (instrs TBLv16i8Four)>; +def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; + +// SIMD Store Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(Q|D|S|H|B)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt], + (instregex "^STR(D|S|H|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, 
FalkorReadIncSt], + (instregex "^STPQi$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STPQ(post|pre)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(D|S)(i)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(D|S)(post|pre)$")>; +def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt], + (instregex "^STRQro(W|X)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STUR(Q|D|S|B|H)i$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPDi, STNPSi)>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPQi)>; + +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))_POST$")>; + +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4(i8|i16|i32|i64)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3(i8|i16|i32|i64)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4(i8|i16|i32|i64)_POST$")>; + +def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v8b|v4h|v2s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v8b|v4h|v2s)_POST$")>; + +def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST3Threev2d)>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST3Threev2d_POST)>; + +def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v8b|v4h|v2s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v8b|v4h|v2s)_POST$")>; + +def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST4Fourv2d)>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt], + (instrs ST4Fourv2d_POST)>; + +def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v16b|v8h|v4s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; + +def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v16b|v8h|v4s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; + +// Branch Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1none_0cyc], (instrs B, TCRETURNdi)>; +def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>; +def : InstRW<[FalkorWr_1Z_0cyc], (instrs RET_ReallyLR, TCRETURNri)>; +def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>; +def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>; +def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>; + +// Cryptography Extensions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs SHA1Hrr)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs AESIMCrr, AESMCrr)>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs AESDrr, AESErr)>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>; +def : InstRW<[FalkorWr_1VX_1VY_4cyc], (instregex "^SHA1(C|M|P)rrr$")>; +def : InstRW<[FalkorWr_1VX_1VY_5cyc], (instrs SHA256H2rrr, SHA256Hrrr)>; +def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>; + +// FP Load Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDUR(Q|D|S|H|B)i$")>; +def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd], + (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instrs LDNPQi)>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instrs LDPQi)>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "LDNP(D|S)i$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "LDP(D|S)i$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "LDP(D|S)(pre|post)$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "^LDPQ(pre|post)$")>; + +// FP Data Processing Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(S|D)r(r|i)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(S|D)rrr$")>; + +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(32|64)p$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTSHr, FCVTDHr)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(S|D)r$")>; + +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTHSr, FCVTHDr)>; + +def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>; + +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instregex 
"^F(N)?MULSrr$")>; + +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instregex "^F(N)?MULDrr$")>; + +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instrs FDIVSrr)>; +def : InstRW<[FalkorWr_1VX_1VY_14cyc],(instrs FDIVDrr)>; +def : InstRW<[FalkorWr_1VX_1VY_12cyc],(instrs FSQRTSr)>; +def : InstRW<[FalkorWr_1VX_1VY_21cyc],(instrs FSQRTDr)>; + +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], + (instregex "^F(N)?M(ADD|SUB)Srrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], + (instregex "^F(N)?M(ADD|SUB)Drrr$")>; + +// FP Miscellaneous Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>; +def : InstRW<[FalkorWr_1GTOV_0cyc], (instregex "^FMOV(S|D)i$")>; // imm fwd +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>; +def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; // imm fwd +// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr +def : InstRW<[FalkorWr_2VXVY_0cyc], (instrs FMOVD0, FMOVS0)>; // imm fwd + +def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; + +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>; + +// Load Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; +def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "^LDNP(W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "^LDP(W|X)i$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "^LDP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd], + (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDR(W|X)l$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd], + (instrs LDPSWi)>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd], + (instregex "^LDPSW(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; +def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd], + (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instrs LDRSWl)>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; +def : 
InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd], + (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; + +// Miscellaneous Data-Processing Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>; +def : InstRW<[FalkorWr_1X_2cyc], (instregex "^CRC32.*$")>; +def : InstRW<[FalkorWr_1XYZ_2cyc], (instregex "^(CLS|CLZ|RBIT|REV|REV16|REV32)(W|X)r$")>; +def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>; + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], + (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; +def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], + (instregex "^M(ADD|SUB)Wrrr$")>; + +def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], + (instregex "^M(ADD|SUB)Xrrr$")>; + +def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; +def : InstRW<[FalkorWr_1X_1Z_11cyc], (instregex "^(S|U)DIVXr$")>; + +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^(S|U)MULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^(S|U)(MLAL|MLSL)v.*$")>; + +// Move and Shift Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV)(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_0cyc], (instregex "^MOVK(W|X)i$")>; // imm fwd +def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^ADRP?$")>; // imm fwd +def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^MOVN(W|X)i$")>; // imm fwd +def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>; +def : InstRW<[FalkorWr_1XYZ_0cyc], (instrs MOVi32imm, MOVi64imm)>; // imm fwd (approximation) +def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>], + (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>; +def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>], + (instrs LOADgot)>; + +// Other Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1LD_0cyc], (instrs CLREX, DMB, DSB)>; +def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HLT, HVC, ISB, SMC, SVC)>; +def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>; +def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>; + +def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], + (instregex "^(LDAR(B|H|W|X)|LDAXR(B|H|W|X)|LDXR(B|H|W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd], + (instregex "^(LDAXP(W|X)|LDXP(W|X))$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>; + +def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; + +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instrs STNPWi, STNPXi)>; +def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; + +def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STXP(W|X)$")>; +def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], 
+ (instregex "^STXR(B|H|W|X)$")>; + +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STLXP(W|X)$")>; +def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STLXR(B|H|W|X)$")>; + +// Store Instructions +// ----------------------------------------------------------------------------- +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(W|X)i$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt], + (instregex "^STP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt], + (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt], + (instregex "^STUR(BB|HH|W|X)i$")>; + diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td index 426ae61..cf4cdab 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -776,23 +776,29 @@ def KryoWrite_4cyc_X_X_115ln : } def : InstRW<[KryoWrite_4cyc_X_X_115ln], (instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>; -def KryoWrite_1cyc_XA_Y_noRSV_43ln : +def KryoWrite_10cyc_XA_Y_noRSV_43ln : SchedWriteRes<[KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 3; + let Latency = 10; let NumMicroOps = 3; } -def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln], - (instrs FDIVDrr, FDIVSrr)>; -def KryoWrite_1cyc_XA_Y_noRSV_121ln : +def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_43ln], + (instrs FDIVSrr)>; +def KryoWrite_14cyc_XA_Y_noRSV_43ln : SchedWriteRes<[KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 3; + let Latency = 14; let NumMicroOps = 3; } -def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln], +def : InstRW<[KryoWrite_14cyc_XA_Y_noRSV_43ln], + (instrs FDIVDrr)>; +def KryoWrite_10cyc_XA_Y_noRSV_121ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 10; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_121ln], (instrs FDIVv2f32)>; -def KryoWrite_1cyc_XA_Y_XA_Y_123ln : +def KryoWrite_14cyc_XA_Y_XA_Y_123ln : SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 4; + let Latency = 14; let NumMicroOps = 4; } -def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln], +def : InstRW<[KryoWrite_14cyc_XA_Y_XA_Y_123ln], (instrs FDIVv2f64, FDIVv4f32)>; def KryoWrite_5cyc_X_noRSV_55ln : SchedWriteRes<[KryoUnitX]> { @@ -968,24 +974,36 @@ def KryoWrite_2cyc_XY_XY_109ln : } def : InstRW<[KryoWrite_2cyc_XY_XY_109ln], (instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>; -def KryoWrite_1cyc_XA_Y_noRSV_42ln : +def KryoWrite_12cyc_XA_Y_noRSV_42ln : SchedWriteRes<[KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 3; + let Latency = 12; let NumMicroOps = 3; } -def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln], - (instregex "FSQRT(S|D)r")>; -def KryoWrite_1cyc_XA_Y_noRSV_120ln : +def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_42ln], + (instrs FSQRTSr)>; +def KryoWrite_21cyc_XA_Y_noRSV_42ln : 
SchedWriteRes<[KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 3; + let Latency = 21; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_21cyc_XA_Y_noRSV_42ln], + (instrs FSQRTDr)>; +def KryoWrite_12cyc_XA_Y_noRSV_120ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 12; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_120ln], + (instrs FSQRTv2f32)>; +def KryoWrite_21cyc_XA_Y_XA_Y_122ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { + let Latency = 21; let NumMicroOps = 4; } -def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln], - (instregex "FSQRTv2f32")>; -def KryoWrite_1cyc_XA_Y_XA_Y_122ln : +def : InstRW<[KryoWrite_21cyc_XA_Y_XA_Y_122ln], + (instrs FSQRTv4f32)>; +def KryoWrite_36cyc_XA_Y_XA_Y_122ln : SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { - let Latency = 1; let NumMicroOps = 4; + let Latency = 36; let NumMicroOps = 4; } -def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln], - (instregex "FSQRT(v2f64|v4f32)")>; +def : InstRW<[KryoWrite_36cyc_XA_Y_XA_Y_122ln], + (instrs FSQRTv2f64)>; def KryoWrite_1cyc_X_201ln : SchedWriteRes<[KryoUnitX]> { let Latency = 1; let NumMicroOps = 1; @@ -1356,7 +1374,9 @@ def KryoWrite_3cyc_LS_LS_400ln : let Latency = 3; let NumMicroOps = 2; } def : InstRW<[KryoWrite_3cyc_LS_LS_400ln], - (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>; + (instregex "LDAX?R(B|H|W|X)")>; +def : InstRW<[KryoWrite_3cyc_LS_LS_400ln, WriteLDHi], + (instregex "LDAXP(W|X)")>; def KryoWrite_3cyc_LS_LS_401ln : SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { let Latency = 3; let NumMicroOps = 2; @@ -1547,7 +1567,7 @@ def KryoWrite_3cyc_LS_258ln : SchedWriteRes<[KryoUnitLS]> { let Latency = 3; let NumMicroOps = 1; } -def : InstRW<[KryoWrite_3cyc_LS_258ln], +def : InstRW<[KryoWrite_3cyc_LS_258ln, WriteLDHi], (instregex "LDXP(W|X)")>; def KryoWrite_3cyc_LS_258_1ln : SchedWriteRes<[KryoUnitLS]> { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td index 14d6891..3b71cf8 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td @@ -23,7 +23,7 @@ def ExynosM1Model : SchedMachineModel { let LoopMicroOpBufferSize = 24; // Based on the instruction queue size. let LoadLatency = 4; // Optimistic load cases. let MispredictPenalty = 14; // Minimum branch misprediction penalty. - let CompleteModel = 0; // Use the default model otherwise. + let CompleteModel = 1; // Use the default model otherwise. } //===----------------------------------------------------------------------===// @@ -72,14 +72,14 @@ def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } -def M1WriteLA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5, +def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5, M1WriteA1]>, SchedVar<NoSchedPred, [M1WriteL5]>]>; def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; } def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } -def M1WriteSA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2, +def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2, M1WriteA1]>, SchedVar<NoSchedPred, [M1WriteS1]>]>; @@ -125,13 +125,13 @@ def : WriteRes<WriteAdr, []> { let Latency = 0; } // Load instructions. 
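Editorial note, not part of the diff: the M1WriteLA/M1WriteSA writes renamed above to M1WriteLX/M1WriteSX are SchedWriteVariants, selected per instruction by ScaledIdxPred, presumably to charge an extra 1-cycle ALU micro-op for scaled address generation. Condensed from this hunk, the load-side pair is:

// Scaled register-offset loads take the 5-cycle load plus an ALU op;
// unscaled ones take only the load.
def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5,
                                                            M1WriteA1]>,
                                   SchedVar<NoSchedPred,   [M1WriteL5]>]>;
// Register-offset loads (WriteLDIdx) then resolve through that variant.
def : SchedAlias<WriteLDIdx, M1WriteLX>;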
def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; } def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; } -def : SchedAlias<WriteLDIdx, M1WriteLA>; +def : SchedAlias<WriteLDIdx, M1WriteLX>; // Store instructions. def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; } def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; } def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; } -def : SchedAlias<WriteSTIdx, M1WriteSA>; +def : SchedAlias<WriteSTIdx, M1WriteSX>; // FP data instructions. def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; } @@ -231,6 +231,111 @@ def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } def M1WriteTB : SchedWriteRes<[M1UnitC, M1UnitALU]> { let Latency = 2; } +def M1WriteVLDA : SchedWriteRes<[M1UnitL, + M1UnitL]> { let Latency = 6; } +def M1WriteVLDB : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 7; } +def M1WriteVLDC : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 8; } +def M1WriteVLDD : SchedWriteRes<[M1UnitL, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDE : SchedWriteRes<[M1UnitL, + M1UnitNALU]> { let Latency = 6; } +def M1WriteVLDF : SchedWriteRes<[M1UnitL, + M1UnitL]> { let Latency = 10; + let ResourceCycles = [5]; } +def M1WriteVLDG : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDH : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU]> { let Latency = 6; } +def M1WriteVLDI : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 12; + let ResourceCycles = [6]; } +def M1WriteVLDJ : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 9; + let ResourceCycles = [4]; } +def M1WriteVLDK : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 9; + let ResourceCycles = [4]; } +def M1WriteVLDL : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDM : SchedWriteRes<[M1UnitL, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU, + M1UnitNALU]> { let Latency = 7; + let ResourceCycles = [2]; } +def M1WriteVLDN : SchedWriteRes<[M1UnitL, + M1UnitL, + M1UnitL, + M1UnitL]> { let Latency = 14; + let ResourceCycles = [7]; } + +def M1WriteVSTA : WriteSequence<[WriteVST], 2>; +def M1WriteVSTB : WriteSequence<[WriteVST], 3>; +def M1WriteVSTC : WriteSequence<[WriteVST], 4>; +def M1WriteVSTD : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitFST]> { let Latency = 7; + let ResourceCycles = [7]; } +def M1WriteVSTE : SchedWriteRes<[M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST]> { let Latency = 8; + let ResourceCycles = [8]; } +def M1WriteVSTF : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 15; + let ResourceCycles = [15]; } +def M1WriteVSTG : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 16; + let ResourceCycles = [16]; } +def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 14; + let ResourceCycles = [14]; } +def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitS, + M1UnitFST, + M1UnitFST, + M1UnitFST]> { let Latency = 17; + let ResourceCycles = [17]; } // Branch instructions def : 
InstRW<[M1WriteB1], (instrs Bcc)>; @@ -360,13 +465,239 @@ def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64 def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; // ASIMD load instructions. +def : InstRW<[M1WriteVLDD], (instregex "LD1i(8|16|32)$")>; +def : InstRW<[M1WriteVLDD, + WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>; +def : InstRW<[M1WriteVLDE, + WriteAdr], (instregex "LD1i(64)_POST$")>; + +def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(1d)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteL5, + WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVLDG], (instregex "LD2i(8|16)$")>; +def : InstRW<[M1WriteVLDG, + WriteAdr], (instregex "LD2i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDG], (instregex "LD2i(32)$")>; +def : InstRW<[M1WriteVLDG, + WriteAdr], (instregex "LD2i(32)_POST$")>; +def : InstRW<[M1WriteVLDH], (instregex "LD2i(64)$")>; +def : InstRW<[M1WriteVLDH, + WriteAdr], (instregex "LD2i(64)_POST$")>; + +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(1d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVLDA, + WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(2d)$")>; +def : InstRW<[M1WriteVLDF, + WriteAdr], (instregex "LD2Twov(2d)_POST$")>; + +def : InstRW<[M1WriteVLDJ], (instregex "LD3i(8|16)$")>; +def : InstRW<[M1WriteVLDJ, + WriteAdr], (instregex "LD3i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDJ], (instregex "LD3i(32)$")>; +def : 
InstRW<[M1WriteVLDJ, + WriteAdr], (instregex "LD3i(32)_POST$")>; +def : InstRW<[M1WriteVLDL], (instregex "LD3i(64)$")>; +def : InstRW<[M1WriteVLDL, + WriteAdr], (instregex "LD3i(64)_POST$")>; + +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(1d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(2d)$")>; +def : InstRW<[M1WriteVLDB, + WriteAdr], (instregex "LD3Rv(2d)_POST$")>; + +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(2d)$")>; +def : InstRW<[M1WriteVLDI, + WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[M1WriteVLDK], (instregex "LD4i(8|16)$")>; +def : InstRW<[M1WriteVLDK, + WriteAdr], (instregex "LD4i(8|16)_POST$")>; +def : InstRW<[M1WriteVLDK], (instregex "LD4i(32)$")>; +def : InstRW<[M1WriteVLDK, + WriteAdr], (instregex "LD4i(32)_POST$")>; +def : InstRW<[M1WriteVLDM], (instregex "LD4i(64)$")>; +def : InstRW<[M1WriteVLDM, + WriteAdr], (instregex "LD4i(64)_POST$")>; + +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(1d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(1d)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(2d)$")>; +def : InstRW<[M1WriteVLDC, + WriteAdr], (instregex "LD4Rv(2d)_POST$")>; + +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[M1WriteVLDN, + WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; // ASIMD store instructions. 
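The ASIMD load mappings above and the store mappings below follow one convention: the base form of each instruction is assigned an M1-specific write class, and its post-indexed _POST variant gets the same class plus WriteAdr for the address-register writeback. A representative pair, quoted from the entries above:

// def : InstRW<[M1WriteL5],           (instregex "LD1Onev(8b|4h|2s|1d)$")>;
// def : InstRW<[M1WriteL5, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;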
+def : InstRW<[M1WriteVSTD], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST1i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVSTD], (instregex "ST1i(64)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST1i(64)_POST$")>; + +def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVST, + WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTA, + WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTA, + WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTB, + WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTB, + WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[M1WriteVSTC, + WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[M1WriteVSTC, + WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[M1WriteVSTD], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2i(8|16|32)_POST$")>; +def : InstRW<[M1WriteVSTD], (instregex "ST2i(64)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2i(64)_POST$")>; + +def : InstRW<[M1WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTD, + WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTE, + WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(2d)$")>; +def : InstRW<[M1WriteVSTE, + WriteAdr], (instregex "ST2Twov(2d)_POST$")>; + +def : InstRW<[M1WriteVSTH], (instregex "ST3i(8|16)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST3i(8|16)_POST$")>; +def : InstRW<[M1WriteVSTH], (instregex "ST3i(32)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST3i(32)_POST$")>; +def : InstRW<[M1WriteVSTF], (instregex "ST3i(64)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST3i(64)_POST$")>; + +def : InstRW<[M1WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTG, + WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(2d)$")>; +def : InstRW<[M1WriteVSTG, + WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[M1WriteVSTH], (instregex "ST4i(8|16)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST4i(8|16)_POST$")>; +def : InstRW<[M1WriteVSTH], (instregex "ST4i(32)$")>; +def : InstRW<[M1WriteVSTH, + WriteAdr], (instregex "ST4i(32)_POST$")>; +def : InstRW<[M1WriteVSTF], (instregex "ST4i(64)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST4i(64)_POST$")>; + +def : InstRW<[M1WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[M1WriteVSTF, + WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; +def : 
InstRW<[M1WriteVSTI], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[M1WriteVSTI, + WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[M1WriteVSTI, + WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; // Cryptography instructions. def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; -def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>; +def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>; +def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>; def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>; def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td new file mode 100644 index 0000000..9a0cb70 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td @@ -0,0 +1,352 @@ +//==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM ThunderX T8X +// (T88, T81, T83) processors. +// Loosely based on Cortex-A53 which is somewhat similar. +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details. + +// Cavium ThunderX T8X scheduling machine model. +def ThunderXT8XModel : SchedMachineModel { + let IssueWidth = 2; // 2 micro-ops dispatched per cycle. + let MicroOpBufferSize = 0; // ThunderX T88/T81/T83 are in-order. + let LoadLatency = 3; // Optimistic load latency. + let MispredictPenalty = 8; // Branch mispredict penalty. + let PostRAScheduler = 1; // Use PostRA scheduler. + let CompleteModel = 1; +} + +// Modeling each pipeline with BufferSize == 0 since T8X is in-order. +def THXT8XUnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU +def THXT8XUnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC +def THXT8XUnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division +def THXT8XUnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store +def THXT8XUnitBr : ProcResource<1> { let BufferSize = 0; } // Branch +def THXT8XUnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU +def THXT8XUnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mul/Div/Sqrt + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types mapping the ProcResources and +// latencies. 
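Each definition below either binds a generic write type via WriteRes or introduces a named SchedWriteRes, tying it to one of the in-order units declared above with a result latency and, where the unit is busy for more than one cycle, a ResourceCycles count. A minimal sketch of that shape; the name THXT8XWriteExample and its numbers are invented for illustration and are not part of the patch:

// Illustrative sketch only -- not part of the patch; name and values invented.
def THXT8XWriteExample : SchedWriteRes<[THXT8XUnitFPALU]> {
  let Latency = 6;          // result is available 6 cycles after issue
  let ResourceCycles = [4]; // the FP ALU stays occupied for 4 of those cycles
}

Because every unit is declared with BufferSize = 0, an occupied unit stalls dispatch instead of buffering micro-ops, which is how the model expresses that the T8X cores are in-order.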
+ +let SchedModel = ThunderXT8XModel in { + +// ALU +def : WriteRes<WriteImm, [THXT8XUnitALU]> { let Latency = 1; } +def : WriteRes<WriteI, [THXT8XUnitALU]> { let Latency = 1; } +def : WriteRes<WriteISReg, [THXT8XUnitALU]> { let Latency = 2; } +def : WriteRes<WriteIEReg, [THXT8XUnitALU]> { let Latency = 2; } +def : WriteRes<WriteIS, [THXT8XUnitALU]> { let Latency = 2; } +def : WriteRes<WriteExtr, [THXT8XUnitALU]> { let Latency = 2; } + +// MAC +def : WriteRes<WriteIM32, [THXT8XUnitMAC]> { + let Latency = 4; + let ResourceCycles = [1]; +} + +def : WriteRes<WriteIM64, [THXT8XUnitMAC]> { + let Latency = 4; + let ResourceCycles = [1]; +} + +// Div +def : WriteRes<WriteID32, [THXT8XUnitDiv]> { + let Latency = 12; + let ResourceCycles = [6]; +} + +def : WriteRes<WriteID64, [THXT8XUnitDiv]> { + let Latency = 14; + let ResourceCycles = [8]; +} + +// Load +def : WriteRes<WriteLD, [THXT8XUnitLdSt]> { let Latency = 3; } +def : WriteRes<WriteLDIdx, [THXT8XUnitLdSt]> { let Latency = 3; } +def : WriteRes<WriteLDHi, [THXT8XUnitLdSt]> { let Latency = 3; } + +// Vector Load +def : WriteRes<WriteVLD, [THXT8XUnitLdSt]> { + let Latency = 8; + let ResourceCycles = [3]; +} + +def THXT8XWriteVLD1 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 6; + let ResourceCycles = [1]; +} + +def THXT8XWriteVLD2 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 11; + let ResourceCycles = [7]; +} + +def THXT8XWriteVLD3 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 12; + let ResourceCycles = [8]; +} + +def THXT8XWriteVLD4 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 13; + let ResourceCycles = [9]; +} + +def THXT8XWriteVLD5 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 13; + let ResourceCycles = [9]; +} + +// Pre/Post Indexing +def : WriteRes<WriteAdr, []> { let Latency = 0; } + +// Store +def : WriteRes<WriteST, [THXT8XUnitLdSt]> { let Latency = 1; } +def : WriteRes<WriteSTP, [THXT8XUnitLdSt]> { let Latency = 1; } +def : WriteRes<WriteSTIdx, [THXT8XUnitLdSt]> { let Latency = 1; } +def : WriteRes<WriteSTX, [THXT8XUnitLdSt]> { let Latency = 1; } + +// Vector Store +def : WriteRes<WriteVST, [THXT8XUnitLdSt]>; +def THXT8XWriteVST1 : SchedWriteRes<[THXT8XUnitLdSt]>; + +def THXT8XWriteVST2 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 10; + let ResourceCycles = [9]; +} + +def THXT8XWriteVST3 : SchedWriteRes<[THXT8XUnitLdSt]> { + let Latency = 11; + let ResourceCycles = [10]; +} + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// Branch +def : WriteRes<WriteBr, [THXT8XUnitBr]>; +def THXT8XWriteBR : SchedWriteRes<[THXT8XUnitBr]>; +def : WriteRes<WriteBrReg, [THXT8XUnitBr]>; +def THXT8XWriteBRR : SchedWriteRes<[THXT8XUnitBr]>; +def THXT8XWriteRET : SchedWriteRes<[THXT8XUnitALU]>; +def : WriteRes<WriteSys, [THXT8XUnitBr]>; +def : WriteRes<WriteBarrier, [THXT8XUnitBr]>; +def : WriteRes<WriteHint, [THXT8XUnitBr]>; + +// FP ALU +def : WriteRes<WriteF, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCmp, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCvt, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCopy, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFImm, [THXT8XUnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteV, [THXT8XUnitFPALU]> { let Latency = 6; } + +// FP Mul, Div, Sqrt +def : WriteRes<WriteFMul, [THXT8XUnitFPMDS]> { let Latency = 6; } +def : WriteRes<WriteFDiv, [THXT8XUnitFPMDS]> { + let Latency = 22; + let ResourceCycles = [19]; +} + +def THXT8XWriteFMAC : SchedWriteRes<[THXT8XUnitFPMDS]> { let Latency = 10; } + +def 
THXT8XWriteFDivSP : SchedWriteRes<[THXT8XUnitFPMDS]> { + let Latency = 12; + let ResourceCycles = [9]; +} + +def THXT8XWriteFDivDP : SchedWriteRes<[THXT8XUnitFPMDS]> { + let Latency = 22; + let ResourceCycles = [19]; +} + +def THXT8XWriteFSqrtSP : SchedWriteRes<[THXT8XUnitFPMDS]> { + let Latency = 17; + let ResourceCycles = [14]; +} + +def THXT8XWriteFSqrtDP : SchedWriteRes<[THXT8XUnitFPMDS]> { + let Latency = 31; + let ResourceCycles = [28]; +} + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedRead types. + +// No forwarding for these reads. +def : ReadAdvance<ReadExtrHi, 1>; +def : ReadAdvance<ReadAdrBase, 2>; +def : ReadAdvance<ReadVLD, 2>; + +// FIXME: This needs more targeted benchmarking. +// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable +// operands are needed one cycle later if and only if they are to be +// shifted. Otherwise, they too are needed two cycles later. This same +// ReadAdvance applies to Extended registers as well, even though there is +// a separate SchedPredicate for them. +def : ReadAdvance<ReadI, 2, [WriteImm, WriteI, + WriteISReg, WriteIEReg, WriteIS, + WriteID32, WriteID64, + WriteIM32, WriteIM64]>; +def THXT8XReadShifted : SchedReadAdvance<1, [WriteImm, WriteI, + WriteISReg, WriteIEReg, WriteIS, + WriteID32, WriteID64, + WriteIM32, WriteIM64]>; +def THXT8XReadNotShifted : SchedReadAdvance<2, [WriteImm, WriteI, + WriteISReg, WriteIEReg, WriteIS, + WriteID32, WriteID64, + WriteIM32, WriteIM64]>; +def THXT8XReadISReg : SchedReadVariant<[ + SchedVar<RegShiftedPred, [THXT8XReadShifted]>, + SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>; +def : SchedAlias<ReadISReg, THXT8XReadISReg>; + +def THXT8XReadIEReg : SchedReadVariant<[ + SchedVar<RegExtendedPred, [THXT8XReadShifted]>, + SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>; +def : SchedAlias<ReadIEReg, THXT8XReadIEReg>; + +// MAC - Operands are generally needed one cycle later in the MAC pipe. +// Accumulator operands are needed two cycles later. +def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI, + WriteISReg, WriteIEReg, WriteIS, + WriteID32, WriteID64, + WriteIM32, WriteIM64]>; +def : ReadAdvance<ReadIMA, 2, [WriteImm, WriteI, + WriteISReg, WriteIEReg, WriteIS, + WriteID32, WriteID64, + WriteIM32, WriteIM64]>; + +// Div +def : ReadAdvance<ReadID, 1, [WriteImm, WriteI, + WriteISReg, WriteIEReg, WriteIS, + WriteID32, WriteID64, + WriteIM32, WriteIM64]>; + +//===----------------------------------------------------------------------===// +// Subtarget-specific InstRW. 
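The SchedReadAdvance values above translate directly into shorter effective dependence latencies: a consumer whose operand is read N cycles after issue sees the producer's latency reduced by N, never below zero. A worked example under the definitions above:

// Worked example (illustrative only):
//   producer: WriteISReg on THXT8XUnitALU      -> Latency = 2
//   consumer reads it as a shifted operand     -> THXT8XReadShifted, advance 1
//     effective dependence latency = 2 - 1 = 1 cycle
//   consumer reads it as a non-shifted operand -> THXT8XReadNotShifted, advance 2
//     effective dependence latency = 2 - 2 = 0 cycles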
+ +//--- +// Branch +//--- +def : InstRW<[THXT8XWriteBR], (instregex "^B")>; +def : InstRW<[THXT8XWriteBR], (instregex "^BL")>; +def : InstRW<[THXT8XWriteBR], (instregex "^B.*")>; +def : InstRW<[THXT8XWriteBR], (instregex "^CBNZ")>; +def : InstRW<[THXT8XWriteBR], (instregex "^CBZ")>; +def : InstRW<[THXT8XWriteBR], (instregex "^TBNZ")>; +def : InstRW<[THXT8XWriteBR], (instregex "^TBZ")>; +def : InstRW<[THXT8XWriteBRR], (instregex "^BR")>; +def : InstRW<[THXT8XWriteBRR], (instregex "^BLR")>; + +//--- +// Ret +//--- +def : InstRW<[THXT8XWriteRET], (instregex "^RET")>; + +//--- +// Miscellaneous +//--- +def : InstRW<[WriteI], (instrs COPY)>; + +//--- +// Vector Loads +//--- +def : InstRW<[THXT8XWriteVLD1], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[THXT8XWriteVLD1], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[THXT8XWriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +def : InstRW<[THXT8XWriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[THXT8XWriteVLD3], (instregex "LD3Threev(2d)$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[THXT8XWriteVLD2], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[THXT8XWriteVLD4], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : 
InstRW<[THXT8XWriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +//--- +// Vector Stores +//--- +def : InstRW<[THXT8XWriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[THXT8XWriteVST1], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[THXT8XWriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST3Threev(2d)$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[THXT8XWriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[THXT8XWriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[THXT8XWriteVST2], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +//--- +// Floating Point MAC, DIV, SQRT +//--- +def : InstRW<[THXT8XWriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[THXT8XWriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[THXT8XWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[THXT8XWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[THXT8XWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[THXT8XWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[THXT8XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[THXT8XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td new file mode 100644 index 0000000..10df50b --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -0,0 +1,1745 @@ +//=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for Cavium ThunderX2T99 +// processors. +// Based on Broadcom Vulcan. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// 2. Pipeline Description. + +def ThunderX2T99Model : SchedMachineModel { + let IssueWidth = 4; // 4 micro-ops dispatched at a time. + let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. + // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 32; + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; +} + +// Define the issue ports. + +// Port 0: ALU, FP/SIMD. +def THX2T99P0 : ProcResource<1>; + +// Port 1: ALU, FP/SIMD, integer mul/div. +def THX2T99P1 : ProcResource<1>; + +// Port 2: ALU, Branch. +def THX2T99P2 : ProcResource<1>; + +// Port 3: Store data. +def THX2T99P3 : ProcResource<1>; + +// Port 4: Load/store. +def THX2T99P4 : ProcResource<1>; + +// Port 5: Load/store. +def THX2T99P5 : ProcResource<1>; + +let SchedModel = ThunderX2T99Model in { + +// Define groups for the functional units on each issue port. Each group +// created will be used by a WriteRes later on. +// +// NOTE: Some groups only contain one member. This is a way to create names for +// the various functional units that share a single issue port. For example, +// "THX2T99I1" for ALU ops on port 1 and "THX2T99F1" for FP ops on port 1. + +// Integer divide and multiply micro-ops only on port 1. +def THX2T99I1 : ProcResGroup<[THX2T99P1]>; + +// Branch micro-ops only on port 2. +def THX2T99I2 : ProcResGroup<[THX2T99P2]>; + +// ALU micro-ops on ports 0, 1, and 2. +def THX2T99I012 : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2]>; + +// Crypto FP/SIMD micro-ops only on port 1. +def THX2T99F1 : ProcResGroup<[THX2T99P1]>; + +// FP/SIMD micro-ops on ports 0 and 1. +def THX2T99F01 : ProcResGroup<[THX2T99P0, THX2T99P1]>; + +// Store data micro-ops only on port 3. +def THX2T99SD : ProcResGroup<[THX2T99P3]>; + +// Load/store micro-ops on ports 4 and 5. +def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>; + +// 60 entry unified scheduler. +def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2, + THX2T99P3, THX2T99P4, THX2T99P5]> { + let BufferSize = 60; +} + +// Define commonly used write types for InstRW specializations. +// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>. + +// 3 cycles on I1. +def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 1 cycles on I2. +def THX2T99Write_1Cyc_I2 : SchedWriteRes<[THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 4 cycles on I1. +def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 23 cycles on I1. +def THX2T99Write_23Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; +} + +// 39 cycles on I1. +def THX2T99Write_39Cyc_I1 : SchedWriteRes<[THX2T99I1]> { + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; +} + +// 1 cycle on I0, I1, or I2. 
+def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on I0, I1, or I2. +def THX2T99Write_2Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on I0, I1, or I2. +def THX2T99Write_4Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 3; +} + +// 5 cycles on I0, I1, or I2. +def THX2T99Write_5Cyc_I012 : SchedWriteRes<[THX2T99I012]> { + let Latency = 2; + let NumMicroOps = 3; +} + +// 5 cycles on F1. +def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 7 cycles on F1. +def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { + let Latency = 7; + let NumMicroOps = 2; +} + +// 4 cycles on F0 or F1. +def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on F0 or F1. +def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 6 cycles on F0 or F1. +def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 7 cycles on F0 or F1. +def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 8 cycles on F0 or F1. +def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 10 cycles on F0 or F1. +def THX2T99Write_10Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 10; + let NumMicroOps = 3; +} + +// 16 cycles on F0 or F1. +def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 16; + let NumMicroOps = 3; + let ResourceCycles = [8]; +} + +// 23 cycles on F0 or F1. +def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> { + let Latency = 23; + let NumMicroOps = 3; + let ResourceCycles = [11]; +} + +// 1 cycles on LS0 or LS1. +def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 0; +} + +// 1 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_1Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 0; + let NumMicroOps = 2; +} + +// 1 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_1Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 0; + let NumMicroOps = 3; +} + +// 2 cycles on LS0 or LS1. +def THX2T99Write_2Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 4 cycles on LS0 or LS1. +def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 4; + let NumMicroOps = 4; +} + +// 5 cycles on LS0 or LS1. +def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0 or LS1. +def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 4 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_4Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 4 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_4Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2. 
+def THX2T99Write_5Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0 or LS1 and I0, I1, or I2. +def THX2T99Write_6Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> { + let Latency = 6; + let NumMicroOps = 4; +} + +// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2. +def THX2T99Write_6Cyc_LS01_I012_I012 : + SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 1 cycles on LS0 or LS1 and F0 or F1. +def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 5 cycles on LS0 or LS1 and F0 or F1. +def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0 or LS1 and F0 or F1. +def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 7 cycles on LS0 or LS1 and F0 or F1. +def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 8 cycles on LS0 or LS1 and F0 or F1. +def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; +} + +//===----------------------------------------------------------------------===// +// 3. Instruction Tables. 
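The tables below use two mechanisms: WriteRes definitions give the default mapping of the generic write types (WriteBr, WriteI, WriteLD, ...) onto T99 resources, and InstRW entries override individual instructions with one of the named THX2T99Write_<NumCycles>Cyc_<Resources> writes defined above, where the suffix names the issue-port groups consumed (for example, _5Cyc_LS01_I012 is a 5-cycle write using the LS0/LS1 group plus the I0/I1/I2 group). A representative override, quoted from the multiply-high entry further down:

// def : InstRW<[THX2T99Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>;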
+ +let SchedModel = ThunderX2T99Model in { + +//--- +// 3.1 Branch Instructions +//--- + +// Branch, immed +// Branch and link, immed +// Compare and branch +def : WriteRes<WriteBr, [THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes<WriteBrReg, [THX2T99I2]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteAtomic, []> { + let Unsupported = 1; + let NumMicroOps = 2; +} + +//--- +// Branch +//--- +def : InstRW<[THX2T99Write_1Cyc_I2], (instrs B, BL, BR, BLR)>; +def : InstRW<[THX2T99Write_1Cyc_I2], (instrs RET)>; +def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^B.*")>; +def : InstRW<[THX2T99Write_1Cyc_I2], + (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>; + +//--- +// 3.2 Arithmetic and Logical Instructions +// 3.3 Move and Shift Instructions +//--- + + +// ALU, basic +// Conditional compare +// Conditional select +// Address generation +def : WriteRes<WriteI, [THX2T99I012]> { + let Latency = 1; + let ResourceCycles = [1, 3]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteI], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + +def : InstRW<[WriteI], (instrs COPY)>; + +// ALU, extend and/or shift +def : WriteRes<WriteISReg, [THX2T99I012]> { + let Latency = 2; + let ResourceCycles = [2, 3]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteISReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + +def : WriteRes<WriteIEReg, [THX2T99I012]> { + let Latency = 1; + let ResourceCycles = [1, 3]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteIEReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)", + "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)", + "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)", + "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)", + "CSNEG?(W|X)r(i|r|s|x)")>; + +// Move immed +def : WriteRes<WriteImm, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; + +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs ASRVWr, ASRVXr, 
LSLVWr, LSLVXr, RORVWr, RORVXr)>; + +// Variable shift +def : WriteRes<WriteIS, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +// Latency range of 13-23/13-39. +def : WriteRes<WriteID32, [THX2T99I1]> { + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; +} + +// Divide, X-form +def : WriteRes<WriteID64, [THX2T99I1]> { + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; +} + +// Multiply accumulate, W-form +def : WriteRes<WriteIM32, [THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// Multiply accumulate, X-form +def : WriteRes<WriteIM64, [THX2T99I012]> { + let Latency = 5; + let NumMicroOps = 3; +} + +//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX2T99Write_5Cyc_I012], +// (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[THX2T99Write_5Cyc_I012], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; + +def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; + +// Bitfield extract, two reg +def : WriteRes<WriteExtr, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Multiply high +def : InstRW<[THX2T99Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>; + +// Miscellaneous Data-Processing Instructions +// Bitfield extract +def : InstRW<[THX2T99Write_1Cyc_I012], (instrs EXTRWrri, EXTRXrri)>; + +// Bitifield move - basic +def : InstRW<[THX2T99Write_1Cyc_I012], + (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; + +// Bitfield move, insert +def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "^BFM")>; +def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "(S|U)?BFM.*")>; + +// Count leading +def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$", + "^CLZ(W|X)r$")>; + +// Reverse bits +def : InstRW<[THX2T99Write_1Cyc_I012], (instrs RBITWr, RBITXr)>; + +// Cryptography Extensions +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES[DE]")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AESI?MC")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1SU0")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1[CMP]")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256SU0")>; +def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256(H|H2|SU1)")>; + +// CRC Instructions +// def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>; +def : InstRW<[THX2T99Write_4Cyc_I1], + (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>; + +def : InstRW<[THX2T99Write_4Cyc_I1], + (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. + +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes<WriteLD, [THX2T99LS01]> { + let Latency = 4; + let NumMicroOps = 4; +} + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. +// Load register, immed pre-index +// NOTE: Handled by WriteLD, WriteAdr. 
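For the indexed loads in the tables that follow, each override lists one write per result the instruction produces: the named T99 write covers the loaded data and its load/store and ALU micro-ops, while WriteAdr (or WriteI for some post-indexed forms) covers the base-register writeback. A representative entry, quoted from the table below:

// def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRXpre)>;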
+def : WriteRes<WriteAdr, [THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Load pair, immed offset, normal +// Load pair, immed offset, signed words, base != SP +// Load pair, immed offset signed words, base = SP +// LDP only breaks into *one* LS micro-op. Thus +// the resources are handled by WriteLD. +def : WriteRes<WriteLDHi, []> { + let Latency = 5; + let NumMicroOps = 5; +} + +// Load register offset, basic +// Load register, register offset, scale by 4/8 +// Load register, register offset, scale by 2 +// Load register offset, extend +// Load register, register offset, extend, scale by 4/8 +// Load register, register offset, extend, scale by 2 +def THX2T99WriteLDIdx : SchedWriteVariant<[ + SchedVar<ScaledIdxPred, [THX2T99Write_6Cyc_LS01_I012_I012]>, + SchedVar<NoSchedPred, [THX2T99Write_5Cyc_LS01_I012]>]>; +def : SchedAlias<WriteLDIdx, THX2T99WriteLDIdx>; + +def THX2T99ReadAdrBase : SchedReadVariant<[ + SchedVar<ScaledIdxPred, [ReadDefault]>, + SchedVar<NoSchedPred, [ReadDefault]>]>; +def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>; + +// Load pair, immed pre-index, normal +// Load pair, immed pre-index, signed words +// Load pair, immed post-index, normal +// Load pair, immed post-index, signed words +// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPXi)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPWi)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPXi)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRBui)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDui)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRHui)>; +def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRQui)>; +def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRSui)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRQl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRWl)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRXl)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRXi)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSWi)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + 
(instrs LDPWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRXpre)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpost)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpost)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpre)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr], + (instrs LDPXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRBpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRHpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPXpre)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRWpre)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRXpre)>; + +def : 
InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr], + (instrs LDPXpost)>; + +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRBpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRDpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRHpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRQpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRSpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRWpost)>; +def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRXpost)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroW)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroX)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRBroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRDroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHHroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRQroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRWroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRXroW)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], 
+ (instrs LDRBroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRDroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRHHroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRQroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRSHXroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRWroX)>; +def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase], + (instrs LDRXroX)>; + +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBBi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURDi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHHi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURQi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHWi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHXi)>; +def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSWi)>; + +//--- +// Prefetch +//--- +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMl)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFUMi)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMui)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroW)>; +def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroX)>; + +//-- +// 3.7 Store Instructions +// 3.11 FP Store Instructions +//-- + +// Store register, unscaled immed +// Store register, immed unprivileged +// Store register, unsigned immed +def : WriteRes<WriteST, [THX2T99LS01, THX2T99SD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store register, immed post-index +// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase + +// Store register, immed pre-index +// NOTE: Handled by WriteAdr, WriteST + +// Store register, register offset, basic +// Store register, register offset, scaled by 4/8 +// Store register, register offset, scaled by 2 +// Store register, register offset, extend +// Store register, register offset, extend, scale by 4/8 +// Store register, register offset, extend, scale by 1 +def : WriteRes<WriteSTIdx, [THX2T99LS01, THX2T99SD, THX2T99I012]> { + let Latency = 1; + let NumMicroOps = 3; +} + +// Store pair, immed offset, W-form +// Store pair, immed offset, X-form +def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store pair, immed post-index, W-form +// Store pair, immed post-index, X-form +// Store pair, immed pre-index, W-form +// Store pair, immed pre-index, X-form +// NOTE: Handled by WriteAdr, WriteSTP. 
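The indexed-store overrides below mirror that pattern from the other side: WriteAdr models the base-register writeback, which is the only register result a store produces, the named write supplies the load/store-unit, store-data and ALU resources, and ReadAdrBase, where it appears, is the scheduling read attached to the base-address operand. A representative entry, quoted from the table below:

// def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
//              (instrs STPXpre, STPXpost)>;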
+ +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBBi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHHi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURSi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURWi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURXi)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRBi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRHi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRWi)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRXi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPXi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPWi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPDi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPQi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPXi)>; +def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPWi)>; + +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRBui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRBui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRDui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRDui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRHui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRHui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRQui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRQui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRXui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRXui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRWui)>; +def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRWui)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs 
STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STPXpre, STPXpost)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012], + (instrs STRXpre, STRXpost)>; +def : 
InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRXpre, STRXpost)>; + +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase], + (instrs STRXroW, STRXroX)>; +def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase], + (instrs STRXroW, STRXroX)>; + +//--- +// 3.8 FP Data Processing Instructions +//--- + +// FP absolute value +// FP min/max +// FP negate +def : WriteRes<WriteF, [THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// FP arithmetic +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>; + +// FP compare +def : WriteRes<WriteFCmp, [THX2T99F01]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// FP Mul, Div, Sqrt +def : WriteRes<WriteFDiv, [THX2T99F01]> { + let Latency = 22; + let ResourceCycles = [19]; +} + +def THX2T99XWriteFDiv : SchedWriteRes<[THX2T99F01]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFDivSP : SchedWriteRes<[THX2T99F01]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFDivDP : SchedWriteRes<[THX2T99F01]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFSqrtSP : SchedWriteRes<[THX2T99F01]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX2T99XWriteFSqrtDP : SchedWriteRes<[THX2T99F01]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +// FP divide, S-form +// FP square root, S-form +def : InstRW<[THX2T99XWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[THX2T99XWriteFSqrtSP], (instrs FSQRTSr)>; +def : InstRW<[THX2T99XWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[THX2T99XWriteFSqrtSP], 
(instregex "^.*SQRT.*32$")>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSrr")>; + +// FP divide, D-form +// FP square root, D-form +def : InstRW<[THX2T99XWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[THX2T99XWriteFSqrtDP], (instrs FSQRTDr)>; +def : InstRW<[THX2T99XWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[THX2T99XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDrr")>; + +// FP multiply +// FP multiply accumulate +def : WriteRes<WriteFMul, [THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX2T99XWriteFMul : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX2T99XWriteFMulAcc : SchedWriteRes<[THX2T99F01]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def : InstRW<[THX2T99XWriteFMul], (instregex "^FMUL", "^FNMUL")>; +def : InstRW<[THX2T99XWriteFMulAcc], + (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>; + +// FP round to integral +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// FP select +def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>; + +//--- +// 3.9 FP Miscellaneous Instructions +//--- + +// FP convert, from vec to vec reg +// FP convert, from gen to vec reg +// FP convert, from vec to gen reg +def : WriteRes<WriteFCvt, [THX2T99F01]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// FP move, immed +// FP move, register +def : WriteRes<WriteFImm, [THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// FP transfer, from gen to vec reg +// FP transfer, from vec to gen reg +def : WriteRes<WriteFCopy, [THX2T99F01]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; + +//--- +// 3.12 ASIMD Integer Instructions +//--- + +// ASIMD absolute diff, D-form +// ASIMD absolute diff, Q-form +// ASIMD absolute diff accum, D-form +// ASIMD absolute diff accum, Q-form +// ASIMD absolute diff accum long +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD compare +// ASIMD logical (AND, BIC, EOR) +// ASIMD max/min, basic +// ASIMD max/min, reduce, 4H/4S +// ASIMD max/min, reduce, 8B/8H +// ASIMD max/min, reduce, 16B +// ASIMD multiply, D-form +// ASIMD multiply, Q-form +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +// ASIMD multiply long +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +// ASIMD shift by immed, basic +// ASIMD shift by immed and insert, basic, D-form +// ASIMD shift by immed and insert, basic, Q-form +// ASIMD shift by immed, complex +// ASIMD shift by register, basic, D-form +// ASIMD shift by register, basic, Q-form +// ASIMD shift by register, complex, D-form +// ASIMD shift by register, complex, Q-form +def : WriteRes<WriteV, [THX2T99F01]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [4, 23]; +} + +// ASIMD arith, reduce, 4H/4S +// ASIMD arith, reduce, 8B/8H +// ASIMD arith, reduce, 16B + +// ASIMD logical (MOV, MVN, ORN, ORR) +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^ANDv", "^BICv", "^EORv", "^MOVv", "^MVNv", + "^ORRv", "^ORNv", "^NOTv")>; +// ASIMD arith, reduce +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; + +// ASIMD polynomial (8x8) multiply long +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^(S|U|SQD)MULL")>; +def : 
InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD absolute diff accum, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]ABAL")>; +// ASIMD arith, reduce, 4H/4S +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B/16H +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU]?ADDL?Vv16i8v$")>; +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B/16H +def : InstRW<[THX2T99Write_10Cyc_F01], + (instregex "^[SU](MIN|MAX)Vv16i8v$")>; +// ASIMD multiply, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^(P?MUL|SQR?DMULH)" # + "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" # + "(_indexed)?$")>; +// ASIMD multiply, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD multiply accumulate, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD shift accumulate +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// ASIMD shift by immed, basic +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv", + "SQSHRNv","SQSHRUNv", "UQRSHRNv", + "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; +// ASIMD shift by immed, complex +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by register, complex, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU][QR]{1,2}SHL" # + "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD Arithmetic +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(RADD|RSUB)HNv.*")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD", + "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" # + "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def : 
InstRW<[THX2T99Write_5Cyc_F01], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADALP","^UADALP")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLPv","^UADDLPv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLV","^UADDLV")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SUQADDv","^USQADDv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv", + "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", + "^SRHADD", "^SUBHNv", "^SUQADD", + "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^CMEQv","^CMGEv","^CMGTv", + "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv", + "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>; +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>; + +//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith,pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP round, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP convert, other, Q-form +// NOTE: Handled by WriteV. 
+ +// ASIMD FP convert, long and narrow +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv2f32")>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>; +def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv4f32")>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>; +def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "FDIVv2f64")>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP multiply accumulate, Dform, FZ +// ASIMD FP multiply accumulate, Dform, no FZ +// ASIMD FP multiply accumulate, Qform, FZ +// ASIMD FP multiply accumulate, Qform, no FZ +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[THX2T99Write_6Cyc_F01], + (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP negate +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>; + +//-- +// 3.14 ASIMD Miscellaneous Instructions +//-- + +// ASIMD bit reverse +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; + +// ASIMD bitwise insert, D-form +// ASIMD bitwise insert, Q-form +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^BIFv", "^BITv", "^BSLv")>; + +// ASIMD count, D-form +// ASIMD count, Q-form +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^CLSv", "^CLZv", "^CNTv")>; + +// ASIMD duplicate, gen reg +// ASIMD duplicate, element +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>; + +// ASIMD extract +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>; + +// ASIMD extract narrow +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^XTNv")>; + +// ASIMD extract narrow, saturating +def : InstRW<[THX2T99Write_7Cyc_F01], + (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>; + +// ASIMD insert, element to element +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>; + +// ASIMD move, integer immed +def : InstRW<[THX2T99Write_5Cyc_F01], 
(instregex "^MOVIv", "^MOVIDv")>; + +// ASIMD move, FP immed +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>; + +// ASIMD table lookup, D-form +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Four")>; + +// ASIMD table lookup, Q-form +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Four")>; + +// ASIMD transpose +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1", "^TRN2")>; + +// ASIMD unzip/zip +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; + +// ASIMD reciprocal estimate, D-form +// ASIMD reciprocal estimate, Q-form +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", + "^FRSQRTEv", "^URSQRTEv")>; + +// ASIMD reciprocal step, D-form, FZ +// ASIMD reciprocal step, D-form, no FZ +// ASIMD reciprocal step, Q-form, FZ +// ASIMD reciprocal step, Q-form, no FZ +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD reverse +def : InstRW<[THX2T99Write_5Cyc_F01], + (instregex "^REV16v", "^REV32v", "^REV64v")>; + +// ASIMD table lookup, D-form +// ASIMD table lookup, Q-form +def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; + +// ASIMD transfer, element to word or word +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "(S|U)MOVv.*")>; + +// ASIMD transfer gen reg to element +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>; + +// ASIMD transpose +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", + "^UZP1v", "^UZP2v")>; + +// ASIMD unzip/zip +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; + +//-- +// 3.15 ASIMD Load Instructions +//-- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[THX2T99Write_4Cyc_LS01], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[THX2T99Write_4Cyc_LS01], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[THX2T99Write_5Cyc_LS01], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[THX2T99Write_6Cyc_LS01], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : 
InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[THX2T99Write_5Cyc_LS01_F01], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[THX2T99Write_8Cyc_LS01_F01], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lane, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[THX2T99Write_7Cyc_LS01_F01], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[THX2T99Write_8Cyc_LS01_F01], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[THX2T99Write_6Cyc_LS01_F01], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], + 
(instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[THX2T99Write_1Cyc_LS01], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[THX2T99Write_1Cyc_LS01], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[THX2T99Write_1Cyc_LS01], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[THX2T99Write_1Cyc_LS01], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST4i(8|16|32|64)_POST$")>; + 
+} // SchedModel = ThunderX2T99Model + diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td deleted file mode 100644 index 35a40c3..0000000 --- a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td +++ /dev/null @@ -1,852 +0,0 @@ -//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// 1. Introduction -// -// This file defines the machine model for Broadcom Vulcan to support -// instruction scheduling and other instruction cost heuristics. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// 2. Pipeline Description. - -def VulcanModel : SchedMachineModel { - let IssueWidth = 4; // 4 micro-ops dispatched at a time. - let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. - let LoadLatency = 4; // Optimistic load latency. - let MispredictPenalty = 12; // Extra cycles for mispredicted branch. - // Determined via a mix of micro-arch details and experimentation. - let LoopMicroOpBufferSize = 32; - let PostRAScheduler = 1; // Using PostRA sched. - let CompleteModel = 1; -} - -// Define the issue ports. - -// Port 0: ALU, FP/SIMD. -def VulcanP0 : ProcResource<1>; - -// Port 1: ALU, FP/SIMD, integer mul/div. -def VulcanP1 : ProcResource<1>; - -// Port 2: ALU, Branch. -def VulcanP2 : ProcResource<1>; - -// Port 3: Store data. -def VulcanP3 : ProcResource<1>; - -// Port 4: Load/store. -def VulcanP4 : ProcResource<1>; - -// Port 5: Load/store. -def VulcanP5 : ProcResource<1>; - -let SchedModel = VulcanModel in { - -// Define groups for the functional units on each issue port. Each group -// created will be used by a WriteRes later on. -// -// NOTE: Some groups only contain one member. This is a way to create names for -// the various functional units that share a single issue port. For example, -// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for FP ops on port 1. - -// Integer divide and multiply micro-ops only on port 1. -def VulcanI1 : ProcResGroup<[VulcanP1]>; - -// Branch micro-ops only on port 2. -def VulcanI2 : ProcResGroup<[VulcanP2]>; - -// ALU micro-ops on ports 0, 1, and 2. -def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>; - -// Crypto FP/SIMD micro-ops only on port 1. -def VulcanF1 : ProcResGroup<[VulcanP1]>; - -// FP/SIMD micro-ops on ports 0 and 1. -def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>; - -// Store data micro-ops only on port 3. -def VulcanSD : ProcResGroup<[VulcanP3]>; - -// Load/store micro-ops on ports 4 and 5. -def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>; - -// 60 entry unified scheduler. -def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2, - VulcanP3, VulcanP4, VulcanP5]> { - let BufferSize=60; -} - -// Define commonly used write types for InstRW specializations. -// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>. - -// 3 cycles on I1. -def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; } - -// 4 cycles on I1. -def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; } - -// 1 cycle on I0, I1, or I2. -def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; } - -// 5 cycles on F1. 
-def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; } - -// 7 cycles on F1. -def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; } - -// 4 cycles on F0 or F1. -def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; } - -// 5 cycles on F0 or F1. -def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; } - -// 6 cycles on F0 or F1. -def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; } - -// 7 cycles on F0 or F1. -def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; } - -// 8 cycles on F0 or F1. -def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; } - -// 16 cycles on F0 or F1. -def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> { - let Latency = 16; - let ResourceCycles = [8]; -} - -// 23 cycles on F0 or F1. -def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> { - let Latency = 23; - let ResourceCycles = [11]; -} - -// 1 cycles on LS0 or LS1. -def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; } - -// 4 cycles on LS0 or LS1. -def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; } - -// 5 cycles on LS0 or LS1. -def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; } - -// 6 cycles on LS0 or LS1. -def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; } - -// 5 cycles on LS0 or LS1 and I0, I1, or I2. -def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> { - let Latency = 5; - let NumMicroOps = 2; -} - -// 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2. -def VulcanWrite_6Cyc_LS01_I012_I012 : - SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> { - let Latency = 6; - let NumMicroOps = 3; -} - -// 1 cycles on LS0 or LS1 and F0 or F1. -def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { - let Latency = 1; - let NumMicroOps = 2; -} - -// 5 cycles on LS0 or LS1 and F0 or F1. -def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { - let Latency = 5; - let NumMicroOps = 2; -} - -// 6 cycles on LS0 or LS1 and F0 or F1. -def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { - let Latency = 6; - let NumMicroOps = 2; -} - -// 7 cycles on LS0 or LS1 and F0 or F1. -def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { - let Latency = 7; - let NumMicroOps = 2; -} - -// 8 cycles on LS0 or LS1 and F0 or F1. -def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { - let Latency = 8; - let NumMicroOps = 2; -} - -// Define commonly used read types. - -// No forwarding is provided for these types. -def : ReadAdvance<ReadI, 0>; -def : ReadAdvance<ReadISReg, 0>; -def : ReadAdvance<ReadIEReg, 0>; -def : ReadAdvance<ReadIM, 0>; -def : ReadAdvance<ReadIMA, 0>; -def : ReadAdvance<ReadID, 0>; -def : ReadAdvance<ReadExtrHi, 0>; -def : ReadAdvance<ReadAdrBase, 0>; -def : ReadAdvance<ReadVLD, 0>; - -} - - -//===----------------------------------------------------------------------===// -// 3. Instruction Tables. 
- -let SchedModel = VulcanModel in { - -//--- -// 3.1 Branch Instructions -//--- - -// Branch, immed -// Branch and link, immed -// Compare and branch -def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; } - -def : WriteRes<WriteSys, []> { let Latency = 1; } -def : WriteRes<WriteBarrier, []> { let Latency = 1; } -def : WriteRes<WriteHint, []> { let Latency = 1; } - -def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } - -// Branch, register -// Branch and link, register != LR -// Branch and link, register = LR -def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; } - -//--- -// 3.2 Arithmetic and Logical Instructions -// 3.3 Move and Shift Instructions -//--- - -// ALU, basic -// Conditional compare -// Conditional select -// Address generation -def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; } -def : InstRW<[WriteI], (instrs COPY)>; - -// ALU, extend and/or shift -def : WriteRes<WriteISReg, [VulcanI012]> { - let Latency = 2; - let ResourceCycles = [2]; -} - -def : WriteRes<WriteIEReg, [VulcanI012]> { - let Latency = 2; - let ResourceCycles = [2]; -} - -// Move immed -def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; } - -// Variable shift -def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; } - -//--- -// 3.4 Divide and Multiply Instructions -//--- - -// Divide, W-form -// Latency range of 13-23. Take the average. -def : WriteRes<WriteID32, [VulcanI1]> { - let Latency = 18; - let ResourceCycles = [18]; -} - -// Divide, X-form -// Latency range of 13-39. Take the average. -def : WriteRes<WriteID64, [VulcanI1]> { - let Latency = 26; - let ResourceCycles = [26]; -} - -// Multiply accumulate, W-form -def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; } - -// Multiply accumulate, X-form -def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; } - -// Bitfield extract, two reg -def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; } - -// Bitfield move, basic -// Bitfield move, insert -// NOTE: Handled by WriteIS. - -// Count leading -def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$", - "^CLZ(W|X)r$")>; - -// Reverse bits/bytes -// NOTE: Handled by WriteI. - -//--- -// 3.6 Load Instructions -// 3.10 FP Load Instructions -//--- - -// Load register, literal -// Load register, unscaled immed -// Load register, immed unprivileged -// Load register, unsigned immed -def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; } - -// Load register, immed post-index -// NOTE: Handled by WriteLD, WriteI. -// Load register, immed pre-index -// NOTE: Handled by WriteLD, WriteAdr. -def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; } - -// Load register offset, basic -// Load register, register offset, scale by 4/8 -// Load register, register offset, scale by 2 -// Load register offset, extend -// Load register, register offset, extend, scale by 4/8 -// Load register, register offset, extend, scale by 2 -def VulcanWriteLDIdx : SchedWriteVariant<[ - SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>, - SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>; -def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>; - -def VulcanReadAdrBase : SchedReadVariant<[ - SchedVar<ScaledIdxPred, [ReadDefault]>, - SchedVar<NoSchedPred, [ReadDefault]>]>; -def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>; - -// Load pair, immed offset, normal -// Load pair, immed offset, signed words, base != SP -// Load pair, immed offset signed words, base = SP -// LDP only breaks into *one* LS micro-op. Thus -// the resources are handling by WriteLD. 
-def : WriteRes<WriteLDHi, []> { - let Latency = 5; -} - -// Load pair, immed pre-index, normal -// Load pair, immed pre-index, signed words -// Load pair, immed post-index, normal -// Load pair, immed post-index, signed words -// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. - -//-- -// 3.7 Store Instructions -// 3.11 FP Store Instructions -//-- - -// Store register, unscaled immed -// Store register, immed unprivileged -// Store register, unsigned immed -def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> { - let Latency = 1; - let NumMicroOps = 2; -} - -// Store register, immed post-index -// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase - -// Store register, immed pre-index -// NOTE: Handled by WriteAdr, WriteST - -// Store register, register offset, basic -// Store register, register offset, scaled by 4/8 -// Store register, register offset, scaled by 2 -// Store register, register offset, extend -// Store register, register offset, extend, scale by 4/8 -// Store register, register offset, extend, scale by 1 -def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> { - let Latency = 1; - let NumMicroOps = 3; -} - -// Store pair, immed offset, W-form -// Store pair, immed offset, X-form -def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> { - let Latency = 1; - let NumMicroOps = 2; -} - -// Store pair, immed post-index, W-form -// Store pair, immed post-index, X-form -// Store pair, immed pre-index, W-form -// Store pair, immed pre-index, X-form -// NOTE: Handled by WriteAdr, WriteSTP. - -//--- -// 3.8 FP Data Processing Instructions -//--- - -// FP absolute value -// FP min/max -// FP negate -def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; } - -// FP arithmetic -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>; - -// FP compare -def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; } - -// FP divide, S-form -// FP square root, S-form -def : WriteRes<WriteFDiv, [VulcanF01]> { - let Latency = 16; - let ResourceCycles = [8]; -} - -// FP divide, D-form -// FP square root, D-form -def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>; - -// FP multiply -// FP multiply accumulate -def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; } - -// FP round to integral -def : InstRW<[VulcanWrite_7Cyc_F01], - (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; - -// FP select -def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>; - -//--- -// 3.9 FP Miscellaneous Instructions -//--- - -// FP convert, from vec to vec reg -// FP convert, from gen to vec reg -// FP convert, from vec to gen reg -def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; } - -// FP move, immed -// FP move, register -def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; } - -// FP transfer, from gen to vec reg -// FP transfer, from vec to gen reg -def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; } -def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; - -//--- -// 3.12 ASIMD Integer Instructions -//--- - -// ASIMD absolute diff, D-form -// ASIMD absolute diff, Q-form -// ASIMD absolute diff accum, D-form -// ASIMD absolute diff accum, Q-form -// ASIMD absolute diff accum long -// ASIMD absolute diff long -// ASIMD arith, basic -// ASIMD arith, complex -// ASIMD compare -// ASIMD logical (AND, BIC, EOR) -// ASIMD max/min, basic -// ASIMD max/min, reduce, 4H/4S -// ASIMD max/min, reduce, 8B/8H -// ASIMD max/min, reduce, 16B -// ASIMD multiply, D-form -// ASIMD multiply, Q-form -// ASIMD multiply accumulate long -// ASIMD multiply accumulate 
saturating long -// ASIMD multiply long -// ASIMD pairwise add and accumulate -// ASIMD shift accumulate -// ASIMD shift by immed, basic -// ASIMD shift by immed and insert, basic, D-form -// ASIMD shift by immed and insert, basic, Q-form -// ASIMD shift by immed, complex -// ASIMD shift by register, basic, D-form -// ASIMD shift by register, basic, Q-form -// ASIMD shift by register, complex, D-form -// ASIMD shift by register, complex, Q-form -def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; } - -// ASIMD arith, reduce, 4H/4S -// ASIMD arith, reduce, 8B/8H -// ASIMD arith, reduce, 16B -def : InstRW<[VulcanWrite_5Cyc_F01], - (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; - -// ASIMD logical (MOV, MVN, ORN, ORR) -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>; - -// ASIMD polynomial (8x8) multiply long -def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>; - -//--- -// 3.13 ASIMD Floating-point Instructions -//--- - -// ASIMD FP absolute value -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>; - -// ASIMD FP arith, normal, D-form -// ASIMD FP arith, normal, Q-form -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>; - -// ASIMD FP arith,pairwise, D-form -// ASIMD FP arith, pairwise, Q-form -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>; - -// ASIMD FP compare, D-form -// ASIMD FP compare, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", - "^FCMGTv", "^FCMLEv", - "^FCMLTv")>; - -// ASIMD FP convert, long -// ASIMD FP convert, narrow -// ASIMD FP convert, other, D-form -// ASIMD FP convert, other, Q-form -// NOTE: Handled by WriteV. - -// ASIMD FP divide, D-form, F32 -def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>; - -// ASIMD FP divide, Q-form, F32 -def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>; - -// ASIMD FP divide, Q-form, F64 -def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>; - -// ASIMD FP max/min, normal, D-form -// ASIMD FP max/min, normal, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv", - "^FMINv", "^FMINNMv")>; - -// ASIMD FP max/min, pairwise, D-form -// ASIMD FP max/min, pairwise, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv", - "^FMINPv", "^FMINNMPv")>; - -// ASIMD FP max/min, reduce -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", - "^FMINVv", "^FMINNMVv")>; - -// ASIMD FP multiply, D-form, FZ -// ASIMD FP multiply, D-form, no FZ -// ASIMD FP multiply, Q-form, FZ -// ASIMD FP multiply, Q-form, no FZ -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; - -// ASIMD FP multiply accumulate, Dform, FZ -// ASIMD FP multiply accumulate, Dform, no FZ -// ASIMD FP multiply accumulate, Qform, FZ -// ASIMD FP multiply accumulate, Qform, no FZ -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; - -// ASIMD FP negate -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>; - -// ASIMD FP round, D-form -// ASIMD FP round, Q-form -// NOTE: Handled by WriteV. 
- -//-- -// 3.14 ASIMD Miscellaneous Instructions -//-- - -// ASIMD bit reverse -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>; - -// ASIMD bitwise insert, D-form -// ASIMD bitwise insert, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>; - -// ASIMD count, D-form -// ASIMD count, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>; - -// ASIMD duplicate, gen reg -// ASIMD duplicate, element -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>; - -// ASIMD extract -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>; - -// ASIMD extract narrow -// ASIMD extract narrow, saturating -// NOTE: Handled by WriteV. - -// ASIMD insert, element to element -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; - -// ASIMD move, integer immed -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>; - -// ASIMD move, FP immed -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>; - -// ASIMD reciprocal estimate, D-form -// ASIMD reciprocal estimate, Q-form -def : InstRW<[VulcanWrite_5Cyc_F01], - (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", - "^FRSQRTEv", "^URSQRTEv")>; - -// ASIMD reciprocal step, D-form, FZ -// ASIMD reciprocal step, D-form, no FZ -// ASIMD reciprocal step, Q-form, FZ -// ASIMD reciprocal step, Q-form, no FZ -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; - -// ASIMD reverse -def : InstRW<[VulcanWrite_5Cyc_F01], - (instregex "^REV16v", "^REV32v", "^REV64v")>; - -// ASIMD table lookup, D-form -// ASIMD table lookup, Q-form -def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; - -// ASIMD transfer, element to word or word -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>; - -// ASIMD transfer, element to gen reg -def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>; - -// ASIMD transfer gen reg to element -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; - -// ASIMD transpose -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", - "^UZP1v", "^UZP2v")>; - -// ASIMD unzip/zip -def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; - -//-- -// 3.15 ASIMD Load Instructions -//-- - -// ASIMD load, 1 element, multiple, 1 reg, D-form -// ASIMD load, 1 element, multiple, 1 reg, Q-form -def : InstRW<[VulcanWrite_4Cyc_LS01], - (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], - (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 1 element, multiple, 2 reg, D-form -// ASIMD load, 1 element, multiple, 2 reg, Q-form -def : InstRW<[VulcanWrite_4Cyc_LS01], - (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], - (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 1 element, multiple, 3 reg, D-form -// ASIMD load, 1 element, multiple, 3 reg, Q-form -def : InstRW<[VulcanWrite_5Cyc_LS01], - (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr], - (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 1 element, multiple, 4 reg, D-form -// ASIMD load, 1 element, multiple, 4 reg, Q-form -def : InstRW<[VulcanWrite_6Cyc_LS01], - (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr], - (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 1 element, one lane, B/H/S -// ASIMD load, 1 element, one lane, D 
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], - (instregex "^LD1i(8|16|32|64)_POST$")>; - -// ASIMD load, 1 element, all lanes, D-form, B/H/S -// ASIMD load, 1 element, all lanes, D-form, D -// ASIMD load, 1 element, all lanes, Q-form -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], - (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], - (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 2 element, multiple, D-form, B/H/S -// ASIMD load, 2 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], - (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], - (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 2 element, one lane, B/H -// ASIMD load, 2 element, one lane, S -// ASIMD load, 2 element, one lane, D -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], - (instregex "^LD2i(8|16|32|64)_POST$")>; - -// ASIMD load, 2 element, all lanes, D-form, B/H/S -// ASIMD load, 2 element, all lanes, D-form, D -// ASIMD load, 2 element, all lanes, Q-form -def : InstRW<[VulcanWrite_5Cyc_LS01_F01], - (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], - (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 3 element, multiple, D-form, B/H/S -// ASIMD load, 3 element, multiple, Q-form, B/H/S -// ASIMD load, 3 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_8Cyc_LS01_F01], - (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], - (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 3 element, one lone, B/H -// ASIMD load, 3 element, one lane, S -// ASIMD load, 3 element, one lane, D -def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], - (instregex "^LD3i(8|16|32|64)_POST$")>; - -// ASIMD load, 3 element, all lanes, D-form, B/H/S -// ASIMD load, 3 element, all lanes, D-form, D -// ASIMD load, 3 element, all lanes, Q-form, B/H/S -// ASIMD load, 3 element, all lanes, Q-form, D -def : InstRW<[VulcanWrite_7Cyc_LS01_F01], - (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], - (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 4 element, multiple, D-form, B/H/S -// ASIMD load, 4 element, multiple, Q-form, B/H/S -// ASIMD load, 4 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_8Cyc_LS01_F01], - (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], - (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD load, 4 element, one lane, B/H -// ASIMD load, 4 element, one lane, S -// ASIMD load, 4 element, one lane, D -def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], - (instregex "^LD4i(8|16|32|64)_POST$")>; - -// ASIMD load, 4 element, all lanes, D-form, B/H/S -// ASIMD load, 4 element, all lanes, D-form, D -// ASIMD load, 4 element, all lanes, Q-form, B/H/S -// ASIMD load, 4 element, all lanes, Q-form, D -def : InstRW<[VulcanWrite_6Cyc_LS01_F01], - (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], - (instregex 
"^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -//-- -// 3.16 ASIMD Store Instructions -//-- - -// ASIMD store, 1 element, multiple, 1 reg, D-form -// ASIMD store, 1 element, multiple, 1 reg, Q-form -def : InstRW<[VulcanWrite_1Cyc_LS01], - (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], - (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 1 element, multiple, 2 reg, D-form -// ASIMD store, 1 element, multiple, 2 reg, Q-form -def : InstRW<[VulcanWrite_1Cyc_LS01], - (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], - (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 1 element, multiple, 3 reg, D-form -// ASIMD store, 1 element, multiple, 3 reg, Q-form -def : InstRW<[VulcanWrite_1Cyc_LS01], - (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], - (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 1 element, multiple, 4 reg, D-form -// ASIMD store, 1 element, multiple, 4 reg, Q-form -def : InstRW<[VulcanWrite_1Cyc_LS01], - (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], - (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 1 element, one lane, B/H/S -// ASIMD store, 1 element, one lane, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], - (instregex "^ST1i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], - (instregex "^ST1i(8|16|32|64)_POST$")>; - -// ASIMD store, 2 element, multiple, D-form, B/H/S -// ASIMD store, 2 element, multiple, Q-form, B/H/S -// ASIMD store, 2 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], - (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], - (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 2 element, one lane, B/H/S -// ASIMD store, 2 element, one lane, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], - (instregex "^ST2i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], - (instregex "^ST2i(8|16|32|64)_POST$")>; - -// ASIMD store, 3 element, multiple, D-form, B/H/S -// ASIMD store, 3 element, multiple, Q-form, B/H/S -// ASIMD store, 3 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], - (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], - (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 3 element, one lane, B/H -// ASIMD store, 3 element, one lane, S -// ASIMD store, 3 element, one lane, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], - (instregex "^ST3i(8|16|32|64)_POST$")>; - -// ASIMD store, 4 element, multiple, D-form, B/H/S -// ASIMD store, 4 element, multiple, Q-form, B/H/S -// ASIMD store, 4 element, multiple, Q-form, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], - (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], - (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; - -// ASIMD store, 4 element, one lane, B/H -// ASIMD store, 4 element, one lane, S -// ASIMD store, 4 element, one lane, D -def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; -def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], - (instregex "^ST4i(8|16|32|64)_POST$")>; - -//-- -// 3.17 Cryptography 
Extensions -//-- - -// Crypto AES ops -def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>; - -// Crypto polynomial (64x64) multiply long -def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>; - -// Crypto SHA1 xor ops -// Crypto SHA1 schedule acceleration ops -// Crypto SHA256 schedule acceleration op (1 u-op) -// Crypto SHA256 schedule acceleration op (2 u-ops) -// Crypto SHA256 hash acceleration ops -def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>; - -//-- -// 3.18 CRC -//-- - -// CRC checksum ops -def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>; - -} // SchedModel = VulcanModel diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 66a8f33..7f55073 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -42,10 +42,12 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( Entry.Node = Size; Args.push_back(Entry); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args)) - .setDiscardResult(); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), + std::move(Args)) + .setDiscardResult(); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -53,7 +55,5 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( } bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner( CodeGenOpt::Level OptLevel) const { - if (OptLevel >= CodeGenOpt::Aggressive) - return true; - return false; + return OptLevel >= CodeGenOpt::Aggressive; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 03e0132..ea61124 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -12,8 +12,22 @@ //===----------------------------------------------------------------------===// #include "AArch64Subtarget.h" + +#include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64PBQPRegAlloc.h" +#include "AArch64TargetMachine.h" + +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "AArch64CallLowering.h" +#include "AArch64LegalizerInfo.h" +#include "AArch64RegisterBankInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#endif #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/TargetRegistry.h" @@ -35,6 +49,11 @@ static cl::opt<bool> UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " "an address is ignored"), cl::init(false), cl::Hidden); +static cl::opt<bool> + UseNonLazyBind("aarch64-enable-nonlazybind", + cl::desc("Call nonlazybind functions via direct GOT load"), + cl::init(false), cl::Hidden); + AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS, StringRef CPUString) { @@ -62,6 +81,7 @@ void AArch64Subtarget::initializeProperties() { break; case CortexA57: MaxInterleaveFactor = 4; + PrefFunctionAlignment = 4; break; case ExynosM1: MaxInterleaveFactor = 4; @@ -71,7 +91,12 
@@ void AArch64Subtarget::initializeProperties() { break; case Falkor: MaxInterleaveFactor = 4; - VectorInsertExtractBaseCost = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; + CacheLineSize = 128; + PrefetchDistance = 820; + MinPrefetchStride = 2048; + MaxPrefetchIterationsAhead = 8; break; case Kryo: MaxInterleaveFactor = 4; @@ -80,25 +105,99 @@ void AArch64Subtarget::initializeProperties() { PrefetchDistance = 740; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 11; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; - case Vulcan: + case ThunderX2T99: + CacheLineSize = 64; + PrefFunctionAlignment = 3; + PrefLoopAlignment = 2; MaxInterleaveFactor = 4; + PrefetchDistance = 128; + MinPrefetchStride = 1024; + MaxPrefetchIterationsAhead = 4; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; + break; + case ThunderX: + case ThunderXT88: + case ThunderXT81: + case ThunderXT83: + CacheLineSize = 128; + PrefFunctionAlignment = 3; + PrefLoopAlignment = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case CortexA35: break; case CortexA53: break; - case CortexA72: break; - case CortexA73: break; + case CortexA72: + PrefFunctionAlignment = 4; + break; + case CortexA73: + PrefFunctionAlignment = 4; + break; case Others: break; } } +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { + +struct AArch64GISelActualAccessor : public GISelAccessor { + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; + +} // end anonymous namespace +#endif + AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + : AArch64GenSubtargetInfo(TT, CPU, FS), + ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), - TLInfo(TM, *this), GISel() {} + TLInfo(TM, *this), GISel() { +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *AArch64GISel = new GISelAccessor(); +#else + AArch64GISelActualAccessor *AArch64GISel = new AArch64GISelActualAccessor(); + AArch64GISel->CallLoweringInfo.reset( + new AArch64CallLowering(*getTargetLowering())); + AArch64GISel->Legalizer.reset(new AArch64LegalizerInfo()); + + auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo()); + + // FIXME: At this point, we can't rely on Subtarget having RBI. + // It's awkward to mix passing RBI and the Subtarget; should we pass + // TII/TRI as well? 
+ AArch64GISel->InstSelector.reset(createAArch64InstructionSelector( + *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI)); + + AArch64GISel->RegBankInfo.reset(RBI); +#endif + setGISelAccessor(*AArch64GISel); +} const CallLowering *AArch64Subtarget::getCallLowering() const { assert(GISel && "Access to GlobalISel APIs not set"); @@ -133,9 +232,26 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) return AArch64II::MO_GOT; - // The small code mode's direct accesses use ADRP, which cannot necessarily - // produce the value 0 (if the code is above 4GB). - if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) + // The small code model's direct accesses use ADRP, which cannot + // necessarily produce the value 0 (if the code is above 4GB). + if (useSmallAddressing() && GV->hasExternalWeakLinkage()) + return AArch64II::MO_GOT; + + return AArch64II::MO_NO_FLAG; +} + +unsigned char AArch64Subtarget::classifyGlobalFunctionReference( + const GlobalValue *GV, const TargetMachine &TM) const { + // MachO large model always goes via a GOT, because we don't have the + // relocations available to do anything else.. + if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() && + !GV->hasInternalLinkage()) + return AArch64II::MO_GOT; + + // NonLazyBind goes via GOT unless we know it's available locally. + auto *F = dyn_cast<Function>(GV); + if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) && + !TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) return AArch64II::MO_GOT; return AArch64II::MO_NO_FLAG; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h index a993402..5a1f45e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -45,7 +45,11 @@ public: ExynosM1, Falkor, Kryo, - Vulcan + ThunderX2T99, + ThunderX, + ThunderXT81, + ThunderXT83, + ThunderXT88 }; protected: @@ -61,9 +65,12 @@ protected: bool HasCRC = false; bool HasLSE = false; bool HasRAS = false; + bool HasRDM = false; bool HasPerfMon = false; bool HasFullFP16 = false; bool HasSPE = false; + bool HasLSLFast = false; + bool HasSVE = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove = false; @@ -73,6 +80,13 @@ protected: // StrictAlign - Disallow unaligned memory accesses. bool StrictAlign = false; + + // NegativeImmediates - transform instructions with negative immediates + bool NegativeImmediates = true; + + // Enable 64-bit vectorization in SLP. + unsigned MinVectorRegisterBitWidth = 64; + bool UseAA = false; bool PredictableSelectIsExpensive = false; bool BalanceFPOps = false; @@ -83,6 +97,8 @@ protected: bool UseAlternateSExtLoadCVTF32Pattern = false; bool HasArithmeticBccFusion = false; bool HasArithmeticCbzFusion = false; + bool HasFuseAES = false; + bool HasFuseLiterals = false; bool DisableLatencySchedHeuristic = false; bool UseRSqrt = false; uint8_t MaxInterleaveFactor = 2; @@ -94,6 +110,7 @@ protected: unsigned PrefFunctionAlignment = 0; unsigned PrefLoopAlignment = 0; unsigned MaxJumpTableSize = 0; + unsigned WideningBaseCost = 0; // ReserveX18 - X18 is not available as a general purpose register. 
bool ReserveX18; @@ -176,6 +193,10 @@ public: bool isXRaySupported() const override { return true; } + unsigned getMinVectorRegisterBitWidth() const { + return MinVectorRegisterBitWidth; + } + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } @@ -183,6 +204,7 @@ public: bool hasCRC() const { return HasCRC; } bool hasLSE() const { return HasLSE; } bool hasRAS() const { return HasRAS; } + bool hasRDM() const { return HasRDM; } bool balanceFPOps() const { return BalanceFPOps; } bool predictableSelectIsExpensive() const { return PredictableSelectIsExpensive; @@ -195,6 +217,15 @@ public: } bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; } bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; } + bool hasFuseAES() const { return HasFuseAES; } + bool hasFuseLiterals() const { return HasFuseLiterals; } + + /// \brief Return true if the CPU supports any kind of instruction fusion. + bool hasFusion() const { + return hasArithmeticBccFusion() || hasArithmeticCbzFusion() || + hasFuseAES() || hasFuseLiterals(); + } + bool useRSqrt() const { return UseRSqrt; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } unsigned getVectorInsertExtractBaseCost() const { @@ -211,6 +242,8 @@ public: unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } + unsigned getWideningBaseCost() const { return WideningBaseCost; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; @@ -218,6 +251,8 @@ public: bool hasPerfMon() const { return HasPerfMon; } bool hasFullFP16() const { return HasFullFP16; } bool hasSPE() const { return HasSPE; } + bool hasLSLFast() const { return HasLSLFast; } + bool hasSVE() const { return HasSVE; } bool isLittleEndian() const { return IsLittle; } @@ -226,6 +261,7 @@ public: bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } bool isTargetAndroid() const { return TargetTriple.isAndroid(); } + bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } @@ -233,9 +269,17 @@ public: bool useAA() const override { return UseAA; } - /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size - /// that still makes it profitable to inline the call. - unsigned getMaxInlineSizeThreshold() const { return 64; } + bool useSmallAddressing() const { + switch (TLInfo.getTargetMachine().getCodeModel()) { + case CodeModel::Kernel: + // Kernel is currently allowed only for Fuchsia targets, + // where it is the same as Small for almost all purposes. + case CodeModel::Small: + return true; + default: + return false; + } + } /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. 
@@ -246,6 +290,9 @@ public: unsigned char ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const; + unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, + const TargetMachine &TM) const; + /// This function returns the name of a function which has an interface /// like the non-standard bzero function, if such a function exists on /// the current subtarget and it is considered prefereable over @@ -259,6 +306,17 @@ public: bool enableEarlyIfConversion() const override; std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override; + + bool isCallingConvWin64(CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + return isTargetWindows(); + case CallingConv::Win64: + return true; + default: + return false; + } + } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td index a3736c0..7c5dcb0 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -18,35 +18,37 @@ include "llvm/TableGen/SearchableTable.td" // AT (address translate) instruction options. //===----------------------------------------------------------------------===// -class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, +class AT<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> : SearchableTable { let SearchableFields = ["Name", "Encoding"]; let EnumValueField = "Encoding"; string Name = name; - bits<16> Encoding; - let Encoding{15-14} = op0; + bits<14> Encoding; let Encoding{13-11} = op1; let Encoding{10-7} = crn; let Encoding{6-3} = crm; let Encoding{2-0} = op2; + code Requires = [{ {} }]; } -def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>; -def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>; -def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>; -def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>; -def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>; -def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>; -def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>; -def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>; -def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>; -def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>; -def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>; -def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>; -def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>; -def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>; - +def : AT<"S1E1R", 0b000, 0b0111, 0b1000, 0b000>; +def : AT<"S1E2R", 0b100, 0b0111, 0b1000, 0b000>; +def : AT<"S1E3R", 0b110, 0b0111, 0b1000, 0b000>; +def : AT<"S1E1W", 0b000, 0b0111, 0b1000, 0b001>; +def : AT<"S1E2W", 0b100, 0b0111, 0b1000, 0b001>; +def : AT<"S1E3W", 0b110, 0b0111, 0b1000, 0b001>; +def : AT<"S1E0R", 0b000, 0b0111, 0b1000, 0b010>; +def : AT<"S1E0W", 0b000, 0b0111, 0b1000, 0b011>; +def : AT<"S12E1R", 0b100, 0b0111, 0b1000, 0b100>; +def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>; +def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>; +def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>; + +let Requires = [{ {AArch64::HasV8_2aOps} }] in { +def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>; +def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>; +} //===----------------------------------------------------------------------===// // DMB/DSB (data barrier) instruction options. @@ -77,28 +79,31 @@ def : DB<"sy", 0xf>; // DC (data cache maintenance) instruction options. 
//===----------------------------------------------------------------------===// -class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, +class DC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2> : SearchableTable { let SearchableFields = ["Name", "Encoding"]; let EnumValueField = "Encoding"; string Name = name; - bits<16> Encoding; - let Encoding{15-14} = op0; + bits<14> Encoding; let Encoding{13-11} = op1; let Encoding{10-7} = crn; let Encoding{6-3} = crm; let Encoding{2-0} = op2; + code Requires = [{ {} }]; } -def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>; -def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>; -def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>; -def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>; -def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>; -def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>; -def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>; -def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>; +def : DC<"ZVA", 0b011, 0b0111, 0b0100, 0b001>; +def : DC<"IVAC", 0b000, 0b0111, 0b0110, 0b001>; +def : DC<"ISW", 0b000, 0b0111, 0b0110, 0b010>; +def : DC<"CVAC", 0b011, 0b0111, 0b1010, 0b001>; +def : DC<"CSW", 0b000, 0b0111, 0b1010, 0b010>; +def : DC<"CVAU", 0b011, 0b0111, 0b1011, 0b001>; +def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>; +def : DC<"CISW", 0b000, 0b0111, 0b1110, 0b010>; + +let Requires = [{ {AArch64::HasV8_2aOps} }] in +def : DC<"CVAP", 0b011, 0b0111, 0b1100, 0b001>; //===----------------------------------------------------------------------===// // IC (instruction cache maintenance) instruction options. @@ -120,7 +125,7 @@ class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>; def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>; -def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>; +def : IC<"IVAU", 0b011, 0b0111, 0b0101, 0b001, 1>; //===----------------------------------------------------------------------===// // ISB (instruction-fetch barrier) instruction options. @@ -213,14 +218,13 @@ def : PSB<"csync", 0x11>; // TLBI (translation lookaside buffer invalidate) instruction options. 
//===----------------------------------------------------------------------===// -class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, +class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, bit needsreg = 1> : SearchableTable { let SearchableFields = ["Name", "Encoding"]; let EnumValueField = "Encoding"; string Name = name; - bits<16> Encoding; - let Encoding{15-14} = op0; + bits<14> Encoding; let Encoding{13-11} = op1; let Encoding{10-7} = crn; let Encoding{6-3} = crm; @@ -228,38 +232,38 @@ class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, bit NeedsReg = needsreg; } -def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>; -def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>; -def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>; -def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>; -def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>; -def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>; -def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>; -def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>; -def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>; -def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>; -def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>; -def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>; -def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>; -def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>; -def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>; -def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>; -def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>; -def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>; -def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>; -def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>; -def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>; -def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>; -def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>; -def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>; -def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>; -def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>; +def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>; +def : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>; +def : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>; +def : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>; +def : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>; +def : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>; +def : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>; +def : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>; +def : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>; +def : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>; +def : TLBI<"IPAS2LE1", 
0b100, 0b1000, 0b0100, 0b101>; +def : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>; +def : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>; +def : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>; +def : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>; +def : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>; +def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>; +def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>; //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index d288394..ba28c01 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -10,23 +10,20 @@ // //===----------------------------------------------------------------------===// +#include "AArch64TargetMachine.h" #include "AArch64.h" -#include "AArch64CallLowering.h" -#include "AArch64InstructionSelector.h" -#include "AArch64LegalizerInfo.h" -#include "AArch64RegisterBankInfo.h" +#include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" @@ -50,6 +47,11 @@ static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp", cl::desc("Enable the CCMP formation pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableCondBrTuning("aarch64-enable-cond-br-tune", + cl::desc("Enable the conditional branch tuning pass"), + cl::init(true), cl::Hidden); + static cl::opt<bool> EnableMCR("aarch64-enable-mcr", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); @@ -113,11 +115,6 @@ EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden, cl::init(false)); static cl::opt<bool> - EnableAddressTypePromotion("aarch64-enable-type-promotion", cl::Hidden, - cl::desc("Enable the type promotion pass"), - cl::init(true)); - -static cl::opt<bool> EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), cl::init(false)); @@ -136,6 +133,14 @@ static cl::opt<bool> cl::desc("Enable the loop data prefetch pass"), cl::init(true)); +static cl::opt<int> EnableGlobalISelAtO( + "aarch64-enable-global-isel-at-O", cl::Hidden, + cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), + cl::init(-1)); + +static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. 
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget()); @@ -145,7 +150,6 @@ extern "C" void LLVMInitializeAArch64Target() { initializeGlobalISel(*PR); initializeAArch64A53Fix835769Pass(*PR); initializeAArch64A57FPLoadBalancingPass(*PR); - initializeAArch64AddressTypePromotionPass(*PR); initializeAArch64AdvSIMDScalarPass(*PR); initializeAArch64CollectLOHPass(*PR); initializeAArch64ConditionalComparesPass(*PR); @@ -157,6 +161,8 @@ extern "C" void LLVMInitializeAArch64Target() { initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); + initializeFalkorHWPFFixPass(*PR); + initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); } @@ -166,6 +172,8 @@ extern "C" void LLVMInitializeAArch64Target() { static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) return llvm::make_unique<AArch64_MachoTargetObjectFile>(); + if (TT.isOSBinFormatCOFF()) + return llvm::make_unique<AArch64_COFFTargetObjectFile>(); return llvm::make_unique<AArch64_ELFTargetObjectFile>(); } @@ -178,6 +186,8 @@ static std::string computeDataLayout(const Triple &TT, return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128"; if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; + if (TT.isOSBinFormatCOFF()) + return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; @@ -215,35 +225,6 @@ AArch64TargetMachine::AArch64TargetMachine( AArch64TargetMachine::~AArch64TargetMachine() = default; -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct AArch64GISelActualAccessor : public GISelAccessor { - std::unique_ptr<CallLowering> CallLoweringInfo; - std::unique_ptr<InstructionSelector> InstSelector; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; - -} // end anonymous namespace -#endif - const AArch64Subtarget * AArch64TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -264,25 +245,6 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { resetTargetOptions(F); I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, isLittle); -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - AArch64GISelActualAccessor *GISel = - new AArch64GISelActualAccessor(); - GISel->CallLoweringInfo.reset( - new AArch64CallLowering(*I->getTargetLowering())); - GISel->Legalizer.reset(new AArch64LegalizerInfo()); - - auto *RBI = new AArch64RegisterBankInfo(*I->getRegisterInfo()); - - // FIXME: At this point, we can't rely on Subtarget having RBI. - // It's awkward to mix passing RBI and the Subtarget; should we pass - // TII/TRI as well? 
- GISel->InstSelector.reset(new AArch64InstructionSelector(*this, *I, *RBI)); - - GISel->RegBankInfo.reset(RBI); -#endif - I->setGISelAccessor(*GISel); } return I.get(); } @@ -308,9 +270,9 @@ namespace { /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: - AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) + AArch64PassConfig(AArch64TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM.getOptLevel() != CodeGenOpt::None) substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } @@ -320,13 +282,29 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { + const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>(); ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createMacroFusionDAGMutation(DAG->TII)); + if (ST.hasFusion()) + DAG->addMutation(createAArch64MacroFusionDAGMutation()); return DAG; } + ScheduleDAGInstrs * + createPostMachineScheduler(MachineSchedContext *C) const override { + const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>(); + if (ST.hasFusion()) { + // Run the Macro Fusion after RA again since literals are expanded from + // pseudos then (v. addPreSched2()). + ScheduleDAGMI *DAG = createGenericSchedPostRA(C); + DAG->addMutation(createAArch64MacroFusionDAGMutation()); + return DAG; + } + + return nullptr; + } + void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; @@ -334,6 +312,7 @@ public: bool addIRTranslator() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; + void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; #endif bool addILPOpts() override; @@ -341,6 +320,8 @@ public: void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + + bool isGlobalISelEnabled() const override; }; } // end anonymous namespace @@ -352,13 +333,13 @@ TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { } TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new AArch64PassConfig(this, PM); + return new AArch64PassConfig(*this, PM); } void AArch64PassConfig::addIRPasses() { // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg // ourselves. - addPass(createAtomicExpandPass(TM)); + addPass(createAtomicExpandPass()); // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in @@ -370,14 +351,18 @@ void AArch64PassConfig::addIRPasses() { // // Run this before LSR to remove the multiplies involved in computing the // pointer values N iterations ahead. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) - addPass(createLoopDataPrefetchPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorMarkStridedAccessesPass()); + } TargetPassConfig::addIRPasses(); // Match interleaved memory accesses to ldN/stN intrinsics. 
if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createInterleavedAccessPass(TM)); + addPass(createInterleavedAccessPass()); if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { // Call SeparateConstOffsetFromGEP pass to extract constants within indices @@ -410,9 +395,6 @@ bool AArch64PassConfig::addPreISel() { addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize)); } - if (TM->getOptLevel() != CodeGenOpt::None && EnableAddressTypePromotion) - addPass(createAArch64AddressTypePromotionPass()); - return false; } @@ -444,12 +426,22 @@ bool AArch64PassConfig::addRegBankSelect() { return false; } +void AArch64PassConfig::addPreGlobalInstructionSelect() { + // Workaround the deficiency of the fast register allocator. + if (TM->getOptLevel() == CodeGenOpt::None) + addPass(new Localizer()); +} + bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; } #endif +bool AArch64PassConfig::isGlobalISelEnabled() const { + return TM->getOptLevel() <= EnableGlobalISelAtO; +} + bool AArch64PassConfig::addILPOpts() { if (EnableCondOpt) addPass(createAArch64ConditionOptimizerPass()); @@ -457,6 +449,8 @@ bool AArch64PassConfig::addILPOpts() { addPass(createAArch64ConditionalCompares()); if (EnableMCR) addPass(&MachineCombinerID); + if (EnableCondBrTuning) + addPass(createAArch64CondBrTuning()); if (EnableEarlyIfConversion) addPass(&EarlyIfConverterID); if (EnableStPairSuppress) @@ -493,8 +487,12 @@ void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) - addPass(createAArch64LoadStoreOptimizationPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorHWPFFixPass()); + } } void AArch64PassConfig::addPreEmitPass() { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 6fa5e83..85de02e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -21,6 +21,8 @@ namespace llvm { +class AArch64RegisterBankInfo; + class AArch64TargetMachine : public LLVMTargetMachine { protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; @@ -34,6 +36,9 @@ public: ~AArch64TargetMachine() override; const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some, targets is + // deprecated and should not be used. 
+ const AArch64Subtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 8875f9b..4bc2c06 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -9,12 +9,12 @@ #include "AArch64TargetObjectFile.h" #include "AArch64TargetMachine.h" +#include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Mangler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Dwarf.h" using namespace llvm; using namespace dwarf; @@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); return MCBinaryExpr::createSub(Res, PC, getContext()); } + +void AArch64_MachoTargetObjectFile::getNameWithPrefix( + SmallVectorImpl<char> &OutName, const GlobalValue *GV, + const TargetMachine &TM) const { + // AArch64 does not use section-relative relocations so any global symbol must + // be accessed via at least a linker-private symbol. + getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h index 05e1dfa..9077eb7 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -40,8 +40,14 @@ public: const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; + + void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV, + const TargetMachine &TM) const override; }; +/// This implementation is used for AArch64 COFF targets. +class AArch64_COFFTargetObjectFile : public TargetLoweringObjectFileCOFF {}; + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b8833e5..a76f080 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -9,8 +9,8 @@ #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64AddressingModes.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" @@ -20,6 +20,23 @@ using namespace llvm; #define DEBUG_TYPE "aarch64tti" +static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", + cl::init(true), cl::Hidden); + +bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // Inline a callee if its target-features are a subset of the callers + // target-features. + return (CallerBits & CalleeBits) == CalleeBits; +} + /// \brief Calculate the cost of materializing a 64-bit value. 
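
[Editorial aside, not part of the patch.] The areInlineCompatible hook added just above boils down to the subset test (CallerBits & CalleeBits) == CalleeBits. The self-contained sketch below illustrates that test; std::bitset stands in for LLVM's FeatureBitset purely for illustration.

// Editorial sketch: the feature-subset check behind areInlineCompatible,
// with std::bitset standing in for FeatureBitset.
#include <bitset>
#include <cassert>

int main() {
  std::bitset<8> CallerBits("01101110"); // caller's target-features
  std::bitset<8> CalleeBits("00101100"); // callee's target-features
  // The callee may be inlined only if every feature it requires is also
  // enabled for the caller.
  assert((CallerBits & CalleeBits) == CalleeBits);
  return 0;
}
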
This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. @@ -176,10 +193,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, + ArrayRef<const Value *> Args) { + + // A helper that returns a vector type from the given type. The number of + // elements in type Ty determine the vector width. + auto toVectorTy = [&](Type *ArgTy) { + return VectorType::get(ArgTy->getScalarType(), + DstTy->getVectorNumElements()); + }; + + // Exit early if DstTy is not a vector type whose elements are at least + // 16-bits wide. + if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) + return false; + + // Determine if the operation has a widening variant. We consider both the + // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the + // instructions. + // + // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we + // verify that their extending operands are eliminated during code + // generation. + switch (Opcode) { + case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). + case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + break; + default: + return false; + } + + // To be a widening instruction (either the "wide" or "long" versions), the + // second operand must be a sign- or zero extend having a single user. We + // only consider extends having a single user because they may otherwise not + // be eliminated. + if (Args.size() != 2 || + (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || + !Args[1]->hasOneUse()) + return false; + auto *Extend = cast<CastInst>(Args[1]); + + // Legalize the destination type and ensure it can be used in a widening + // operation. + auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); + unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); + if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) + return false; + + // Legalize the source type and ensure it can be used in a widening + // operation. + Type *SrcTy = toVectorTy(Extend->getSrcTy()); + auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); + unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); + if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) + return false; + + // Get the total number of vector elements in the legalized types. + unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements(); + unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements(); + + // Return true if the legalized types have the same number of vector elements + // and the destination element type size is twice that of the source type. + return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; +} + +int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + // If the cast is observable, and it is used by a widening instruction (e.g., + // uaddl, saddw, etc.), it may be free. + if (I && I->hasOneUse()) { + auto *SingleUser = cast<Instruction>(*I->user_begin()); + SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); + if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { + // If the cast is the second operand, it is free. 
We will generate either + // a "wide" or "long" version of the widening instruction. + if (I == SingleUser->getOperand(1)) + return 0; + // If the cast is not the second operand, it will be free if it looks the + // same as the second operand. In this case, we will generate a "long" + // version of the widening instruction. + if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) + if (I->getOpcode() == Cast->getOpcode() && + cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) + return 0; + } + } + EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); @@ -378,6 +480,16 @@ int AArch64TTIImpl::getArithmeticInstrCost( // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), + // add in the widening overhead specified by the sub-target. Since the + // extends feeding widening instructions are performed automatically, they + // aren't present in the generated code and have a zero cost. By adding a + // widening overhead here, we attach the total cost of the combined operation + // to the widening instruction. + int Cost = 0; + if (isWideningInstruction(Ty, Opcode, Args)) + Cost += ST->getWideningBaseCost(); + int ISD = TLI->InstructionOpcodeToISD(Opcode); if (ISD == ISD::SDIV && @@ -387,9 +499,9 @@ int AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. - int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -404,8 +516,8 @@ int AArch64TTIImpl::getArithmeticInstrCost( switch (ISD) { default: - return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: @@ -413,7 +525,7 @@ int AArch64TTIImpl::getArithmeticInstrCost( case ISD::AND: // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering. 
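
[Editorial aside, not part of the patch.] The getCastInstrCost and getArithmeticInstrCost hunks above both hinge on extends that fold into widening operations (uaddl/saddw-style). The sketch below is a hypothetical C++ loop (the name widening_add is illustrative only) showing the shape being modeled: after vectorization the i16 operand becomes a sign-extend feeding a vector add, so the extend is costed as free and the subtarget's widening base cost is attached to the add instead.

// Editorial sketch: a loop whose vectorized form is a sign-extend feeding a
// vector add -- the pattern isWideningInstruction() recognizes, where the
// extend itself ends up free in the cost model.
#include <cstdint>

void widening_add(int32_t *dst, const int32_t *a, const int16_t *b, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = a[i] + b[i]; // b[i] is sign-extended to 32 bits, then added
}
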
- return 1 * LT.first; + return (Cost + 1) * LT.first; } } @@ -436,7 +548,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, } int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { + Type *CondTy, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // We don't lower some vector selects well that are wider than the register @@ -463,11 +575,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, return Entry->Cost; } } - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, - unsigned Alignment, unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + const Instruction *I) { auto LT = TLI->getTypeLegalizationCost(DL, Ty); if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && @@ -505,12 +618,14 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); - Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); // ldN/stN only support legal vector types of size 64 or 128 in bits. - if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) - return Factor; + // Accesses having vector types that are a multiple of 128 bits can be + // matched to more than one ldN/stN instruction. + if (NumElts % Factor == 0 && + TLI->isLegalInterleavedAccessType(SubVecTy, DL)) + return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, @@ -533,10 +648,62 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { return ST->getMaxInterleaveFactor(); } -void AArch64TTIImpl::getUnrollingPreferences(Loop *L, +// For Falkor, we want to avoid having too many strided loads in a loop since +// that can exhaust the HW prefetcher resources. We adjust the unroller +// MaxCount preference below to attempt to ensure unrolling doesn't create too +// many strided loads. +static void +getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TargetTransformInfo::UnrollingPreferences &UP) { + enum { MaxStridedLoads = 7 }; + auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { + int StridedLoads = 0; + // FIXME? We could make this more precise by looking at the CFG and + // e.g. not counting loads in each side of an if-then-else diamond. + for (const auto BB : L->blocks()) { + for (auto &I : *BB) { + LoadInst *LMemI = dyn_cast<LoadInst>(&I); + if (!LMemI) + continue; + + Value *PtrValue = LMemI->getPointerOperand(); + if (L->isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + // FIXME? We could take pairing of unrolled load copies into account + // by looking at the AddRec, but we would probably have to limit this + // to loops with no stores or other memory optimization barriers. + ++StridedLoads; + // We've seen enough strided loads that seeing more won't make a + // difference. 
+ if (StridedLoads > MaxStridedLoads / 2) + return StridedLoads; + } + } + return StridedLoads; + }; + + int StridedLoads = countStridedLoads(L, SE); + DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads + << " strided loads\n"); + // Pick the largest power of 2 unroll count that won't result in too many + // strided loads. + if (StridedLoads) { + UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); + DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount + << '\n'); + } +} + +void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Enable partial unrolling and runtime unrolling. - BaseT::getUnrollingPreferences(L, UP); + BaseT::getUnrollingPreferences(L, SE, UP); // For inner loop, it is more likely to be a hot one, and the runtime check // can be promoted out from LICM pass, so the overhead is less, let's try @@ -546,6 +713,10 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, // Disable partial & runtime unrolling on -Os. UP.PartialOptSizeThreshold = 0; + + if (ST->getProcFamily() == AArch64Subtarget::Falkor && + EnableFalkorHWPFUnrollFix) + getFalkorUnrollingPreferences(L, SE, UP); } Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, @@ -594,8 +765,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_ld4: Info.ReadMem = true; Info.WriteMem = false; - Info.IsSimple = true; - Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(0); break; case Intrinsic::aarch64_neon_st2: @@ -603,8 +772,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_st4: Info.ReadMem = false; Info.WriteMem = true; - Info.IsSimple = true; - Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); break; } @@ -628,6 +795,38 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, return true; } +/// See if \p I should be considered for address type promotion. We check if \p +/// I is a sext with right type and used in memory accesses. If it used in a +/// "complex" getelementptr, we allow it to be promoted without finding other +/// sext instructions that sign extended the same initial value. A getelementptr +/// is considered as "complex" if it has more than 2 operands. +bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( + const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { + bool Considerable = false; + AllowPromotionWithoutCommonHeader = false; + if (!isa<SExtInst>(&I)) + return false; + Type *ConsideredSExtType = + Type::getInt64Ty(I.getParent()->getParent()->getContext()); + if (I.getType() != ConsideredSExtType) + return false; + // See if the sext is the one with the right type and used in at least one + // GetElementPtrInst. + for (const User *U : I.users()) { + if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { + Considerable = true; + // A getelementptr is considered as "complex" if it has more than 2 + // operands. We will promote a SExt used in such complex GEP as we + // expect some computation to be merged if they are done on 64 bits. 
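
[Editorial aside, not part of the patch.] To make the Falkor unrolling clamp above concrete: countStridedLoads counts loads whose address is loop-variant and affine, and MaxCount is then the largest power of two that keeps the unrolled strided-load count within MaxStridedLoads; for example, with 3 such loads, MaxCount = 1 << Log2_32(7 / 3) = 2. The loop below is a hypothetical illustration (the name strided_sum is not from the patch) containing one strided load.

// Editorial sketch: a single strided load (loop-variant, affine address),
// the kind counted when clamping the unroll factor for Falkor's hardware
// prefetcher.
void strided_sum(float *out, const float *in, int n, int stride) {
  float s = 0.0f;
  for (int i = 0; i < n; ++i)
    s += in[(long)i * stride]; // address advances by a fixed stride per iteration
  *out = s;
}
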
+ if (GEPInst->getNumOperands() > 2) { + AllowPromotionWithoutCommonHeader = true; + break; + } + } + } + return Considerable; +} + unsigned AArch64TTIImpl::getCacheLineSize() { return ST->getCacheLineSize(); } @@ -643,3 +842,28 @@ unsigned AArch64TTIImpl::getMinPrefetchStride() { unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { return ST->getMaxPrefetchIterationsAhead(); } + +bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); + unsigned ScalarBits = Ty->getScalarSizeInBits(); + switch (Opcode) { + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Mul: + return false; + case Instruction::Add: + return ScalarBits * Ty->getVectorNumElements() >= 128; + case Instruction::ICmp: + return (ScalarBits < 64) && + (ScalarBits * Ty->getVectorNumElements() >= 128); + case Instruction::FCmp: + return Flags.NoNaN; + default: + llvm_unreachable("Unhandled reduction opcode"); + } + return false; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 18287ed..31c0373 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -34,10 +34,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { const AArch64Subtarget *ST; const AArch64TargetLowering *TLI; - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); - const AArch64Subtarget *getST() const { return ST; } const AArch64TargetLowering *getTLI() const { return TLI; } @@ -47,11 +43,17 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; + bool isWideningInstruction(Type *Ty, unsigned Opcode, + ArrayRef<const Value *> Args); + public: explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; + /// \name Scalar TTI Implementations /// @{ @@ -79,7 +81,7 @@ public: return 31; } - unsigned getRegisterBitWidth(bool Vector) { + unsigned getRegisterBitWidth(bool Vector) const { if (Vector) { if (ST->hasNEON()) return 128; @@ -88,9 +90,14 @@ public: return 64; } + unsigned getMinVectorRegisterBitWidth() { + return ST->getMinVectorRegisterBitWidth(); + } + unsigned getMaxInterleaveFactor(unsigned VF); - int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index); @@ -107,14 +114,16 @@ public: int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, const Instruction *I = nullptr); int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); - 
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType); @@ -125,6 +134,10 @@ public: ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); + bool + shouldConsiderAddressTypePromotion(const Instruction &I, + bool &AllowPromotionWithoutCommonHeader); + unsigned getCacheLineSize(); unsigned getPrefetchDistance(); @@ -132,6 +145,13 @@ public: unsigned getMinPrefetchStride(); unsigned getMaxPrefetchIterationsAhead(); + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return false; + } + + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; /// @} }; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp index e3b1d7c..f53af23 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp @@ -19,13 +19,27 @@ // is rewritten into // dup v3.4s, v2.s[1] // fmla v0.4s, v1.4s, v3.4s +// //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <map> using namespace llvm; @@ -41,14 +55,15 @@ namespace { struct AArch64VectorByElementOpt : public MachineFunctionPass { static char ID; - AArch64VectorByElementOpt() : MachineFunctionPass(ID) { - initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry()); - } const TargetInstrInfo *TII; MachineRegisterInfo *MRI; TargetSchedModel SchedModel; + AArch64VectorByElementOpt() : MachineFunctionPass(ID) { + initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry()); + } + /// Based only on latency of instructions, determine if it is cost efficient /// to replace the instruction InstDesc by the two instructions InstDescRep1 /// and InstDescRep2. 
@@ -90,8 +105,10 @@ struct AArch64VectorByElementOpt : public MachineFunctionPass { return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; } }; + char AArch64VectorByElementOpt::ID = 0; -} // namespace + +} // end anonymous namespace INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt", AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index b86a283..a79d518 100644 --- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -15,8 +15,8 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -74,6 +74,7 @@ private: SMLoc getLoc() const { return getParser().getTok().getLoc(); } bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); + void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S); AArch64CC::CondCode parseCondCodeString(StringRef Cond); bool parseCondCode(OperandVector &Operands, bool invertCondCode); unsigned matchRegisterNameAlias(StringRef Name, bool isVector); @@ -85,7 +86,7 @@ private: bool parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode); - bool showMatchError(SMLoc Loc, unsigned ErrCode); + bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands); bool parseDirectiveArch(SMLoc L); bool parseDirectiveCPU(SMLoc L); @@ -537,154 +538,15 @@ public: return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; } - bool isImm0_1() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 2); - } - - bool isImm0_7() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 8); - } - - bool isImm1_8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 9); - } - - bool isImm0_15() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 16); - } - - bool isImm1_16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 17); - } - - bool isImm0_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 32); - } - - bool isImm1_31() const { + template <int N, int M> + bool isImmInRange() const { if (!isImm()) return false; const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); if (!MCE) return false; int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 32); - } - - bool isImm1_32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return 
false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 33); - } - - bool isImm0_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 64); - } - - bool isImm1_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 64); - } - - bool isImm1_64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 65); - } - - bool isImm0_127() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 128); - } - - bool isImm0_255() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 256); - } - - bool isImm0_65535() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 65536); - } - - bool isImm32_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 32 && Val < 64); + return (Val >= N && Val <= M); } bool isLogicalImm32() const { @@ -804,19 +666,8 @@ public: return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); } - bool isBranchTarget26() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); - } - - bool isPCRelLabel19() const { + template<int N> + bool isBranchTarget() const { if (!isImm()) return false; const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); @@ -825,19 +676,8 @@ public: int64_t Val = MCE->getValue(); if (Val & 0x3) return false; - return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); - } - - bool isBranchTarget14() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); + assert(N > 0 && "Branch target immediate cannot be 0 bits!"); + return (Val >= -((1<<(N-1)) << 2) && Val <= (((1<<(N-1))-1) << 2)); } bool @@ -2260,27 +2100,9 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { bool isNegative = parseOptionalToken(AsmToken::Minus); const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble(), Tok.getString()); - if (isNegative) - RealVal.changeSign(); - - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); - Parser.Lex(); // Eat the token. - // Check for out of range values. As an exception, we let Zero through, - // as we handle that special case in post-processing before matching in - // order to use the zero register for it. 
- if (Val == -1 && !RealVal.isPosZero()) { - TokError("expected compatible register or floating-point constant"); - return MatchOperand_ParseFail; - } - Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - if (Tok.is(AsmToken::Integer)) { + if (Tok.is(AsmToken::Real) || Tok.is(AsmToken::Integer)) { int64_t Val; - if (!isNegative && Tok.getString().startswith("0x")) { + if (Tok.is(AsmToken::Integer) && !isNegative && Tok.getString().startswith("0x")) { Val = Tok.getIntVal(); if (Val > 255 || Val < 0) { TokError("encoded floating point value out of range"); @@ -2288,10 +2110,24 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { } } else { APFloat RealVal(APFloat::IEEEdouble(), Tok.getString()); + if (isNegative) + RealVal.changeSign(); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + + // Check for out of range values. As an exception we let Zero through, + // but as tokens instead of an FPImm so that it can be matched by the + // appropriate alias if one exists. + if (RealVal.isPosZero()) { + Parser.Lex(); // Eat the token. + Operands.push_back(AArch64Operand::CreateToken("#0", false, S, getContext())); + Operands.push_back(AArch64Operand::CreateToken(".0", false, S, getContext())); + return MatchOperand_Success; + } else if (Val == -1) { + TokError("expected compatible register or floating-point constant"); + return MatchOperand_ParseFail; + } } Parser.Lex(); // Eat the token. Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); @@ -2494,6 +2330,35 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { return MatchOperand_Success; } +static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { + if (FBS[AArch64::HasV8_1aOps]) + Str += "ARMv8.1a"; + else if (FBS[AArch64::HasV8_2aOps]) + Str += "ARMv8.2a"; + else + Str += "(unknown)"; +} + +void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands, + SMLoc S) { + const uint16_t Op2 = Encoding & 7; + const uint16_t Cm = (Encoding & 0x78) >> 3; + const uint16_t Cn = (Encoding & 0x780) >> 7; + const uint16_t Op1 = (Encoding & 0x3800) >> 11; + + const MCExpr *Expr = MCConstantExpr::create(Op1, getContext()); + + Operands.push_back( + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); + Operands.push_back( + AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); + Operands.push_back( + AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); + Expr = MCConstantExpr::create(Op2, getContext()); + Operands.push_back( + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); +} + /// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for /// the SYS instruction. Parse them specially so that we create a SYS MCInst. 
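[Editor's note] The new createSysAlias() above unpacks a single 14-bit encoding into op1/Cn/Cm/op2 fields (op2 in bits [2:0], Cm in [6:3], Cn in [10:7], op1 in [13:11]), replacing the old SYS_ALIAS macro. The short sketch below round-trips that layout using the same masks; the packing helper is an assumption added here for illustration, while the "IC IVAU = SYS #3, C7, C5, #1" mapping comes from the removed parser code shown further down.

// Hedged sketch: pack and unpack the SYS-alias encoding used by
// createSysAlias() above.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint16_t packSysAlias(unsigned Op1, unsigned Cn, unsigned Cm,
                             unsigned Op2) {
  return static_cast<uint16_t>((Op1 << 11) | (Cn << 7) | (Cm << 3) | Op2);
}

int main() {
  // IC IVAU was previously spelled out as SYS #3, C7, C5, #1.
  uint16_t Enc = packSysAlias(3, 7, 5, 1);
  unsigned Op2 = Enc & 7;            // bits [2:0]
  unsigned Cm  = (Enc & 0x78) >> 3;  // bits [6:3]
  unsigned Cn  = (Enc & 0x780) >> 7; // bits [10:7]
  unsigned Op1 = (Enc & 0x3800) >> 11; // bits [13:11]
  assert(Op1 == 3 && Cn == 7 && Cm == 5 && Op2 == 1);
  std::printf("SYS #%u, C%u, C%u, #%u\n", Op1, Cn, Cm, Op2);
  return 0;
}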
bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, @@ -2510,228 +2375,48 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, StringRef Op = Tok.getString(); SMLoc S = Tok.getLoc(); - const MCExpr *Expr = nullptr; - -#define SYS_ALIAS(op1, Cn, Cm, op2) \ - do { \ - Expr = MCConstantExpr::create(op1, getContext()); \ - Operands.push_back( \ - AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - Operands.push_back( \ - AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ - Operands.push_back( \ - AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ - Expr = MCConstantExpr::create(op2, getContext()); \ - Operands.push_back( \ - AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - } while (false) - if (Mnemonic == "ic") { - if (!Op.compare_lower("ialluis")) { - // SYS #0, C7, C1, #0 - SYS_ALIAS(0, 7, 1, 0); - } else if (!Op.compare_lower("iallu")) { - // SYS #0, C7, C5, #0 - SYS_ALIAS(0, 7, 5, 0); - } else if (!Op.compare_lower("ivau")) { - // SYS #3, C7, C5, #1 - SYS_ALIAS(3, 7, 5, 1); - } else { + const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op); + if (!IC) return TokError("invalid operand for IC instruction"); + else if (!IC->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("IC " + std::string(IC->Name) + " requires "); + setRequiredFeatureString(IC->getRequiredFeatures(), Str); + return TokError(Str.c_str()); } + createSysAlias(IC->Encoding, Operands, S); } else if (Mnemonic == "dc") { - if (!Op.compare_lower("zva")) { - // SYS #3, C7, C4, #1 - SYS_ALIAS(3, 7, 4, 1); - } else if (!Op.compare_lower("ivac")) { - // SYS #3, C7, C6, #1 - SYS_ALIAS(0, 7, 6, 1); - } else if (!Op.compare_lower("isw")) { - // SYS #0, C7, C6, #2 - SYS_ALIAS(0, 7, 6, 2); - } else if (!Op.compare_lower("cvac")) { - // SYS #3, C7, C10, #1 - SYS_ALIAS(3, 7, 10, 1); - } else if (!Op.compare_lower("csw")) { - // SYS #0, C7, C10, #2 - SYS_ALIAS(0, 7, 10, 2); - } else if (!Op.compare_lower("cvau")) { - // SYS #3, C7, C11, #1 - SYS_ALIAS(3, 7, 11, 1); - } else if (!Op.compare_lower("civac")) { - // SYS #3, C7, C14, #1 - SYS_ALIAS(3, 7, 14, 1); - } else if (!Op.compare_lower("cisw")) { - // SYS #0, C7, C14, #2 - SYS_ALIAS(0, 7, 14, 2); - } else if (!Op.compare_lower("cvap")) { - if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { - // SYS #3, C7, C12, #1 - SYS_ALIAS(3, 7, 12, 1); - } else { - return TokError("DC CVAP requires ARMv8.2a"); - } - } else { + const AArch64DC::DC *DC = AArch64DC::lookupDCByName(Op); + if (!DC) return TokError("invalid operand for DC instruction"); + else if (!DC->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("DC " + std::string(DC->Name) + " requires "); + setRequiredFeatureString(DC->getRequiredFeatures(), Str); + return TokError(Str.c_str()); } + createSysAlias(DC->Encoding, Operands, S); } else if (Mnemonic == "at") { - if (!Op.compare_lower("s1e1r")) { - // SYS #0, C7, C8, #0 - SYS_ALIAS(0, 7, 8, 0); - } else if (!Op.compare_lower("s1e2r")) { - // SYS #4, C7, C8, #0 - SYS_ALIAS(4, 7, 8, 0); - } else if (!Op.compare_lower("s1e3r")) { - // SYS #6, C7, C8, #0 - SYS_ALIAS(6, 7, 8, 0); - } else if (!Op.compare_lower("s1e1w")) { - // SYS #0, C7, C8, #1 - SYS_ALIAS(0, 7, 8, 1); - } else if (!Op.compare_lower("s1e2w")) { - // SYS #4, C7, C8, #1 - SYS_ALIAS(4, 7, 8, 1); - } else if (!Op.compare_lower("s1e3w")) { - // SYS #6, C7, C8, #1 - SYS_ALIAS(6, 7, 8, 1); - } else if (!Op.compare_lower("s1e0r")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 2); - } else if 
(!Op.compare_lower("s1e0w")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 3); - } else if (!Op.compare_lower("s12e1r")) { - // SYS #4, C7, C8, #4 - SYS_ALIAS(4, 7, 8, 4); - } else if (!Op.compare_lower("s12e1w")) { - // SYS #4, C7, C8, #5 - SYS_ALIAS(4, 7, 8, 5); - } else if (!Op.compare_lower("s12e0r")) { - // SYS #4, C7, C8, #6 - SYS_ALIAS(4, 7, 8, 6); - } else if (!Op.compare_lower("s12e0w")) { - // SYS #4, C7, C8, #7 - SYS_ALIAS(4, 7, 8, 7); - } else if (!Op.compare_lower("s1e1rp")) { - if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { - // SYS #0, C7, C9, #0 - SYS_ALIAS(0, 7, 9, 0); - } else { - return TokError("AT S1E1RP requires ARMv8.2a"); - } - } else if (!Op.compare_lower("s1e1wp")) { - if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { - // SYS #0, C7, C9, #1 - SYS_ALIAS(0, 7, 9, 1); - } else { - return TokError("AT S1E1WP requires ARMv8.2a"); - } - } else { + const AArch64AT::AT *AT = AArch64AT::lookupATByName(Op); + if (!AT) return TokError("invalid operand for AT instruction"); + else if (!AT->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("AT " + std::string(AT->Name) + " requires "); + setRequiredFeatureString(AT->getRequiredFeatures(), Str); + return TokError(Str.c_str()); } + createSysAlias(AT->Encoding, Operands, S); } else if (Mnemonic == "tlbi") { - if (!Op.compare_lower("vmalle1is")) { - // SYS #0, C8, C3, #0 - SYS_ALIAS(0, 8, 3, 0); - } else if (!Op.compare_lower("alle2is")) { - // SYS #4, C8, C3, #0 - SYS_ALIAS(4, 8, 3, 0); - } else if (!Op.compare_lower("alle3is")) { - // SYS #6, C8, C3, #0 - SYS_ALIAS(6, 8, 3, 0); - } else if (!Op.compare_lower("vae1is")) { - // SYS #0, C8, C3, #1 - SYS_ALIAS(0, 8, 3, 1); - } else if (!Op.compare_lower("vae2is")) { - // SYS #4, C8, C3, #1 - SYS_ALIAS(4, 8, 3, 1); - } else if (!Op.compare_lower("vae3is")) { - // SYS #6, C8, C3, #1 - SYS_ALIAS(6, 8, 3, 1); - } else if (!Op.compare_lower("aside1is")) { - // SYS #0, C8, C3, #2 - SYS_ALIAS(0, 8, 3, 2); - } else if (!Op.compare_lower("vaae1is")) { - // SYS #0, C8, C3, #3 - SYS_ALIAS(0, 8, 3, 3); - } else if (!Op.compare_lower("alle1is")) { - // SYS #4, C8, C3, #4 - SYS_ALIAS(4, 8, 3, 4); - } else if (!Op.compare_lower("vale1is")) { - // SYS #0, C8, C3, #5 - SYS_ALIAS(0, 8, 3, 5); - } else if (!Op.compare_lower("vaale1is")) { - // SYS #0, C8, C3, #7 - SYS_ALIAS(0, 8, 3, 7); - } else if (!Op.compare_lower("vmalle1")) { - // SYS #0, C8, C7, #0 - SYS_ALIAS(0, 8, 7, 0); - } else if (!Op.compare_lower("alle2")) { - // SYS #4, C8, C7, #0 - SYS_ALIAS(4, 8, 7, 0); - } else if (!Op.compare_lower("vale2is")) { - // SYS #4, C8, C3, #5 - SYS_ALIAS(4, 8, 3, 5); - } else if (!Op.compare_lower("vale3is")) { - // SYS #6, C8, C3, #5 - SYS_ALIAS(6, 8, 3, 5); - } else if (!Op.compare_lower("alle3")) { - // SYS #6, C8, C7, #0 - SYS_ALIAS(6, 8, 7, 0); - } else if (!Op.compare_lower("vae1")) { - // SYS #0, C8, C7, #1 - SYS_ALIAS(0, 8, 7, 1); - } else if (!Op.compare_lower("vae2")) { - // SYS #4, C8, C7, #1 - SYS_ALIAS(4, 8, 7, 1); - } else if (!Op.compare_lower("vae3")) { - // SYS #6, C8, C7, #1 - SYS_ALIAS(6, 8, 7, 1); - } else if (!Op.compare_lower("aside1")) { - // SYS #0, C8, C7, #2 - SYS_ALIAS(0, 8, 7, 2); - } else if (!Op.compare_lower("vaae1")) { - // SYS #0, C8, C7, #3 - SYS_ALIAS(0, 8, 7, 3); - } else if (!Op.compare_lower("alle1")) { - // SYS #4, C8, C7, #4 - SYS_ALIAS(4, 8, 7, 4); - } else if (!Op.compare_lower("vale1")) { - // SYS #0, C8, C7, #5 - SYS_ALIAS(0, 8, 7, 5); - } else if (!Op.compare_lower("vale2")) { - // SYS #4, C8, C7, #5 - SYS_ALIAS(4, 8, 7, 5); - } else if 
(!Op.compare_lower("vale3")) { - // SYS #6, C8, C7, #5 - SYS_ALIAS(6, 8, 7, 5); - } else if (!Op.compare_lower("vaale1")) { - // SYS #0, C8, C7, #7 - SYS_ALIAS(0, 8, 7, 7); - } else if (!Op.compare_lower("ipas2e1")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 4, 1); - } else if (!Op.compare_lower("ipas2le1")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 4, 5); - } else if (!Op.compare_lower("ipas2e1is")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 0, 1); - } else if (!Op.compare_lower("ipas2le1is")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 0, 5); - } else if (!Op.compare_lower("vmalls12e1")) { - // SYS #4, C8, C7, #6 - SYS_ALIAS(4, 8, 7, 6); - } else if (!Op.compare_lower("vmalls12e1is")) { - // SYS #4, C8, C3, #6 - SYS_ALIAS(4, 8, 3, 6); - } else { + const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByName(Op); + if (!TLBI) return TokError("invalid operand for TLBI instruction"); + else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) { + std::string Str("TLBI " + std::string(TLBI->Name) + " requires "); + setRequiredFeatureString(TLBI->getRequiredFeatures(), Str); + return TokError(Str.c_str()); } + createSysAlias(TLBI->Encoding, Operands, S); } -#undef SYS_ALIAS - Parser.Lex(); // Eat operand. bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); @@ -2744,12 +2429,10 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, HasRegister = true; } - if (ExpectRegister && !HasRegister) { + if (ExpectRegister && !HasRegister) return TokError("specified " + Mnemonic + " op requires a register"); - } - else if (!ExpectRegister && HasRegister) { + else if (!ExpectRegister && HasRegister) return TokError("specified " + Mnemonic + " op does not use a register"); - } if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list")) return true; @@ -2790,16 +2473,14 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - auto DB = AArch64DB::lookupDBByName(Tok.getString()); - if (!DB) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) { + auto DB = AArch64DB::lookupDBByName(Tok.getString()); + if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) { TokError("'sy' or #imm operand expected"); return MatchOperand_ParseFail; + } else if (!DB) { + TokError("invalid barrier option name"); + return MatchOperand_ParseFail; } Operands.push_back(AArch64Operand::CreateBarrier( @@ -2884,7 +2565,6 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { /// parseRegister - Parse a non-vector register operand. bool AArch64AsmParser::parseRegister(OperandVector &Operands) { - MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); // Try for a vector register. if (!tryParseVectorRegister(Operands)) @@ -2897,30 +2577,6 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) { Operands.push_back( AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); - // A small number of instructions (FMOVXDhighr, for example) have "[1]" - // as a string token in the instruction itself. 
- SMLoc LBracS = getLoc(); - const AsmToken &Tok = Parser.getTok(); - if (parseOptionalToken(AsmToken::LBrac)) { - if (Tok.is(AsmToken::Integer)) { - SMLoc IntS = getLoc(); - int64_t Val = Tok.getIntVal(); - if (Val == 1) { - Parser.Lex(); - SMLoc RBracS = getLoc(); - if (parseOptionalToken(AsmToken::RBrac)) { - Operands.push_back( - AArch64Operand::CreateToken("[", false, LBracS, getContext())); - Operands.push_back( - AArch64Operand::CreateToken("1", false, IntS, getContext())); - Operands.push_back( - AArch64Operand::CreateToken("]", false, RBracS, getContext())); - return false; - } - } - } - } - return false; } @@ -3601,7 +3257,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, } } -bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { +std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS); + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, + OperandVector &Operands) { switch (ErrCode) { case Match_MissingFeature: return Error(Loc, @@ -3696,6 +3355,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return Error(Loc, "immediate must be an integer in range [0, 63]."); case Match_InvalidImm0_127: return Error(Loc, "immediate must be an integer in range [0, 127]."); + case Match_InvalidImm0_255: + return Error(Loc, "immediate must be an integer in range [0, 255]."); case Match_InvalidImm0_65535: return Error(Loc, "immediate must be an integer in range [0, 65535]."); case Match_InvalidImm1_8: @@ -3722,8 +3383,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return Error(Loc, "expected readable system register"); case Match_MSR: return Error(Loc, "expected writable system register or pstate"); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction mnemonic"); + case Match_MnemonicFail: { + std::string Suggestion = AArch64MnemonicSpellCheck( + ((AArch64Operand &)*Operands[0]).getToken(), + ComputeAvailableFeatures(STI->getFeatureBits())); + return Error(Loc, "unrecognized instruction mnemonic" + Suggestion); + } default: llvm_unreachable("unexpected error code!"); } @@ -3991,21 +3656,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } } - // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. - if (NumOperands == 3 && Tok == "fmov") { - AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]); - AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]); - if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { - unsigned zreg = - !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains( - RegOp.getReg()) - ? AArch64::WZR - : AArch64::XZR; - Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(), - Op.getEndLoc(), getContext()); - } - } - MCInst Inst; // First try to match against the secondary set of tables containing the // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). 
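[Editor's note] The showMatchError() change above appends the result of AArch64MnemonicSpellCheck() to the "unrecognized instruction mnemonic" diagnostic. The sketch below is a stand-in for that generated function: the candidate mnemonic list and the edit-distance heuristic are assumptions for illustration only, not the TableGen-emitted implementation; only the "string suffix appended to the error" shape is taken from the diff.

// Hedged sketch: suggest a near-miss mnemonic for an unrecognized one.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// Classic Levenshtein distance with a rolling row.
static unsigned editDistance(const std::string &A, const std::string &B) {
  std::vector<unsigned> Row(B.size() + 1);
  for (unsigned j = 0; j <= B.size(); ++j) Row[j] = j;
  for (unsigned i = 1; i <= A.size(); ++i) {
    unsigned Prev = Row[0];
    Row[0] = i;
    for (unsigned j = 1; j <= B.size(); ++j) {
      unsigned Cur = Row[j];
      Row[j] = std::min({Row[j] + 1, Row[j - 1] + 1,
                         Prev + (A[i - 1] != B[j - 1] ? 1u : 0u)});
      Prev = Cur;
    }
  }
  return Row[B.size()];
}

static std::string suggestMnemonic(const std::string &Bad) {
  // Hypothetical subset of valid mnemonics.
  const char *Known[] = {"add", "adds", "and", "fmov", "ldr", "str", "tlbi"};
  std::string Best;
  unsigned BestDist = 3; // only suggest reasonably close matches
  for (const char *K : Known) {
    unsigned D = editDistance(Bad, K);
    if (D < BestDist) { BestDist = D; Best = K; }
  }
  return Best.empty() ? "" : ", did you mean: " + Best + "?";
}

int main() {
  std::printf("unrecognized instruction mnemonic%s\n",
              suggestMnemonic("fmvo").c_str());
  return 0;
}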
@@ -4064,7 +3714,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, Msg); } case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); + return showMatchError(IDLoc, MatchResult, Operands); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; @@ -4083,7 +3733,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix()) MatchResult = Match_InvalidSuffix; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } case Match_InvalidMemoryIndexed1: case Match_InvalidMemoryIndexed2: @@ -4120,6 +3770,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidImm0_31: case Match_InvalidImm0_63: case Match_InvalidImm0_127: + case Match_InvalidImm0_255: case Match_InvalidImm0_65535: case Match_InvalidImm1_8: case Match_InvalidImm1_16: @@ -4140,7 +3791,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } } @@ -4260,10 +3911,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { return false; } +static SMLoc incrementLoc(SMLoc L, int Offset) { + return SMLoc::getFromPointer(L.getPointer() + Offset); +} + /// parseDirectiveCPU /// ::= .cpu id bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { - SMLoc CPULoc = getLoc(); + SMLoc CurLoc = getLoc(); StringRef CPU, ExtensionString; std::tie(CPU, ExtensionString) = @@ -4279,15 +3934,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { // FIXME This is using tablegen data, but should be moved to ARMTargetParser // once that is tablegen'ed if (!getSTI().isCPUStringValid(CPU)) { - Error(CPULoc, "unknown CPU name"); + Error(CurLoc, "unknown CPU name"); return false; } MCSubtargetInfo &STI = copySTI(); STI.setDefaultFeatures(CPU, ""); + CurLoc = incrementLoc(CurLoc, CPU.size()); FeatureBitset Features = STI.getFeatureBits(); for (auto Name : RequestedExtensions) { + // Advance source location past '+'. 
+ CurLoc = incrementLoc(CurLoc, 1); + bool EnableFeature = true; if (Name.startswith_lower("no")) { @@ -4295,6 +3954,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { Name = Name.substr(2); } + bool FoundExtension = false; for (const auto &Extension : ExtensionMap) { if (Extension.Name != Name) continue; @@ -4308,9 +3968,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { uint64_t Features = ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); setAvailableFeatures(Features); + FoundExtension = true; break; } + + if (!FoundExtension) + Error(CurLoc, "unsupported architectural extension"); + + CurLoc = incrementLoc(CurLoc, Name.size()); } return false; } diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 0d860a7..7870dce 100644 --- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -756,7 +756,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, // if shift == '11' then ReservedValue() if (shiftHi == 0x3) return Fail; - // Deliberate fallthrough + LLVM_FALLTHROUGH; case AArch64::ANDWrs: case AArch64::ANDSWrs: case AArch64::BICWrs: @@ -780,7 +780,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, // if shift == '11' then ReservedValue() if (shiftHi == 0x3) return Fail; - // Deliberate fallthrough + LLVM_FALLTHROUGH; case AArch64::ANDXrs: case AArch64::ANDSXrs: case AArch64::BICXrs: diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index b4f8520..fc89657 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -16,12 +16,21 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> +#include <string> + using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -267,6 +276,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, } } + if (Opcode == AArch64::CompilerBarrier) { + O << '\t' << MAI.getCommentString() << " COMPILER BARRIER"; + printAnnotation(O, Annot); + return; + } + if (!printAliasInstr(MI, STI, O)) printInstruction(MI, STI, O); @@ -451,8 +466,8 @@ static const LdStNInstrDesc LdStNInstInfo[] = { { AArch64::LD3i64, "ld3", ".d", 1, true, 0 }, { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, - { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, - { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, + { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, + { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, @@ -731,7 +746,6 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); 
#endif - const char *Asm = nullptr; const MCOperand &Op1 = MI->getOperand(0); const MCOperand &Cn = MI->getOperand(1); const MCOperand &Cm = MI->getOperand(2); @@ -742,230 +756,74 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, unsigned CmVal = Cm.getImm(); unsigned Op2Val = Op2.getImm(); + uint16_t Encoding = Op2Val; + Encoding |= CmVal << 3; + Encoding |= CnVal << 7; + Encoding |= Op1Val << 11; + + bool NeedsReg; + std::string Ins; + std::string Name; + if (CnVal == 7) { switch (CmVal) { - default: - break; - + default: return false; // IC aliases - case 1: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tialluis"; - break; - case 5: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tiallu"; - else if (Op1Val == 3 && Op2Val == 1) - Asm = "ic\tivau"; - break; - + case 1: case 5: { + const AArch64IC::IC *IC = AArch64IC::lookupICByEncoding(Encoding); + if (!IC || !IC->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = IC->NeedsReg; + Ins = "ic\t"; + Name = std::string(IC->Name); + } + break; // DC aliases - case 4: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tzva"; - break; - case 6: - if (Op1Val == 0 && Op2Val == 1) - Asm = "dc\tivac"; - if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tisw"; - break; - case 10: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcsw"; - break; - case 11: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvau"; - break; - case 12: - if (Op1Val == 3 && Op2Val == 1 && - (STI.getFeatureBits()[AArch64::HasV8_2aOps])) - Asm = "dc\tcvap"; - break; - case 14: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcivac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcisw"; - break; - + case 4: case 6: case 10: case 11: case 12: case 14: + { + const AArch64DC::DC *DC = AArch64DC::lookupDCByEncoding(Encoding); + if (!DC || !DC->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = true; + Ins = "dc\t"; + Name = std::string(DC->Name); + } + break; // AT aliases - case 8: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e1r"; break; - case 1: Asm = "at\ts1e1w"; break; - case 2: Asm = "at\ts1e0r"; break; - case 3: Asm = "at\ts1e0w"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e2r"; break; - case 1: Asm = "at\ts1e2w"; break; - case 4: Asm = "at\ts12e1r"; break; - case 5: Asm = "at\ts12e1w"; break; - case 6: Asm = "at\ts12e0r"; break; - case 7: Asm = "at\ts12e0w"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e3r"; break; - case 1: Asm = "at\ts1e3w"; break; - } - break; - } - break; - case 9: - switch (Op1Val) { - default: - break; - case 0: - if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) { - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e1rp"; break; - case 1: Asm = "at\ts1e1wp"; break; - } - } - break; - } + case 8: case 9: { + const AArch64AT::AT *AT = AArch64AT::lookupATByEncoding(Encoding); + if (!AT || !AT->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = true; + Ins = "at\t"; + Name = std::string(AT->Name); + } + break; } } else if (CnVal == 8) { // TLBI aliases - switch (CmVal) { - default: - break; - case 3: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1is"; break; - case 1: Asm = "tlbi\tvae1is"; break; - case 2: Asm = "tlbi\taside1is"; break; - case 3: Asm = "tlbi\tvaae1is"; break; - case 5: Asm = "tlbi\tvale1is"; 
break; - case 7: Asm = "tlbi\tvaale1is"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2is"; break; - case 1: Asm = "tlbi\tvae2is"; break; - case 4: Asm = "tlbi\talle1is"; break; - case 5: Asm = "tlbi\tvale2is"; break; - case 6: Asm = "tlbi\tvmalls12e1is"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3is"; break; - case 1: Asm = "tlbi\tvae3is"; break; - case 5: Asm = "tlbi\tvale3is"; break; - } - break; - } - break; - case 0: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1is"; break; - case 5: Asm = "tlbi\tipas2le1is"; break; - } - break; - } - break; - case 4: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1"; break; - case 5: Asm = "tlbi\tipas2le1"; break; - } - break; - } - break; - case 7: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1"; break; - case 1: Asm = "tlbi\tvae1"; break; - case 2: Asm = "tlbi\taside1"; break; - case 3: Asm = "tlbi\tvaae1"; break; - case 5: Asm = "tlbi\tvale1"; break; - case 7: Asm = "tlbi\tvaale1"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2"; break; - case 1: Asm = "tlbi\tvae2"; break; - case 4: Asm = "tlbi\talle1"; break; - case 5: Asm = "tlbi\tvale2"; break; - case 6: Asm = "tlbi\tvmalls12e1"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3"; break; - case 1: Asm = "tlbi\tvae3"; break; - case 5: Asm = "tlbi\tvale3"; break; - } - break; - } - break; - } + const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding); + if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits())) + return false; + + NeedsReg = TLBI->NeedsReg; + Ins = "tlbi\t"; + Name = std::string(TLBI->Name); } + else + return false; - if (Asm) { - unsigned Reg = MI->getOperand(4).getReg(); + std::string Str = Ins + Name; + std::transform(Str.begin(), Str.end(), Str.begin(), ::tolower); - O << '\t' << Asm; - if (StringRef(Asm).lower().find("all") == StringRef::npos) - O << ", " << getRegisterName(Reg); - } + O << '\t' << Str; + if (NeedsReg) + O << ", " << getRegisterName(MI->getOperand(4).getReg()); - return Asm != nullptr; + return true; } void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 65dca99..a45258c 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -15,6 +15,7 @@ #define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H #include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" namespace llvm { @@ -37,9 +38,11 @@ public: unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O); + virtual StringRef getRegName(unsigned RegNo) const { return getRegisterName(RegNo); } + static const char *getRegisterName(unsigned RegNo, unsigned AltIdx = AArch64::NoRegAltName); @@ -177,12 +180,15 @@ public: unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O) override; + StringRef getRegName(unsigned RegNo) const override { return getRegisterName(RegNo); } + static const char *getRegisterName(unsigned RegNo, unsigned 
AltIdx = AArch64::NoRegAltName); }; -} -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 14c0327..2bd0cbf 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -11,8 +11,9 @@ #include "AArch64RegisterInfo.h" #include "MCTargetDesc/AArch64FixupKinds.h" #include "llvm/ADT/Triple.h" -#include "llvm/MC/MCAssembler.h" +#include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -22,7 +23,6 @@ #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" using namespace llvm; namespace { @@ -43,26 +43,25 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // AArch64FixupKinds.h. - // - // Name Offset (bits) Size (bits) Flags - { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_aarch64_add_imm12", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_movw", 5, 16, 0 }, - { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } - }; + // This table *must* be in the order that the fixup_* kinds are defined + // in AArch64FixupKinds.h. 
+ // + // Name Offset (bits) Size (bits) Flags + {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal}, + {"fixup_aarch64_add_imm12", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0}, + {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0}, + {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_movw", 5, 16, 0}, + {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}, + {"fixup_aarch64_tlsdesc_call", 0, 0, 0}}; if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -72,8 +71,9 @@ public: return Infos[Kind - FirstTargetFixupKind]; } - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef<char> Data, + uint64_t Value, bool IsResolved) const override; bool mayNeedRelaxation(const MCInst &Inst) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, @@ -104,8 +104,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_1: return 1; - case FK_Data_2: case AArch64::fixup_aarch64_movw: + case FK_Data_2: + case FK_SecRel_2: return 2; case AArch64::fixup_aarch64_pcrel_branch14: @@ -124,6 +125,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: case FK_Data_4: + case FK_SecRel_4: return 4; case FK_Data_8: @@ -138,15 +140,15 @@ static unsigned AdrImmBits(unsigned Value) { } static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, - MCContext *Ctx) { + MCContext &Ctx) { unsigned Kind = Fixup.getKind(); int64_t SignedValue = static_cast<int64_t>(Value); switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: - if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152)) - Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (SignedValue > 2097151 || SignedValue < -2097152) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); return AdrImmBits(Value & 0x1fffffULL); case AArch64::fixup_aarch64_pcrel_adrp_imm21: return AdrImmBits((Value & 0x1fffff000ULL) >> 12); @@ -154,71 +156,72 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case AArch64::fixup_aarch64_pcrel_branch19: // Signed 21-bit immediate if (SignedValue > 2097151 || SignedValue < -2097152) - if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); - if (Ctx && (Value & 0x3)) - Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + if (Value & 0x3) + Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); // Low two bits are not encoded. 
return (Value >> 2) & 0x7ffff; case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: // Unsigned 12-bit immediate - if (Ctx && Value >= 0x1000) - Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Value >= 0x1000) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); return Value; case AArch64::fixup_aarch64_ldst_imm12_scale2: // Unsigned 12-bit immediate which gets multiplied by 2 - if (Ctx && (Value >= 0x2000)) - Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); - if (Ctx && (Value & 0x1)) - Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned"); + if (Value >= 0x2000) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + if (Value & 0x1) + Ctx.reportError(Fixup.getLoc(), "fixup must be 2-byte aligned"); return Value >> 1; case AArch64::fixup_aarch64_ldst_imm12_scale4: // Unsigned 12-bit immediate which gets multiplied by 4 - if (Ctx && (Value >= 0x4000)) - Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); - if (Ctx && (Value & 0x3)) - Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned"); + if (Value >= 0x4000) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + if (Value & 0x3) + Ctx.reportError(Fixup.getLoc(), "fixup must be 4-byte aligned"); return Value >> 2; case AArch64::fixup_aarch64_ldst_imm12_scale8: // Unsigned 12-bit immediate which gets multiplied by 8 - if (Ctx && (Value >= 0x8000)) - Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); - if (Ctx && (Value & 0x7)) - Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned"); + if (Value >= 0x8000) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + if (Value & 0x7) + Ctx.reportError(Fixup.getLoc(), "fixup must be 8-byte aligned"); return Value >> 3; case AArch64::fixup_aarch64_ldst_imm12_scale16: // Unsigned 12-bit immediate which gets multiplied by 16 - if (Ctx && (Value >= 0x10000)) - Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); - if (Ctx && (Value & 0xf)) - Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned"); + if (Value >= 0x10000) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + if (Value & 0xf) + Ctx.reportError(Fixup.getLoc(), "fixup must be 16-byte aligned"); return Value >> 4; case AArch64::fixup_aarch64_movw: - if (Ctx) - Ctx->reportError(Fixup.getLoc(), - "no resolvable MOVZ/MOVK fixups supported yet"); + Ctx.reportError(Fixup.getLoc(), + "no resolvable MOVZ/MOVK fixups supported yet"); return Value; case AArch64::fixup_aarch64_pcrel_branch14: // Signed 16-bit immediate - if (Ctx && (SignedValue > 32767 || SignedValue < -32768)) - Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (SignedValue > 32767 || SignedValue < -32768) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); // Low two bits are not encoded (4-byte alignment assumed). - if (Ctx && (Value & 0x3)) - Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); + if (Value & 0x3) + Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3fff; case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: // Signed 28-bit immediate - if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728)) - Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (SignedValue > 134217727 || SignedValue < -134217728) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); // Low two bits are not encoded (4-byte alignment assumed). 
- if (Ctx && (Value & 0x3)) - Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); + if (Value & 0x3) + Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3ffffff; case FK_Data_1: case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_SecRel_2: + case FK_SecRel_4: return Value; } } @@ -262,21 +265,23 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con } } -void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { +void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, uint64_t Value, + bool IsResolved) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); if (!Value) return; // Doesn't change encoding. MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + MCContext &Ctx = Asm.getContext(); // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup, Value, nullptr); + Value = adjustFixupValue(Fixup, Value, Ctx); // Shift the value into position. Value <<= Info.TargetOffset; unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!"); // Used to point to big endian bytes. unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); @@ -290,7 +295,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } else { // Handle as big-endian - assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!"); + assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!"); assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); for (unsigned i = 0; i != NumBytes; ++i) { unsigned Idx = FulleSizeInBytes - 1 - i; @@ -521,17 +526,6 @@ public: return CompactUnwindEncoding; } - - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override { - // Try to get the encoded value for the fixup as-if we're mapping it into - // the instruction. This allows adjustFixupValue() to issue a diagnostic - // if the value is invalid. - if (IsResolved) - (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); - } }; } // end anonymous namespace @@ -551,16 +545,13 @@ public: return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32); } - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; }; -void ELFAArch64AsmBackend::processFixupValue( - const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, - const MCFragment *DF, const MCValue &Target, uint64_t &Value, - bool &IsResolved) { +bool ELFAArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { // The ADRP instruction adds some multiple of 0x1000 to the current PC & // ~0xfff. This means that the required offset to reach a symbol can vary by // up to one step depending on where the ADRP is in memory. 
For example: @@ -574,15 +565,22 @@ void ELFAArch64AsmBackend::processFixupValue( // section isn't 0x1000-aligned, we therefore need to delegate this decision // to the linker -- a relocation! if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) - IsResolved = false; + return true; + return false; +} - // Try to get the encoded value for the fixup as-if we're mapping it into - // the instruction. This allows adjustFixupValue() to issue a diagnostic - // if the value is invalid. - if (IsResolved) - (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); } +namespace { +class COFFAArch64AsmBackend : public AArch64AsmBackend { +public: + COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple) + : AArch64AsmBackend(T, /*IsLittleEndian*/true) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAArch64WinCOFFObjectWriter(OS); + } +}; } MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, @@ -593,7 +591,11 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, if (TheTriple.isOSBinFormatMachO()) return new DarwinAArch64AsmBackend(T, MRI); - assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); + if (TheTriple.isOSBinFormatCOFF()) + return new COFFAArch64AsmBackend(T, TheTriple); + + assert(TheTriple.isOSBinFormatELF() && "Invalid target"); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); bool IsILP32 = Options.getABIName() == "ilp32"; return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index c954c0e..89c3e5b 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -15,11 +15,11 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> #include <cstdint> @@ -49,10 +49,11 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, /*HasRelocationAddend*/ true), IsILP32(IsILP32) {} -#define R_CLS(rtype) \ - IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype -#define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\ - "supported (LP64 eqv: " #lp64rtype ")" +#define R_CLS(rtype) \ + IsILP32 ? 
ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype +#define BAD_ILP32_MOV(lp64rtype) \ + "ILP32 absolute MOV relocation not " \ + "supported (LP64 eqv: " #lp64rtype ")" // assumes IsILP32 is true static bool isNonILP32reloc(const MCFixup &Fixup, @@ -60,44 +61,45 @@ static bool isNonILP32reloc(const MCFixup &Fixup, MCContext &Ctx) { if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw) return false; - switch(RefKind) { - case AArch64MCExpr::VK_ABS_G3: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3)); - return true; - case AArch64MCExpr::VK_ABS_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2)); - return true; - case AArch64MCExpr::VK_ABS_G2_S: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_ABS_G2_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_ABS_G1_S: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_ABS_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_DTPREL_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_DTPREL_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_TPREL_G2: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_TPREL_G1_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_GOTTPREL_G1: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1)); - return ELF::R_AARCH64_NONE; - case AArch64MCExpr::VK_GOTTPREL_G0_NC: - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC)); - return ELF::R_AARCH64_NONE; - default: return false; + switch (RefKind) { + case AArch64MCExpr::VK_ABS_G3: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3)); + return true; + case AArch64MCExpr::VK_ABS_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2)); + return true; + case AArch64MCExpr::VK_ABS_G2_S: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2)); + return true; + case AArch64MCExpr::VK_ABS_G2_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC)); + return true; + case AArch64MCExpr::VK_ABS_G1_S: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1)); + return true; + case AArch64MCExpr::VK_ABS_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC)); + return true; + case AArch64MCExpr::VK_DTPREL_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2)); + return true; + case AArch64MCExpr::VK_DTPREL_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC)); + return true; + case AArch64MCExpr::VK_TPREL_G2: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2)); + return true; + case AArch64MCExpr::VK_TPREL_G1_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC)); + return true; + case AArch64MCExpr::VK_GOTTPREL_G1: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1)); + return true; + case AArch64MCExpr::VK_GOTTPREL_G0_NC: + Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC)); + return true; + default: + return false; } 
return false; } @@ -130,7 +132,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(PREL32); case FK_Data_8: if (IsILP32) { - Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte PC relative data " + Ctx.reportError(Fixup.getLoc(), + "ILP32 8 byte PC relative data " "relocation not supported (LP64 eqv: PREL64)"); return ELF::R_AARCH64_NONE; } else @@ -141,6 +144,16 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, case AArch64::fixup_aarch64_pcrel_adrp_imm21: if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC) return R_CLS(ADR_PREL_PG_HI21); + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) { + if (IsILP32) { + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 32-bit pcrel ADRP instruction " + "VK_ABS VK_NC"); + return ELF::R_AARCH64_NONE; + } else { + return ELF::R_AARCH64_ADR_PREL_PG_HI21_NC; + } + } if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) return R_CLS(ADR_GOT_PAGE); if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) @@ -168,7 +181,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, } } else { if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx)) - return ELF::R_AARCH64_NONE; + return ELF::R_AARCH64_NONE; switch ((unsigned)Fixup.getKind()) { case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); @@ -179,7 +192,9 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(ABS32); case FK_Data_8: if (IsILP32) { - Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(ABS64)); + Ctx.reportError(Fixup.getLoc(), + "ILP32 8 byte absolute data " + "relocation not supported (LP64 eqv: ABS64)"); return ELF::R_AARCH64_NONE; } else return ELF::R_AARCH64_ABS64; @@ -197,7 +212,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (RefKind == AArch64MCExpr::VK_TPREL_LO12) return R_CLS(TLSLE_ADD_TPREL_LO12); if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12) - return R_CLS(TLSDESC_ADD_LO12_NC); + return R_CLS(TLSDESC_ADD_LO12); if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return R_CLS(ADD_ABS_LO12_NC); @@ -245,15 +260,68 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(TLSLE_LDST32_TPREL_LO12); if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return R_CLS(TLSLE_LDST32_TPREL_LO12_NC); + if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) { + if (IsILP32) { + return ELF::R_AARCH64_P32_LD32_GOT_LO12_NC; + } else { + Ctx.reportError(Fixup.getLoc(), + "LP64 4 byte unchecked GOT load/store relocation " + "not supported (ILP32 eqv: LD32_GOT_LO12_NC"); + return ELF::R_AARCH64_NONE; + } + } + if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) { + if (IsILP32) { + Ctx.reportError(Fixup.getLoc(), + "ILP32 4 byte checked GOT load/store relocation " + "not supported (unchecked eqv: LD32_GOT_LO12_NC)"); + } else { + Ctx.reportError(Fixup.getLoc(), + "LP64 4 byte checked GOT load/store relocation " + "not supported (unchecked/ILP32 eqv: " + "LD32_GOT_LO12_NC)"); + } + return ELF::R_AARCH64_NONE; + } + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) { + if (IsILP32) { + return ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC; + } else { + Ctx.reportError(Fixup.getLoc(), + "LP64 32-bit load/store " + "relocation not supported (ILP32 eqv: " + "TLSIE_LD32_GOTTPREL_LO12_NC)"); + return ELF::R_AARCH64_NONE; + } + } + if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) { + if (IsILP32) { + return ELF::R_AARCH64_P32_TLSDESC_LD32_LO12; + } else { + Ctx.reportError(Fixup.getLoc(), + "LP64 4 byte TLSDESC load/store relocation " + "not supported (ILP32 eqv: TLSDESC_LD64_LO12)"); + return ELF::R_AARCH64_NONE; + } 
+ } Ctx.reportError(Fixup.getLoc(), - "invalid fixup for 32-bit load/store instruction"); + "invalid fixup for 32-bit load/store instruction " + "fixup_aarch64_ldst_imm12_scale4"); return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale8: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return R_CLS(LDST64_ABS_LO12_NC); - if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) - return R_CLS(LD64_GOT_LO12_NC); + if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) { + if (!IsILP32) { + return ELF::R_AARCH64_LD64_GOT_LO12_NC; + } else { + Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " + "relocation not supported (LP64 eqv: " + "LD64_GOT_LO12_NC)"); + return ELF::R_AARCH64_NONE; + } + } if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) return R_CLS(TLSLD_LDST64_DTPREL_LO12); if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) @@ -262,19 +330,40 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(TLSLE_LDST64_TPREL_LO12); if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return R_CLS(TLSLE_LDST64_TPREL_LO12_NC); - if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) - return IsILP32 ? ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC - : ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; - if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) - return IsILP32 ? ELF::R_AARCH64_P32_TLSDESC_LD32_LO12_NC - : ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) { + if (!IsILP32) { + return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; + } else { + Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " + "relocation not supported (LP64 eqv: " + "TLSIE_LD64_GOTTPREL_LO12_NC)"); + return ELF::R_AARCH64_NONE; + } + } + if (SymLoc == AArch64MCExpr::VK_TLSDESC) { + if (!IsILP32) { + return ELF::R_AARCH64_TLSDESC_LD64_LO12; + } else { + Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store " + "relocation not supported (LP64 eqv: " + "TLSDESC_LD64_LO12)"); + return ELF::R_AARCH64_NONE; + } + } Ctx.reportError(Fixup.getLoc(), "invalid fixup for 64-bit load/store instruction"); return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale16: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return R_CLS(LDST128_ABS_LO12_NC); + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return R_CLS(TLSLD_LDST128_DTPREL_LO12); + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return R_CLS(TLSLD_LDST128_DTPREL_LO12_NC); + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return R_CLS(TLSLE_LDST128_TPREL_LO12); + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return R_CLS(TLSLE_LDST128_TPREL_LO12_NC); Ctx.reportError(Fixup.getLoc(), "invalid fixup for 128-bit load/store instruction"); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 685907a..a0de3c3 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -14,27 +14,25 @@ //===----------------------------------------------------------------------===// #include "AArch64TargetStreamer.h" -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/StringExtras.h" +#include "AArch64WinCOFFStreamer.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include 
"llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/MC/MCWinCOFFStreamer.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" @@ -106,8 +104,8 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { + void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + bool) override { EmitA64MappingSymbol(); MCELFStreamer::EmitInstruction(Inst, STI); } @@ -180,6 +178,7 @@ private: DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols; ElfMappingSymbol LastEMS; }; + } // end anonymous namespace AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() { @@ -191,6 +190,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { } namespace llvm { + MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, @@ -212,6 +212,9 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { const Triple &TT = STI.getTargetTriple(); if (TT.isOSBinFormatELF()) return new AArch64TargetELFStreamer(S); + if (TT.isOSBinFormatCOFF()) + return new AArch64TargetWinCOFFStreamer(S); return nullptr; } -} + +} // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index 0f5b765..4293dcb 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -16,53 +16,47 @@ namespace llvm { namespace AArch64 { enum Fixups { - // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into - // an ADR instruction. + // A 21-bit pc-relative immediate inserted into an ADR instruction. fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind, - // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into - // an ADRP instruction. + // A 21-bit pc-relative immediate inserted into an ADRP instruction. fixup_aarch64_pcrel_adrp_imm21, - // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions. - // No alignment adjustment. All value bits are encoded. + // 12-bit fixup for add/sub instructions. No alignment adjustment. All value + // bits are encoded. fixup_aarch64_add_imm12, - // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and - // store instructions. + // unsigned 12-bit fixups for load and store instructions. fixup_aarch64_ldst_imm12_scale1, fixup_aarch64_ldst_imm12_scale2, fixup_aarch64_ldst_imm12_scale4, fixup_aarch64_ldst_imm12_scale8, fixup_aarch64_ldst_imm12_scale16, - // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by - // pc-relative loads and generates relocations directly when necessary. + // The high 19 bits of a 21-bit pc-relative immediate. 
Same encoding as + // fixup_aarch64_pcrel_adrhi, except this is used by pc-relative loads and + // generates relocations directly when necessary. fixup_aarch64_ldr_pcrel_imm19, // FIXME: comment fixup_aarch64_movw, - // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative - // immediate. + // The high 14 bits of a 21-bit pc-relative immediate. fixup_aarch64_pcrel_branch14, - // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by - // b.cc and generates relocations directly when necessary. + // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as + // fixup_aarch64_pcrel_adrhi, except this is use by b.cc and generates + // relocations directly when necessary. fixup_aarch64_pcrel_branch19, - // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative - // immediate. + // The high 26 bits of a 28-bit pc-relative immediate. fixup_aarch64_pcrel_branch26, - // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative - // immediate. Distinguished from branch26 only on ELF. + // The high 26 bits of a 28-bit pc-relative immediate. Distinguished from + // branch26 only on ELF. fixup_aarch64_pcrel_call26, - // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF - // R_AARCH64_TLSDESC_CALL relocation. + // zero-space placeholder for the ELF R_AARCH64_TLSDESC_CALL relocation. fixup_aarch64_tlsdesc_call, // Marker diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 8fc8223..c25bd8c 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -32,14 +32,15 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant( clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"))); AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant; + // We prefer NEON instructions to be printed in the short, Apple-specific + // form when targeting Darwin. + AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant; PrivateGlobalPrefix = "L"; PrivateLabelPrefix = "L"; SeparatorString = "%%"; CommentString = ";"; - PointerSize = CalleeSaveStackSlotSize = 8; + CodePointerSize = CalleeSaveStackSlotSize = 8; AlignmentIsInBytes = false; UsesELFSectionDirectiveForBSS = true; @@ -68,10 +69,11 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { if (T.getArch() == Triple::aarch64_be) IsLittleEndian = false; - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + // We prefer NEON instructions to be printed in the generic form when + // targeting ELF. + AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant; - PointerSize = 8; + CodePointerSize = 8; // ".comm align is in bytes but .align is pow-2." 
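
The AssemblerDialect change above replaces bare literals with the named Apple/Generic variants: Darwin defaults to the short Apple NEON syntax (fadd.2d v0, v1, v2) while ELF defaults to the generic form (fadd v0.2d, v1.2d, v2.2d), and an explicit -aarch64-neon-syntax choice overrides both. A small sketch of that selection, assuming Generic = 0 and Apple = 1 to match the literals being replaced:

    // Sketch of the dialect selection made readable above; the numeric values
    // are assumptions based on the old literals, not guaranteed by this file.
    #include <cstdio>

    enum AsmWriterVariantTy { Default = -1, Generic = 0, Apple = 1 };

    // Darwin prefers the short Apple syntax, ELF the generic form; a non-Default
    // command-line override wins on both.
    int pickDialect(bool isDarwin, AsmWriterVariantTy requested) {
      if (requested != Default)
        return requested;
      return isDarwin ? Apple : Generic;
    }

    int main() {
      std::printf("darwin default    -> %d (Apple)\n",   pickDialect(true, Default));
      std::printf("elf default       -> %d (Generic)\n", pickDialect(false, Default));
      std::printf("elf, forced apple -> %d\n",           pickDialect(false, Apple));
    }
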
AlignmentIsInBytes = false; @@ -98,3 +100,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { HasIdentDirective = true; } + +AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() { + CommentString = ";"; + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; +} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 253cd30..2d7107a 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H +#include "llvm/MC/MCAsmInfoCOFF.h" #include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/MCAsmInfoELF.h" @@ -33,6 +34,10 @@ struct AArch64MCAsmInfoELF : public MCAsmInfoELF { explicit AArch64MCAsmInfoELF(const Triple &T); }; +struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF { + explicit AArch64MCAsmInfoCOFF(); +}; + } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 62dfa59..33698d2 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -565,6 +565,9 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup)); return; + } else if (MI.getOpcode() == AArch64::CompilerBarrier) { + // This just prevents the compiler from reordering accesses, no actual code. + return; } uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index a540f49..97c92fa 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -62,6 +62,7 @@ StringRef AArch64MCExpr::getVariantKindName() const { case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; case VK_ABS_PAGE: return ""; + case VK_ABS_PAGE_NC: return ":pg_hi21_nc:"; case VK_GOT_PAGE: return ":got:"; case VK_GOT_LO12: return ":got_lo12:"; case VK_GOTTPREL_PAGE: return ":gottprel:"; diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index db36a65..3dbf0f8 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -62,6 +62,7 @@ public: // since a user would write ":lo12:"). 
VK_CALL = VK_ABS, VK_ABS_PAGE = VK_ABS | VK_PAGE, + VK_ABS_PAGE_NC = VK_ABS | VK_PAGE | VK_NC, VK_ABS_G3 = VK_ABS | VK_G3, VK_ABS_G2 = VK_ABS | VK_G2, VK_ABS_G2_S = VK_SABS | VK_G2, @@ -95,7 +96,7 @@ public: VK_TPREL_HI12 = VK_TPREL | VK_HI12, VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, - VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, + VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF, VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, VK_INVALID = 0xfff diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index e9d38d3..a255549 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -14,6 +14,7 @@ #include "AArch64MCTargetDesc.h" #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" +#include "AArch64WinCOFFStreamer.h" #include "InstPrinter/AArch64InstPrinter.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" @@ -59,8 +60,10 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) MAI = new AArch64MCAsmInfoDarwin(); + else if (TheTriple.isOSBinFormatCOFF()) + MAI = new AArch64MCAsmInfoCOFF(); else { - assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); + assert(TheTriple.isOSBinFormatELF() && "Invalid target"); MAI = new AArch64MCAsmInfoELF(TheTriple); } @@ -74,8 +77,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, CodeModel::Model &CM) { - assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) && - "Only expect Darwin and ELF targets"); + assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO() || + TT.isOSBinFormatCOFF()) && "Invalid target"); if (CM == CodeModel::Default) CM = CodeModel::Small; @@ -84,9 +87,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, // no matter how far away they are. else if (CM == CodeModel::JITDefault) CM = CodeModel::Large; - else if (CM != CodeModel::Small && CM != CodeModel::Large) - report_fatal_error( - "Only small and large code models are allowed on AArch64"); + else if (CM != CodeModel::Small && CM != CodeModel::Large) { + if (!TT.isOSFuchsia()) + report_fatal_error( + "Only small and large code models are allowed on AArch64"); + else if (CM != CodeModel::Kernel) + report_fatal_error( + "Only small, kernel, and large code models are allowed on AArch64"); + } } static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T, @@ -117,6 +125,14 @@ static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB, /*LabelSections*/ true); } +static MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible) { + return createAArch64WinCOFFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + IncrementalLinkerCompatible); +} + static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) { return new MCInstrAnalysis(Info); } @@ -149,6 +165,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { // Register the obj streamers. TargetRegistry::RegisterELFStreamer(*T, createELFStreamer); TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer); + TargetRegistry::RegisterCOFFStreamer(*T, createWinCOFFStreamer); // Register the obj target streamer. 
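
The adjustCodeGenOpts change above relaxes the old small-or-large rule so that Fuchsia targets may also use the kernel code model, while defaults still resolve to small (or large for JIT). A standalone restatement of that policy, with a plain enum standing in for llvm::CodeModel::Model:

    // Minimal restatement of the code-model policy introduced above, as a
    // self-contained function.  The enum is a stand-in, not the LLVM type.
    #include <stdexcept>
    #include <cassert>

    enum class CodeModel { Default, JITDefault, Small, Kernel, Medium, Large };

    // AArch64 supports only the small and large code models, plus the kernel
    // model on Fuchsia; defaults are resolved before that check.
    CodeModel adjustCodeModel(CodeModel CM, bool isFuchsia) {
      if (CM == CodeModel::Default)
        return CodeModel::Small;
      if (CM == CodeModel::JITDefault)
        return CodeModel::Large;  // JITed code may be placed arbitrarily far away.
      if (CM == CodeModel::Small || CM == CodeModel::Large)
        return CM;
      if (isFuchsia && CM == CodeModel::Kernel)
        return CM;
      throw std::invalid_argument("unsupported code model for AArch64");
    }

    int main() {
      assert(adjustCodeModel(CodeModel::Default, false) == CodeModel::Small);
      assert(adjustCodeModel(CodeModel::Kernel, true) == CodeModel::Kernel);
      try {
        adjustCodeModel(CodeModel::Medium, false);
      } catch (const std::invalid_argument &) {
        // expected: medium is rejected, mirroring report_fatal_error above
      }
    }
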
TargetRegistry::RegisterObjectTargetStreamer( diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 615d7da..1404926 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -60,6 +60,8 @@ MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS, uint32_t CPUType, uint32_t CPUSubtype); +MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS); + MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 53a6852..19b2576 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -10,20 +10,28 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFragment.h" #include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> +#include <cstdint> + using namespace llvm; namespace { + class AArch64MachObjectWriter : public MCMachObjectTargetWriter { bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, @@ -38,7 +46,8 @@ public: const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; }; -} + +} // end anonymous namespace bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, @@ -51,18 +60,18 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( return false; case FK_Data_1: - Log2Size = llvm::Log2_32(1); + Log2Size = Log2_32(1); return true; case FK_Data_2: - Log2Size = llvm::Log2_32(2); + Log2Size = Log2_32(2); return true; case FK_Data_4: - Log2Size = llvm::Log2_32(4); + Log2Size = Log2_32(4); if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); return true; case FK_Data_8: - Log2Size = llvm::Log2_32(8); + Log2Size = Log2_32(8); if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); return true; @@ -72,7 +81,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( case AArch64::fixup_aarch64_ldst_imm12_scale4: case AArch64::fixup_aarch64_ldst_imm12_scale8: case AArch64::fixup_aarch64_ldst_imm12_scale16: - Log2Size = llvm::Log2_32(4); + Log2Size = Log2_32(4); switch (Sym->getKind()) { default: return false; @@ -87,14 +96,13 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( return true; } case AArch64::fixup_aarch64_pcrel_adrp_imm21: - Log2Size = llvm::Log2_32(4); + Log2Size = Log2_32(4); // This encompasses the relocation for the whole 21-bit value. 
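
getAArch64FixupKindMachOInfo pairs each fixup kind with a Mach-O relocation type and a Log2Size; every instruction fixup describes a 4-byte instruction (hence Log2_32(4) == 2), while plain data fixups use the data width. A toy sketch of that pairing, using invented enum names rather than the MachO::ARM64_RELOC_* constants:

    // Illustrative sketch of the (relocation type, Log2Size) pairing computed
    // above; enums are invented stand-ins for the real fixup and reloc kinds.
    #include <cstdio>
    #include <utility>

    enum Fixup { PcRelAdrpImm21, PcRelBranch26, LdstImm12Scale8, Data8 };
    enum MachOReloc { Page21, Branch26, PageOff12, Unsigned };

    std::pair<MachOReloc, unsigned> machOInfo(Fixup F) {
      switch (F) {
      case PcRelAdrpImm21:  return {Page21, 2};    // ADRP: whole 21-bit value
      case PcRelBranch26:   return {Branch26, 2};  // B / BL
      case LdstImm12Scale8: return {PageOff12, 2}; // LDR Xt, [Xn, lo12]
      case Data8:           return {Unsigned, 3};  // .quad sym
      }
      return {Unsigned, 0};
    }

    int main() {
      auto [R, L] = machOInfo(PcRelAdrpImm21);
      std::printf("adrp -> reloc %d, Log2Size %u\n", R, L);
    }
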
switch (Sym->getKind()) { - default: { + default: Asm.getContext().reportError(Fixup.getLoc(), "ADR/ADRP relocations must be GOT relative"); return false; - } case MCSymbolRefExpr::VK_PAGE: RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); return true; @@ -108,7 +116,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( return true; case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: - Log2Size = llvm::Log2_32(4); + Log2Size = Log2_32(4); RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); return true; } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp new file mode 100644 index 0000000..31762b9 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -0,0 +1,104 @@ +//= AArch64WinCOFFObjectWriter.cpp - AArch64 Windows COFF Object Writer C++ =// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/COFF.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> + +using namespace llvm; + +namespace { + +class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { +public: + AArch64WinCOFFObjectWriter() + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {} + + ~AArch64WinCOFFObjectWriter() override = default; + + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsCrossSection, + const MCAsmBackend &MAB) const override; + + bool recordRelocation(const MCFixup &) const override; +}; + +} // end anonymous namespace + +unsigned AArch64WinCOFFObjectWriter::getRelocType( + MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, const MCAsmBackend &MAB) const { + auto Modifier = Target.isAbsolute() ? 
MCSymbolRefExpr::VK_None + : Target.getSymA()->getKind(); + + switch (static_cast<unsigned>(Fixup.getKind())) { + default: { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); + } + + case FK_Data_4: + switch (Modifier) { + default: + return COFF::IMAGE_REL_ARM64_ADDR32; + case MCSymbolRefExpr::VK_COFF_IMGREL32: + return COFF::IMAGE_REL_ARM64_ADDR32NB; + case MCSymbolRefExpr::VK_SECREL: + return COFF::IMAGE_REL_ARM64_SECREL; + } + + case FK_Data_8: + return COFF::IMAGE_REL_ARM64_ADDR64; + + case FK_SecRel_2: + return COFF::IMAGE_REL_ARM64_SECTION; + + case FK_SecRel_4: + return COFF::IMAGE_REL_ARM64_SECREL; + + case AArch64::fixup_aarch64_add_imm12: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A; + + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L; + + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21; + + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + return COFF::IMAGE_REL_ARM64_BRANCH26; + } +} + +bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { + return true; +} + +namespace llvm { + +MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) { + MCWinCOFFObjectTargetWriter *MOTW = new AArch64WinCOFFObjectWriter(); + return createWinCOFFObjectWriter(MOTW, OS); +} + +} // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp new file mode 100644 index 0000000..6c8da27 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp @@ -0,0 +1,37 @@ +//===-- AArch64WinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64WinCOFFStreamer.h" + +using namespace llvm; + +namespace { + +class AArch64WinCOFFStreamer : public MCWinCOFFStreamer { +public: + friend class AArch64TargetWinCOFFStreamer; + + AArch64WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE, + raw_pwrite_stream &OS) + : MCWinCOFFStreamer(C, AB, CE, OS) {} +}; +} // end anonymous namespace + +namespace llvm { +MCWinCOFFStreamer +*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible) { + auto *S = new AArch64WinCOFFStreamer(Context, MAB, *Emitter, OS); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); + return S; +} + +} // end llvm namespace diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h new file mode 100644 index 0000000..1b4fcd6 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h @@ -0,0 +1,43 @@ +//===-- AArch64WinCOFFStreamer.h - WinCOFF Streamer for AArch64 -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements WinCOFF streamer information for the AArch64 backend. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H + +#include "AArch64TargetStreamer.h" +#include "llvm/MC/MCWinCOFFStreamer.h" + +namespace { +class AArch64WinCOFFStreamer; + +class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer { +private: + AArch64WinCOFFStreamer &getStreamer(); + +public: + AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S) + : AArch64TargetStreamer(S) {} +}; + +} // end anonymous namespace + +namespace llvm { + +MCWinCOFFStreamer +*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible); +} // end llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index dcc3917..5d76681 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -266,82 +266,86 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { } } // end namespace AArch64CC +struct SysAlias { + const char *Name; + uint16_t Encoding; + FeatureBitset FeaturesRequired; + + SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {}; + SysAlias (const char *N, uint16_t E, FeatureBitset F) : + Name(N), Encoding(E), FeaturesRequired(F) {}; + + bool haveFeatures(FeatureBitset ActiveFeatures) const { + return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; + } + + FeatureBitset getRequiredFeatures() const { return FeaturesRequired; } +}; + +struct SysAliasReg : SysAlias { + bool NeedsReg; + SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {}; +}; + namespace AArch64AT{ - struct AT { - const char *Name; - uint16_t Encoding; + struct AT : 
SysAlias { + using SysAlias::SysAlias; }; - #define GET_AT_DECL #include "AArch64GenSystemOperands.inc" - } + namespace AArch64DB { - struct DB { - const char *Name; - uint16_t Encoding; + struct DB : SysAlias { + using SysAlias::SysAlias; }; - #define GET_DB_DECL #include "AArch64GenSystemOperands.inc" } namespace AArch64DC { - struct DC { - const char *Name; - uint16_t Encoding; + struct DC : SysAlias { + using SysAlias::SysAlias; }; - #define GET_DC_DECL #include "AArch64GenSystemOperands.inc" } namespace AArch64IC { - struct IC { - const char *Name; - uint16_t Encoding; - bool NeedsReg; + struct IC : SysAliasReg { + using SysAliasReg::SysAliasReg; }; #define GET_IC_DECL #include "AArch64GenSystemOperands.inc" } namespace AArch64ISB { - struct ISB { - const char *Name; - uint16_t Encoding; + struct ISB : SysAlias { + using SysAlias::SysAlias; }; #define GET_ISB_DECL #include "AArch64GenSystemOperands.inc" } namespace AArch64PRFM { - struct PRFM { - const char *Name; - uint16_t Encoding; + struct PRFM : SysAlias { + using SysAlias::SysAlias; }; #define GET_PRFM_DECL #include "AArch64GenSystemOperands.inc" } namespace AArch64PState { - struct PState { - const char *Name; - uint16_t Encoding; - FeatureBitset FeaturesRequired; - - bool haveFeatures(FeatureBitset ActiveFeatures) const { - return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; - } + struct PState : SysAlias{ + using SysAlias::SysAlias; }; #define GET_PSTATE_DECL #include "AArch64GenSystemOperands.inc" } namespace AArch64PSBHint { - struct PSB { - const char *Name; - uint16_t Encoding; + struct PSB : SysAlias { + using SysAlias::SysAlias; }; #define GET_PSB_DECL #include "AArch64GenSystemOperands.inc" @@ -451,10 +455,8 @@ namespace AArch64SysReg { } namespace AArch64TLBI { - struct TLBI { - const char *Name; - uint16_t Encoding; - bool NeedsReg; + struct TLBI : SysAliasReg { + using SysAliasReg::SysAliasReg; }; #define GET_TLBI_DECL #include "AArch64GenSystemOperands.inc" |
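
The AArch64BaseInfo.h change above collapses the per-namespace {Name, Encoding} structs into a shared SysAlias base (plus SysAliasReg for operand families that also carry a register), with each family inheriting the constructors. A self-contained sketch of the same pattern, with FeatureBitset modelled as a uint64_t and illustrative encodings:

    // Standalone illustration of the SysAlias refactoring: one base struct
    // carries the name/encoding/required-feature triple, and each operand
    // family (AT, DB, IC, TLBI, ...) becomes a thin subclass that inherits
    // the constructors.
    #include <cstdint>
    #include <cstdio>

    using FeatureBitset = uint64_t;  // stand-in for llvm::FeatureBitset

    struct SysAlias {
      const char *Name;
      uint16_t Encoding;
      FeatureBitset FeaturesRequired = 0;

      SysAlias(const char *N, uint16_t E) : Name(N), Encoding(E) {}
      SysAlias(const char *N, uint16_t E, FeatureBitset F)
          : Name(N), Encoding(E), FeaturesRequired(F) {}

      // An alias is usable only if every feature it needs is enabled.
      bool haveFeatures(FeatureBitset Active) const {
        return (FeaturesRequired & Active) == FeaturesRequired;
      }
    };

    // Families whose instructions also take a register operand add one flag.
    struct SysAliasReg : SysAlias {
      bool NeedsReg;
      SysAliasReg(const char *N, uint16_t E, bool R)
          : SysAlias(N, E), NeedsReg(R) {}
    };

    struct AT : SysAlias { using SysAlias::SysAlias; };            // address translate
    struct TLBI : SysAliasReg { using SysAliasReg::SysAliasReg; }; // TLB invalidate

    int main() {
      constexpr FeatureBitset FeatRAS = 1u << 0;   // invented feature bit
      AT S1E1RP{"s1e1rp", 0x2400, FeatRAS};        // encoding is illustrative
      TLBI VMALLE1{"vmalle1", 0x41C3, /*NeedsReg=*/false};
      std::printf("%s usable without RAS? %d\n", S1E1RP.Name, S1E1RP.haveFeatures(0));
      std::printf("%s needs a register?   %d\n", VMALLE1.Name, VMALLE1.NeedsReg);
    }
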