Diffstat (limited to 'contrib/llvm/lib/Target/AArch64')
80 files changed, 75760 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h
new file mode 100644
index 0000000..21106c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.h
@@ -0,0 +1,49 @@
+//==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AArch64 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AArch64TargetMachine;
+class FunctionPass;
+class MachineFunctionPass;
+
+FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64ConditionalCompares();
+FunctionPass *createAArch64AdvSIMDScalar();
+FunctionPass *createAArch64BranchRelaxation();
+FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel);
+FunctionPass *createAArch64StorePairSuppressPass();
+FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64LoadStoreOptimizationPass();
+ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64ConditionOptimizerPass();
+FunctionPass *createAArch64AddressTypePromotionPass();
+FunctionPass *createAArch64A57FPLoadBalancing();
+FunctionPass *createAArch64A53Fix835769();
+
+FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
+
+FunctionPass *createAArch64CollectLOHPass();
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
new file mode 100644
index 0000000..0bff9b5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -0,0 +1,183 @@
+//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// AArch64 Subtarget features.
+// + +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", + "Enable ARMv8 FP">; + +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; + +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable cryptographic instructions">; + +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable ARMv8 CRC-32 checksum instructions">; + +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable ARMv8 PMUv3 Performance Monitors extension">; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Full FP16", [FeatureFPARMv8]>; + +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + +/// Cyclone has register move instructions which are "free". +def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", + "Has zero-cycle register moves">; + +/// Cyclone has instructions which zero registers for "free". +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true", + "Reserve X18, making it unavailable " + "as a GPR">; + +//===----------------------------------------------------------------------===// +// Architectures. +// + +def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", + "Support ARM v8.1a instructions", [FeatureCRC]>; + +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", [HasV8_1aOps]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "AArch64RegisterInfo.td" +include "AArch64CallingConvention.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "AArch64Schedule.td" +include "AArch64InstrInfo.td" + +def AArch64InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// AArch64 Processors supported. 
+// +include "AArch64SchedA53.td" +include "AArch64SchedA57.td" +include "AArch64SchedCyclone.td" + +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + +def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", + "Cortex-A53 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + +def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", + "Cortex-A57 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + +def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", + "Cyclone", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon, + FeatureZCRegMove, FeatureZCZeroing]>; + +def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, + FeatureNEON, + FeatureCRC, + FeaturePerfMon]>; + +// FIXME: Cortex-A35 is currently modelled as a Cortex-A53 +def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; +def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; +// FIXME: Cortex-A72 is currently modelled as an Cortex-A57. +def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; +def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; + +//===----------------------------------------------------------------------===// +// Assembly parser +//===----------------------------------------------------------------------===// + +def GenericAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "generic"; + string BreakCharacters = "."; +} + +def AppleAsmParserVariant : AsmParserVariant { + int Variant = 1; + string Name = "apple-neon"; + string BreakCharacters = "."; +} + +//===----------------------------------------------------------------------===// +// Assembly printer +//===----------------------------------------------------------------------===// +// AArch64 Uses the MC printer for asm output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. +def GenericAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int PassSubtarget = 1; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def AppleAsmWriter : AsmWriter { + let AsmWriterClassName = "AppleInstPrinter"; + int PassSubtarget = 1; + int Variant = 1; + int isMCAsmWriter = 1; +} + +//===----------------------------------------------------------------------===// +// Target Declaration +//===----------------------------------------------------------------------===// + +def AArch64 : Target { + let InstructionSet = AArch64InstrInfo; + let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; + let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp new file mode 100644 index 0000000..d215d9e --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -0,0 +1,235 @@ +//===-- AArch64A53Fix835769.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// This pass changes code to work around Cortex-A53 erratum 835769. +// It works around it by inserting a nop instruction in code sequences that +// in some circumstances may trigger the erratum. +// It inserts a nop instruction between a sequence of the following 2 classes +// of instructions: +// instr 1: mem-instr (including loads, stores and prefetches). +// instr 2: non-SIMD integer multiply-accumulate writing 64-bit X registers. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-fix-cortex-a53-835769" + +STATISTIC(NumNopsAdded, "Number of Nops added to work around erratum 835769"); + +//===----------------------------------------------------------------------===// +// Helper functions + +// Is the instruction a match for the instruction that comes first in the +// sequence of instructions that can trigger the erratum? +static bool isFirstInstructionInSequence(MachineInstr *MI) { + // Must return true if this instruction is a load, a store or a prefetch. + switch (MI->getOpcode()) { + case AArch64::PRFMl: + case AArch64::PRFMroW: + case AArch64::PRFMroX: + case AArch64::PRFMui: + case AArch64::PRFUMi: + return true; + default: + return MI->mayLoadOrStore(); + } +} + +// Is the instruction a match for the instruction that comes second in the +// sequence that can trigger the erratum? +static bool isSecondInstructionInSequence(MachineInstr *MI) { + // Must return true for non-SIMD integer multiply-accumulates, writing + // to a 64-bit register. + switch (MI->getOpcode()) { + // Erratum cannot be triggered when the destination register is 32 bits, + // therefore only include the following. + case AArch64::MSUBXrrr: + case AArch64::MADDXrrr: + case AArch64::SMADDLrrr: + case AArch64::SMSUBLrrr: + case AArch64::UMADDLrrr: + case AArch64::UMSUBLrrr: + // Erratum can only be triggered by multiply-adds, not by regular + // non-accumulating multiplies, i.e. 
when Ra=XZR='11111' + return MI->getOperand(3).getReg() != AArch64::XZR; + default: + return false; + } +} + + +//===----------------------------------------------------------------------===// + +namespace { +class AArch64A53Fix835769 : public MachineFunctionPass { + const TargetInstrInfo *TII; + +public: + static char ID; + explicit AArch64A53Fix835769() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { + return "Workaround A53 erratum 835769 pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool runOnBasicBlock(MachineBasicBlock &MBB); +}; +char AArch64A53Fix835769::ID = 0; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// + +bool +AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { + DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n"); + bool Changed = false; + TII = F.getSubtarget().getInstrInfo(); + + for (auto &MBB : F) { + Changed |= runOnBasicBlock(MBB); + } + return Changed; +} + +// Return the block that was fallen through to get to MBB, if any, +// otherwise nullptr. +static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + // Get the previous machine basic block in the function. + MachineFunction::iterator MBBI(MBB); + + // Can't go off top of function. + if (MBBI == MBB->getParent()->begin()) + return nullptr; + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 2> Cond; + + MachineBasicBlock *PrevBB = &*std::prev(MBBI); + for (MachineBasicBlock *S : MBB->predecessors()) + if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && + !TBB && !FBB) + return S; + + return nullptr; +} + +// Iterate through fallen through blocks trying to find a previous non-pseudo if +// there is one, otherwise return nullptr. Only look for instructions in +// previous blocks, not the current block, since we only use this to look at +// previous blocks. +static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB, + const TargetInstrInfo *TII) { + MachineBasicBlock *FMBB = &MBB; + + // If there is no non-pseudo in the current block, loop back around and try + // the previous block (if there is one). + while ((FMBB = getBBFallenThrough(FMBB, TII))) { + for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend())) + if (!I.isPseudo()) + return &I; + } + + // There was no previous non-pseudo in the fallen through blocks + return nullptr; +} + +static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI, + const TargetInstrInfo *TII) { + // If we are the first instruction of the block, put the NOP at the end of + // the previous fallthrough block + if (MI == &MBB.front()) { + MachineInstr *I = getLastNonPseudo(MBB, TII); + assert(I && "Expected instruction"); + DebugLoc DL = I->getDebugLoc(); + BuildMI(I->getParent(), DL, TII->get(AArch64::HINT)).addImm(0); + } + else { + DebugLoc DL = MI->getDebugLoc(); + BuildMI(MBB, MI, DL, TII->get(AArch64::HINT)).addImm(0); + } + + ++NumNopsAdded; +} + +bool +AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n"); + + // First, scan the basic block, looking for a sequence of 2 instructions + // that match the conditions under which the erratum may trigger. 
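  // As a concrete illustration (editor's sketch; registers chosen arbitrarily),
  // a sequence such as
  //   ldr  x1, [x2]            // instr 1: a load, store or prefetch
  //   madd x0, x3, x4, x5      // instr 2: 64-bit integer multiply-accumulate
  // matches, and the pass breaks it by emitting a NOP (HINT #0) between the
  // two instructions:
  //   ldr  x1, [x2]
  //   nop
  //   madd x0, x3, x4, x5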
+ + // List of terminating instructions in matching sequences + std::vector<MachineInstr*> Sequences; + unsigned Idx = 0; + MachineInstr *PrevInstr = nullptr; + + // Try and find the last non-pseudo instruction in any fallen through blocks, + // if there isn't one, then we use nullptr to represent that. + PrevInstr = getLastNonPseudo(MBB, TII); + + for (auto &MI : MBB) { + MachineInstr *CurrInstr = &MI; + DEBUG(dbgs() << " Examining: " << MI); + if (PrevInstr) { + DEBUG(dbgs() << " PrevInstr: " << *PrevInstr + << " CurrInstr: " << *CurrInstr + << " isFirstInstructionInSequence(PrevInstr): " + << isFirstInstructionInSequence(PrevInstr) << "\n" + << " isSecondInstructionInSequence(CurrInstr): " + << isSecondInstructionInSequence(CurrInstr) << "\n"); + if (isFirstInstructionInSequence(PrevInstr) && + isSecondInstructionInSequence(CurrInstr)) { + DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n"); + Sequences.push_back(CurrInstr); + } + } + if (!CurrInstr->isPseudo()) + PrevInstr = CurrInstr; + ++Idx; + } + + DEBUG(dbgs() << "Scan complete, " << Sequences.size() + << " occurrences of pattern found.\n"); + + // Then update the basic block, inserting nops between the detected sequences. + for (auto &MI : Sequences) { + Changed = true; + insertNopBeforeInstruction(MBB, MI, TII); + } + + return Changed; +} + +// Factory function used by AArch64TargetMachine to add the pass to +// the passmanager. +FunctionPass *llvm::createAArch64A53Fix835769() { + return new AArch64A53Fix835769(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp new file mode 100644 index 0000000..79a84ad --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -0,0 +1,735 @@ +//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// For best-case performance on Cortex-A57, we should try to use a balanced +// mix of odd and even D-registers when performing a critical sequence of +// independent, non-quadword FP/ASIMD floating-point multiply or +// multiply-accumulate operations. +// +// This pass attempts to detect situations where the register allocation may +// adversely affect this load balancing and to change the registers used so as +// to better utilize the CPU. +// +// Ideally we'd just take each multiply or multiply-accumulate in turn and +// allocate it alternating even or odd registers. However, multiply-accumulates +// are most efficiently performed in the same functional unit as their +// accumulation operand. Therefore this pass tries to find maximal sequences +// ("Chains") of multiply-accumulates linked via their accumulation operand, +// and assign them all the same "color" (oddness/evenness). +// +// This optimization affects S-register and D-register floating point +// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and +// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are +// not affected. 
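// As a rough illustration (editor's sketch; register numbers are arbitrary),
// two independent accumulation chains such as
//   fmul  d0, d2, d3              fmul  d1, d6, d7
//   fmadd d0, d4, d5, d0          fmadd d1, d8, d9, d1
// are best kept on different "colors" - one chain entirely in even D-registers,
// the other entirely in odd D-registers - so both execution units stay fed;
// restoring that balance after register allocation is what this pass does.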
+//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <list> +using namespace llvm; + +#define DEBUG_TYPE "aarch64-a57-fp-load-balancing" + +// Enforce the algorithm to use the scavenged register even when the original +// destination register is the correct color. Used for testing. +static cl::opt<bool> +TransformAll("aarch64-a57-fp-load-balancing-force-all", + cl::desc("Always modify dest registers regardless of color"), + cl::init(false), cl::Hidden); + +// Never use the balance information obtained from chains - return a specific +// color always. Used for testing. +static cl::opt<unsigned> +OverrideBalance("aarch64-a57-fp-load-balancing-override", + cl::desc("Ignore balance information, always return " + "(1: Even, 2: Odd)."), + cl::init(0), cl::Hidden); + +//===----------------------------------------------------------------------===// +// Helper functions + +// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs? +static bool isMul(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AArch64::FMULSrr: + case AArch64::FNMULSrr: + case AArch64::FMULDrr: + case AArch64::FNMULDrr: + return true; + default: + return false; + } +} + +// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs? +static bool isMla(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AArch64::FMSUBSrrr: + case AArch64::FMADDSrrr: + case AArch64::FNMSUBSrrr: + case AArch64::FNMADDSrrr: + case AArch64::FMSUBDrrr: + case AArch64::FMADDDrrr: + case AArch64::FNMSUBDrrr: + case AArch64::FNMADDDrrr: + return true; + default: + return false; + } +} + +namespace llvm { +static void initializeAArch64A57FPLoadBalancingPass(PassRegistry &); +} + +//===----------------------------------------------------------------------===// + +namespace { +/// A "color", which is either even or odd. Yes, these aren't really colors +/// but the algorithm is conceptually doing two-color graph coloring. 
+enum class Color { Even, Odd }; +#ifndef NDEBUG +static const char *ColorNames[2] = { "Even", "Odd" }; +#endif + +class Chain; + +class AArch64A57FPLoadBalancing : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI; + RegisterClassInfo RCI; + +public: + static char ID; + explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) { + initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { + return "A57 FP Anti-dependency breaker"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool runOnBasicBlock(MachineBasicBlock &MBB); + bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB, + int &Balance); + bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB); + int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB); + void scanInstruction(MachineInstr *MI, unsigned Idx, + std::map<unsigned, Chain*> &Active, + std::vector<std::unique_ptr<Chain>> &AllChains); + void maybeKillChain(MachineOperand &MO, unsigned Idx, + std::map<unsigned, Chain*> &RegChains); + Color getColor(unsigned Register); + Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L); +}; +} + +char AArch64A57FPLoadBalancing::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE, + "AArch64 A57 FP Load-Balancing", false, false) +INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE, + "AArch64 A57 FP Load-Balancing", false, false) + +namespace { +/// A Chain is a sequence of instructions that are linked together by +/// an accumulation operand. For example: +/// +/// fmul d0<def>, ? +/// fmla d1<def>, ?, ?, d0<kill> +/// fmla d2<def>, ?, ?, d1<kill> +/// +/// There may be other instructions interleaved in the sequence that +/// do not belong to the chain. These other instructions must not use +/// the "chain" register at any point. +/// +/// We currently only support chains where the "chain" operand is killed +/// at each link in the chain for simplicity. +/// A chain has three important instructions - Start, Last and Kill. +/// * The start instruction is the first instruction in the chain. +/// * Last is the final instruction in the chain. +/// * Kill may or may not be defined. If defined, Kill is the instruction +/// where the outgoing value of the Last instruction is killed. +/// This information is important as if we know the outgoing value is +/// killed with no intervening uses, we can safely change its register. +/// +/// Without a kill instruction, we must assume the outgoing value escapes +/// beyond our model and either must not change its register or must +/// create a fixup FMOV to keep the old register value consistent. +/// +class Chain { +public: + /// The important (marker) instructions. + MachineInstr *StartInst, *LastInst, *KillInst; + /// The index, from the start of the basic block, that each marker + /// appears. These are stored so we can do quick interval tests. + unsigned StartInstIdx, LastInstIdx, KillInstIdx; + /// All instructions in the chain. + std::set<MachineInstr*> Insts; + /// True if KillInst cannot be modified. If this is true, + /// we cannot change LastInst's outgoing register. + /// This will be true for tied values and regmasks. + bool KillIsImmutable; + /// The "color" of LastInst. 
This will be the preferred chain color, + /// as changing intermediate nodes is easy but changing the last + /// instruction can be more tricky. + Color LastColor; + + Chain(MachineInstr *MI, unsigned Idx, Color C) + : StartInst(MI), LastInst(MI), KillInst(nullptr), + StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0), + LastColor(C) { + Insts.insert(MI); + } + + /// Add a new instruction into the chain. The instruction's dest operand + /// has the given color. + void add(MachineInstr *MI, unsigned Idx, Color C) { + LastInst = MI; + LastInstIdx = Idx; + LastColor = C; + assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && + "Chain: broken invariant. A Chain can only be killed after its last " + "def"); + + Insts.insert(MI); + } + + /// Return true if MI is a member of the chain. + bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; } + + /// Return the number of instructions in the chain. + unsigned size() const { + return Insts.size(); + } + + /// Inform the chain that its last active register (the dest register of + /// LastInst) is killed by MI with no intervening uses or defs. + void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) { + KillInst = MI; + KillInstIdx = Idx; + KillIsImmutable = Immutable; + assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && + "Chain: broken invariant. A Chain can only be killed after its last " + "def"); + } + + /// Return the first instruction in the chain. + MachineInstr *getStart() const { return StartInst; } + /// Return the last instruction in the chain. + MachineInstr *getLast() const { return LastInst; } + /// Return the "kill" instruction (as set with setKill()) or NULL. + MachineInstr *getKill() const { return KillInst; } + /// Return an instruction that can be used as an iterator for the end + /// of the chain. This is the maximum of KillInst (if set) and LastInst. + MachineBasicBlock::iterator getEnd() const { + return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst); + } + + /// Can the Kill instruction (assuming one exists) be modified? + bool isKillImmutable() const { return KillIsImmutable; } + + /// Return the preferred color of this chain. + Color getPreferredColor() { + if (OverrideBalance != 0) + return OverrideBalance == 1 ? Color::Even : Color::Odd; + return LastColor; + } + + /// Return true if this chain (StartInst..KillInst) overlaps with Other. + bool rangeOverlapsWith(const Chain &Other) const { + unsigned End = KillInst ? KillInstIdx : LastInstIdx; + unsigned OtherEnd = Other.KillInst ? + Other.KillInstIdx : Other.LastInstIdx; + + return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End; + } + + /// Return true if this chain starts before Other. + bool startsBefore(const Chain *Other) const { + return StartInstIdx < Other->StartInstIdx; + } + + /// Return true if the group will require a fixup MOV at the end. + bool requiresFixup() const { + return (getKill() && isKillImmutable()) || !getKill(); + } + + /// Return a simple string representation of the chain. 
+ std::string str() const { + std::string S; + raw_string_ostream OS(S); + + OS << "{"; + StartInst->print(OS, /* SkipOpers= */true); + OS << " -> "; + LastInst->print(OS, /* SkipOpers= */true); + if (KillInst) { + OS << " (kill @ "; + KillInst->print(OS, /* SkipOpers= */true); + OS << ")"; + } + OS << "}"; + + return OS.str(); + } + +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// + +bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { + // Don't do anything if this isn't an A53 or A57. + if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() || + F.getSubtarget<AArch64Subtarget>().isCortexA57())) + return false; + + bool Changed = false; + DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n"); + + MRI = &F.getRegInfo(); + TRI = F.getRegInfo().getTargetRegisterInfo(); + RCI.runOnMachineFunction(F); + + for (auto &MBB : F) { + Changed |= runOnBasicBlock(MBB); + } + + return Changed; +} + +bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n"); + + // First, scan the basic block producing a set of chains. + + // The currently "active" chains - chains that can be added to and haven't + // been killed yet. This is keyed by register - all chains can only have one + // "link" register between each inst in the chain. + std::map<unsigned, Chain*> ActiveChains; + std::vector<std::unique_ptr<Chain>> AllChains; + unsigned Idx = 0; + for (auto &MI : MBB) + scanInstruction(&MI, Idx++, ActiveChains, AllChains); + + DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n"); + + // Group the chains into disjoint sets based on their liveness range. This is + // a poor-man's version of graph coloring. Ideally we'd create an interference + // graph and perform full-on graph coloring on that, but; + // (a) That's rather heavyweight for only two colors. + // (b) We expect multiple disjoint interference regions - in practice the live + // range of chains is quite small and they are clustered between loads + // and stores. + EquivalenceClasses<Chain*> EC; + for (auto &I : AllChains) + EC.insert(I.get()); + + for (auto &I : AllChains) + for (auto &J : AllChains) + if (I != J && I->rangeOverlapsWith(*J)) + EC.unionSets(I.get(), J.get()); + DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n"); + + // Now we assume that every member of an equivalence class interferes + // with every other member of that class, and with no members of other classes. + + // Convert the EquivalenceClasses to a simpler set of sets. + std::vector<std::vector<Chain*> > V; + for (auto I = EC.begin(), E = EC.end(); I != E; ++I) { + std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end()); + if (Cs.empty()) continue; + V.push_back(std::move(Cs)); + } + + // Now we have a set of sets, order them by start address so + // we can iterate over them sequentially. + std::sort(V.begin(), V.end(), + [](const std::vector<Chain*> &A, + const std::vector<Chain*> &B) { + return A.front()->startsBefore(B.front()); + }); + + // As we only have two colors, we can track the global (BB-level) balance of + // odds versus evens. We aim to keep this near zero to keep both execution + // units fed. + // Positive means we're even-heavy, negative we're odd-heavy. 
+ // + // FIXME: If chains have interdependencies, for example: + // mul r0, r1, r2 + // mul r3, r0, r1 + // We do not model this and may color each one differently, assuming we'll + // get ILP when we obviously can't. This hasn't been seen to be a problem + // in practice so far, so we simplify the algorithm by ignoring it. + int Parity = 0; + + for (auto &I : V) + Changed |= colorChainSet(std::move(I), MBB, Parity); + + return Changed; +} + +Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor, + std::vector<Chain*> &L) { + if (L.empty()) + return nullptr; + + // We try and get the best candidate from L to color next, given that our + // preferred color is "PreferredColor". L is ordered from larger to smaller + // chains. It is beneficial to color the large chains before the small chains, + // but if we can't find a chain of the maximum length with the preferred color, + // we fuzz the size and look for slightly smaller chains before giving up and + // returning a chain that must be recolored. + + // FIXME: Does this need to be configurable? + const unsigned SizeFuzz = 1; + unsigned MinSize = L.front()->size() - SizeFuzz; + for (auto I = L.begin(), E = L.end(); I != E; ++I) { + if ((*I)->size() <= MinSize) { + // We've gone past the size limit. Return the previous item. + Chain *Ch = *--I; + L.erase(I); + return Ch; + } + + if ((*I)->getPreferredColor() == PreferredColor) { + Chain *Ch = *I; + L.erase(I); + return Ch; + } + } + + // Bailout case - just return the first item. + Chain *Ch = L.front(); + L.erase(L.begin()); + return Ch; +} + +bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, + MachineBasicBlock &MBB, + int &Parity) { + bool Changed = false; + DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n"); + + // Sort by descending size order so that we allocate the most important + // sets first. + // Tie-break equivalent sizes by sorting chains requiring fixups before + // those without fixups. The logic here is that we should look at the + // chains that we cannot change before we look at those we can, + // so the parity counter is updated and we know what color we should + // change them to! + // Final tie-break with instruction order so pass output is stable (i.e. not + // dependent on malloc'd pointer values). + std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) { + if (G1->size() != G2->size()) + return G1->size() > G2->size(); + if (G1->requiresFixup() != G2->requiresFixup()) + return G1->requiresFixup() > G2->requiresFixup(); + // Make sure startsBefore() produces a stable final order. + assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) && + "Starts before not total order!"); + return G1->startsBefore(G2); + }); + + Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd; + while (Chain *G = getAndEraseNext(PreferredColor, GV)) { + // Start off by assuming we'll color to our own preferred color. + Color C = PreferredColor; + if (Parity == 0) + // But if we really don't care, use the chain's preferred color. + C = G->getPreferredColor(); + + DEBUG(dbgs() << " - Parity=" << Parity << ", Color=" + << ColorNames[(int)C] << "\n"); + + // If we'll need a fixup FMOV, don't bother. Testing has shown that this + // happens infrequently and when it does it has at least a 50% chance of + // slowing code down instead of speeding it up. 
+ if (G->requiresFixup() && C != G->getPreferredColor()) { + C = G->getPreferredColor(); + DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; " + "color remains " << ColorNames[(int)C] << "\n"); + } + + Changed |= colorChain(G, C, MBB); + + Parity += (C == Color::Even) ? G->size() : -G->size(); + PreferredColor = Parity < 0 ? Color::Even : Color::Odd; + } + + return Changed; +} + +int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, + MachineBasicBlock &MBB) { + RegScavenger RS; + RS.enterBasicBlock(&MBB); + RS.forward(MachineBasicBlock::iterator(G->getStart())); + + // Can we find an appropriate register that is available throughout the life + // of the chain? + unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; + BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); + for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); + I != E; ++I) { + RS.forward(I); + AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID)); + + // Remove any registers clobbered by a regmask or any def register that is + // immediately dead. + for (auto J : I->operands()) { + if (J.isRegMask()) + AvailableRegs.clearBitsNotInMask(J.getRegMask()); + + if (J.isReg() && J.isDef()) { + MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true); + if (J.isDead()) + for (; AI.isValid(); ++AI) + AvailableRegs.reset(*AI); +#ifndef NDEBUG + else + for (; AI.isValid(); ++AI) + assert(!AvailableRegs[*AI] && + "Non-dead def should have been removed by now!"); +#endif + } + } + } + + // Make sure we allocate in-order, to get the cheapest registers first. + auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID)); + for (auto Reg : Ord) { + if (!AvailableRegs[Reg]) + continue; + if ((C == Color::Even && (Reg % 2) == 0) || + (C == Color::Odd && (Reg % 2) == 1)) + return Reg; + } + + return -1; +} + +bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, + MachineBasicBlock &MBB) { + bool Changed = false; + DEBUG(dbgs() << " - colorChain(" << G->str() << ", " + << ColorNames[(int)C] << ")\n"); + + // Try and obtain a free register of the right class. Without a register + // to play with we cannot continue. + int Reg = scavengeRegister(G, C, MBB); + if (Reg == -1) { + DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n"); + return false; + } + DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n"); + + std::map<unsigned, unsigned> Substs; + for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); + I != E; ++I) { + if (!G->contains(I) && + (&*I != G->getKill() || G->isKillImmutable())) + continue; + + // I is a member of G, or I is a mutable instruction that kills G. + + std::vector<unsigned> ToErase; + for (auto &U : I->operands()) { + if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) { + unsigned OrigReg = U.getReg(); + U.setReg(Substs[OrigReg]); + if (U.isKill()) + // Don't erase straight away, because there may be other operands + // that also reference this substitution! + ToErase.push_back(OrigReg); + } else if (U.isRegMask()) { + for (auto J : Substs) { + if (U.clobbersPhysReg(J.first)) + ToErase.push_back(J.first); + } + } + } + // Now it's safe to remove the substs identified earlier. + for (auto J : ToErase) + Substs.erase(J); + + // Only change the def if this isn't the last instruction. 
+ if (&*I != G->getKill()) { + MachineOperand &MO = I->getOperand(0); + + bool Change = TransformAll || getColor(MO.getReg()) != C; + if (G->requiresFixup() && &*I == G->getLast()) + Change = false; + + if (Change) { + Substs[MO.getReg()] = Reg; + MO.setReg(Reg); + + Changed = true; + } + } + } + assert(Substs.size() == 0 && "No substitutions should be left active!"); + + if (G->getKill()) { + DEBUG(dbgs() << " - Kill instruction seen.\n"); + } else { + // We didn't have a kill instruction, but we didn't seem to need to change + // the destination register anyway. + DEBUG(dbgs() << " - Destination register not changed.\n"); + } + return Changed; +} + +void AArch64A57FPLoadBalancing::scanInstruction( + MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain *> &ActiveChains, + std::vector<std::unique_ptr<Chain>> &AllChains) { + // Inspect "MI", updating ActiveChains and AllChains. + + if (isMul(MI)) { + + for (auto &I : MI->uses()) + maybeKillChain(I, Idx, ActiveChains); + for (auto &I : MI->defs()) + maybeKillChain(I, Idx, ActiveChains); + + // Create a new chain. Multiplies don't require forwarding so can go on any + // unit. + unsigned DestReg = MI->getOperand(0).getReg(); + + DEBUG(dbgs() << "New chain started for register " + << TRI->getName(DestReg) << " at " << *MI); + + auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); + ActiveChains[DestReg] = G.get(); + AllChains.push_back(std::move(G)); + + } else if (isMla(MI)) { + + // It is beneficial to keep MLAs on the same functional unit as their + // accumulator operand. + unsigned DestReg = MI->getOperand(0).getReg(); + unsigned AccumReg = MI->getOperand(3).getReg(); + + maybeKillChain(MI->getOperand(1), Idx, ActiveChains); + maybeKillChain(MI->getOperand(2), Idx, ActiveChains); + if (DestReg != AccumReg) + maybeKillChain(MI->getOperand(0), Idx, ActiveChains); + + if (ActiveChains.find(AccumReg) != ActiveChains.end()) { + DEBUG(dbgs() << "Chain found for accumulator register " + << TRI->getName(AccumReg) << " in MI " << *MI); + + // For simplicity we only chain together sequences of MULs/MLAs where the + // accumulator register is killed on each instruction. This means we don't + // need to track other uses of the registers we want to rewrite. + // + // FIXME: We could extend to handle the non-kill cases for more coverage. + if (MI->getOperand(3).isKill()) { + // Add to chain. + DEBUG(dbgs() << "Instruction was successfully added to chain.\n"); + ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg)); + // Handle cases where the destination is not the same as the accumulator. + if (DestReg != AccumReg) { + ActiveChains[DestReg] = ActiveChains[AccumReg]; + ActiveChains.erase(AccumReg); + } + return; + } + + DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't " + << "marked <kill>!\n"); + maybeKillChain(MI->getOperand(3), Idx, ActiveChains); + } + + DEBUG(dbgs() << "Creating new chain for dest register " + << TRI->getName(DestReg) << "\n"); + auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); + ActiveChains[DestReg] = G.get(); + AllChains.push_back(std::move(G)); + + } else { + + // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs + // lists. 
+ for (auto &I : MI->uses()) + maybeKillChain(I, Idx, ActiveChains); + for (auto &I : MI->defs()) + maybeKillChain(I, Idx, ActiveChains); + + } +} + +void AArch64A57FPLoadBalancing:: +maybeKillChain(MachineOperand &MO, unsigned Idx, + std::map<unsigned, Chain*> &ActiveChains) { + // Given an operand and the set of active chains (keyed by register), + // determine if a chain should be ended and remove from ActiveChains. + MachineInstr *MI = MO.getParent(); + + if (MO.isReg()) { + + // If this is a KILL of a current chain, record it. + if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) { + DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg()) + << "\n"); + ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied()); + } + ActiveChains.erase(MO.getReg()); + + } else if (MO.isRegMask()) { + + for (auto I = ActiveChains.begin(), E = ActiveChains.end(); + I != E;) { + if (MO.clobbersPhysReg(I->first)) { + DEBUG(dbgs() << "Kill (regmask) seen for chain " + << TRI->getName(I->first) << "\n"); + I->second->setKill(MI, Idx, /*Immutable=*/true); + ActiveChains.erase(I++); + } else + ++I; + } + + } +} + +Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) { + if ((TRI->getEncodingValue(Reg) % 2) == 0) + return Color::Even; + else + return Color::Odd; +} + +// Factory function used by AArch64TargetMachine to add the pass to the passmanager. +FunctionPass *llvm::createAArch64A57FPLoadBalancing() { + return new AArch64A57FPLoadBalancing(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp new file mode 100644 index 0000000..3afcdfb --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -0,0 +1,494 @@ +//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to promote the computations use to obtained a sign extended +// value used into memory accesses. +// E.g. +// a = add nsw i32 b, 3 +// d = sext i32 a to i64 +// e = getelementptr ..., i64 d +// +// => +// f = sext i32 b to i64 +// a = add nsw i64 f, 3 +// e = getelementptr ..., i64 a +// +// This is legal to do if the computations are marked with either nsw or nuw +// markers. +// Moreover, the current heuristic is simple: it does not create new sext +// operations, i.e., it gives up when a sext would have forked (e.g., if +// a = add i32 b, c, two sexts are required to promote the computation). +// +// FIXME: This pass may be useful for other targets too. 
+// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-type-promotion" + +static cl::opt<bool> +EnableAddressTypePromotion("aarch64-type-promotion", cl::Hidden, + cl::desc("Enable the type promotion pass"), + cl::init(true)); +static cl::opt<bool> +EnableMerge("aarch64-type-promotion-merge", cl::Hidden, + cl::desc("Enable merging of redundant sexts when one is dominating" + " the other."), + cl::init(true)); + +#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion" + +//===----------------------------------------------------------------------===// +// AArch64AddressTypePromotion +//===----------------------------------------------------------------------===// + +namespace llvm { +void initializeAArch64AddressTypePromotionPass(PassRegistry &); +} + +namespace { +class AArch64AddressTypePromotion : public FunctionPass { + +public: + static char ID; + AArch64AddressTypePromotion() + : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) { + initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return AARCH64_TYPE_PROMO_NAME; + } + + /// Iterate over the functions and promote the computation of interesting + // sext instructions. + bool runOnFunction(Function &F) override; + +private: + /// The current function. + Function *Func; + /// Filter out all sexts that does not have this type. + /// Currently initialized with Int64Ty. + Type *ConsideredSExtType; + + // This transformation requires dominator info. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + typedef SmallPtrSet<Instruction *, 32> SetOfInstructions; + typedef SmallVector<Instruction *, 16> Instructions; + typedef DenseMap<Value *, Instructions> ValueToInsts; + + /// Check if it is profitable to move a sext through this instruction. + /// Currently, we consider it is profitable if: + /// - Inst is used only once (no need to insert truncate). + /// - Inst has only one operand that will require a sext operation (we do + /// do not create new sext operation). + bool shouldGetThrough(const Instruction *Inst); + + /// Check if it is possible and legal to move a sext through this + /// instruction. + /// Current heuristic considers that we can get through: + /// - Arithmetic operation marked with the nsw or nuw flag. + /// - Other sext operation. + /// - Truncate operation if it was just dropping sign extended bits. + bool canGetThrough(const Instruction *Inst); + + /// Move sext operations through safe to sext instructions. + bool propagateSignExtension(Instructions &SExtInsts); + + /// Is this sext should be considered for code motion. + /// We look for sext with ConsideredSExtType and uses in at least one + // GetElementPtrInst. 
+ bool shouldConsiderSExt(const Instruction *SExt) const; + + /// Collect all interesting sext operations, i.e., the ones with the right + /// type and used in memory accesses. + /// More precisely, a sext instruction is considered as interesting if it + /// is used in a "complex" getelementptr or it exits at least another + /// sext instruction that sign extended the same initial value. + /// A getelementptr is considered as "complex" if it has more than 2 + // operands. + void analyzeSExtension(Instructions &SExtInsts); + + /// Merge redundant sign extension operations in common dominator. + void mergeSExts(ValueToInsts &ValToSExtendedUses, + SetOfInstructions &ToRemove); +}; +} // end anonymous namespace. + +char AArch64AddressTypePromotion::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", + AARCH64_TYPE_PROMO_NAME, false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", + AARCH64_TYPE_PROMO_NAME, false, false) + +FunctionPass *llvm::createAArch64AddressTypePromotionPass() { + return new AArch64AddressTypePromotion(); +} + +bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) { + if (isa<SExtInst>(Inst)) + return true; + + const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); + if (BinOp && isa<OverflowingBinaryOperator>(BinOp) && + (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap())) + return true; + + // sext(trunc(sext)) --> sext + if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) { + const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0)); + // Check that the truncate just drop sign extended bits. + if (Inst->getType()->getIntegerBitWidth() >= + Opnd->getOperand(0)->getType()->getIntegerBitWidth() && + Inst->getOperand(0)->getType()->getIntegerBitWidth() <= + ConsideredSExtType->getIntegerBitWidth()) + return true; + } + + return false; +} + +bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { + // If the type of the sext is the same as the considered one, this sext + // will become useless. + // Otherwise, we will have to do something to preserve the original value, + // unless it is used once. + if (isa<SExtInst>(Inst) && + (Inst->getType() == ConsideredSExtType || Inst->hasOneUse())) + return true; + + // If the Inst is used more that once, we may need to insert truncate + // operations and we don't do that at the moment. + if (!Inst->hasOneUse()) + return false; + + // This truncate is used only once, thus if we can get thourgh, it will become + // useless. + if (isa<TruncInst>(Inst)) + return true; + + // If both operands are not constant, a new sext will be created here. + // Current heuristic is: each step should be profitable. + // Therefore we don't allow to increase the number of sext even if it may + // be profitable later on. 
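  // For example (illustrative IR; value names invented):
  //   %a = add nsw i32 %b, 7    ; one constant operand: only %b needs to be
  //                             ; sign extended, so no new sext is created
  //   %a = add nsw i32 %b, %c   ; both operands non-constant: getting through
  //                             ; would require a second sext, so we give up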
+ if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1))) + return true; + + return false; +} + +static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { + if (isa<SelectInst>(Inst) && OpIdx == 0) + return false; + return true; +} + +bool +AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { + if (SExt->getType() != ConsideredSExtType) + return false; + + for (const User *U : SExt->users()) { + if (isa<GetElementPtrInst>(U)) + return true; + } + + return false; +} + +// Input: +// - SExtInsts contains all the sext instructions that are used directly in +// GetElementPtrInst, i.e., access to memory. +// Algorithm: +// - For each sext operation in SExtInsts: +// Let var be the operand of sext. +// while it is profitable (see shouldGetThrough), legal, and safe +// (see canGetThrough) to move sext through var's definition: +// * promote the type of var's definition. +// * fold var into sext uses. +// * move sext above var's definition. +// * update sext operand to use the operand of var that should be sign +// extended (by construction there is only one). +// +// E.g., +// a = ... i32 c, 3 +// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a' +// ... +// = b +// => Yes, update the code +// b = sext i32 c to i64 +// a = ... i64 b, 3 +// ... +// = a +// Iterate on 'c'. +bool +AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { + DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); + + bool LocalChange = false; + SetOfInstructions ToRemove; + ValueToInsts ValToSExtendedUses; + while (!SExtInsts.empty()) { + // Get through simple chain. + Instruction *SExt = SExtInsts.pop_back_val(); + + DEBUG(dbgs() << "Consider:\n" << *SExt << '\n'); + + // If this SExt has already been merged continue. + if (SExt->use_empty() && ToRemove.count(SExt)) { + DEBUG(dbgs() << "No uses => marked as delete\n"); + continue; + } + + // Now try to get through the chain of definitions. + while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) { + DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); + if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { + // We cannot get through something that is not an Instruction + // or not safe to SExt. + DEBUG(dbgs() << "Cannot get through\n"); + break; + } + + LocalChange = true; + // If this is a sign extend, it becomes useless. + if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) { + DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n"); + // We cannot use replaceAllUsesWith here because we may trigger some + // assertion on the type as all involved sext operation may have not + // been moved yet. + while (!Inst->use_empty()) { + Use &U = *Inst->use_begin(); + Instruction *User = dyn_cast<Instruction>(U.getUser()); + assert(User && "User of sext is not an Instruction!"); + User->setOperand(U.getOperandNo(), SExt); + } + ToRemove.insert(Inst); + SExt->setOperand(0, Inst->getOperand(0)); + SExt->moveBefore(Inst); + continue; + } + + // Get through the Instruction: + // 1. Update its type. + // 2. Replace the uses of SExt by Inst. + // 3. Sign extend each operand that needs to be sign extended. + + // Step #1. + Inst->mutateType(SExt->getType()); + // Step #2. + SExt->replaceAllUsesWith(Inst); + // Step #3. 
+ Instruction *SExtForOpnd = SExt; + + DEBUG(dbgs() << "Propagate SExt to operands\n"); + for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx; + ++OpIdx) { + DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n'); + if (Inst->getOperand(OpIdx)->getType() == SExt->getType() || + !shouldSExtOperand(Inst, OpIdx)) { + DEBUG(dbgs() << "No need to propagate\n"); + continue; + } + // Check if we can statically sign extend the operand. + Value *Opnd = Inst->getOperand(OpIdx); + if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) { + DEBUG(dbgs() << "Statically sign extend\n"); + Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(), + Cst->getSExtValue())); + continue; + } + // UndefValue are typed, so we have to statically sign extend them. + if (isa<UndefValue>(Opnd)) { + DEBUG(dbgs() << "Statically sign extend\n"); + Inst->setOperand(OpIdx, UndefValue::get(SExt->getType())); + continue; + } + + // Otherwise we have to explicity sign extend it. + assert(SExtForOpnd && + "Only one operand should have been sign extended"); + + SExtForOpnd->setOperand(0, Opnd); + + DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n"); + // Move the sign extension before the insertion point. + SExtForOpnd->moveBefore(Inst); + Inst->setOperand(OpIdx, SExtForOpnd); + // If more sext are required, new instructions will have to be created. + SExtForOpnd = nullptr; + } + if (SExtForOpnd == SExt) { + DEBUG(dbgs() << "Sign extension is useless now\n"); + ToRemove.insert(SExt); + break; + } + } + + // If the use is already of the right type, connect its uses to its argument + // and delete it. + // This can happen for an Instruction all uses of which are sign extended. + if (!ToRemove.count(SExt) && + SExt->getType() == SExt->getOperand(0)->getType()) { + DEBUG(dbgs() << "Sign extension is useless, attach its use to " + "its argument\n"); + SExt->replaceAllUsesWith(SExt->getOperand(0)); + ToRemove.insert(SExt); + } else + ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt); + } + + if (EnableMerge) + mergeSExts(ValToSExtendedUses, ToRemove); + + // Remove all instructions marked as ToRemove. + for (Instruction *I: ToRemove) + I->eraseFromParent(); + return LocalChange; +} + +void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, + SetOfInstructions &ToRemove) { + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + for (auto &Entry : ValToSExtendedUses) { + Instructions &Insts = Entry.second; + Instructions CurPts; + for (Instruction *Inst : Insts) { + if (ToRemove.count(Inst)) + continue; + bool inserted = false; + for (auto &Pt : CurPts) { + if (DT.dominates(Inst, Pt)) { + DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" + << *Inst << '\n'); + Pt->replaceAllUsesWith(Inst); + ToRemove.insert(Pt); + Pt = Inst; + inserted = true; + break; + } + if (!DT.dominates(Pt, Inst)) + // Give up if we need to merge in a common dominator as the + // expermients show it is not profitable. 
+ continue; + + DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n" + << *Pt << '\n'); + Inst->replaceAllUsesWith(Pt); + ToRemove.insert(Inst); + inserted = true; + break; + } + if (!inserted) + CurPts.push_back(Inst); + } + } +} + +void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { + DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n"); + + DenseMap<Value *, Instruction *> SeenChains; + + for (auto &BB : *Func) { + for (auto &II : BB) { + Instruction *SExt = &II; + + // Collect all sext operation per type. + if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt)) + continue; + + DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n'); + + // Cases where we actually perform the optimization: + // 1. SExt is used in a getelementptr with more than 2 operand => + // likely we can merge some computation if they are done on 64 bits. + // 2. The beginning of the SExt chain is SExt several time. => + // code sharing is possible. + + bool insert = false; + // #1. + for (const User *U : SExt->users()) { + const Instruction *Inst = dyn_cast<GetElementPtrInst>(U); + if (Inst && Inst->getNumOperands() > 2) { + DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst + << '\n'); + insert = true; + break; + } + } + + // #2. + // Check the head of the chain. + Instruction *Inst = SExt; + Value *Last; + do { + int OpdIdx = 0; + const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); + if (BinOp && isa<ConstantInt>(BinOp->getOperand(0))) + OpdIdx = 1; + Last = Inst->getOperand(OpdIdx); + Inst = dyn_cast<Instruction>(Last); + } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst)); + + DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n'); + DenseMap<Value *, Instruction *>::iterator AlreadySeen = + SeenChains.find(Last); + if (insert || AlreadySeen != SeenChains.end()) { + DEBUG(dbgs() << "Insert\n"); + SExtInsts.push_back(SExt); + if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) { + DEBUG(dbgs() << "Insert chain member\n"); + SExtInsts.push_back(AlreadySeen->second); + SeenChains[Last] = nullptr; + } + } else { + DEBUG(dbgs() << "Record its chain membership\n"); + SeenChains[Last] = SExt; + } + } + } +} + +bool AArch64AddressTypePromotion::runOnFunction(Function &F) { + if (!EnableAddressTypePromotion || F.isDeclaration()) + return false; + Func = &F; + ConsideredSExtType = Type::getInt64Ty(Func->getContext()); + + DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n'); + + Instructions SExtInsts; + analyzeSExtension(SExtInsts); + return propagateSignExtension(SExtInsts); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp new file mode 100644 index 0000000..1644d71 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -0,0 +1,404 @@ +//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// When profitable, replace GPR targeting i64 instructions with their +// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined +// as minimizing the number of cross-class register copies. 
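// As a rough illustration (editor's sketch; register choices are arbitrary):
// if both inputs of an i64 add already live in FPR64 registers and its result
// is only consumed there, the GPR form
//   fmov x1, d1
//   fmov x2, d2
//   add  x0, x1, x2
//   fmov d0, x0
// can be replaced by the single AdvSIMD scalar "add d0, d1, d2", eliminating
// all three cross-class copies.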
+//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TODO: Graph based predicate heuristics. +// Walking the instruction list linearly will get many, perhaps most, of +// the cases, but to do a truly thorough job of this, we need a more +// wholistic approach. +// +// This optimization is very similar in spirit to the register allocator's +// spill placement, only here we're determining where to place cross-class +// register copies rather than spills. As such, a similar approach is +// called for. +// +// We want to build up a set of graphs of all instructions which are candidates +// for transformation along with instructions which generate their inputs and +// consume their outputs. For each edge in the graph, we assign a weight +// based on whether there is a copy required there (weight zero if not) and +// the block frequency of the block containing the defining or using +// instruction, whichever is less. Our optimization is then a graph problem +// to minimize the total weight of all the graphs, then transform instructions +// and add or remove copy instructions as called for to implement the +// solution. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-simd-scalar" + +// Allow forcing all i64 operations with equivalent SIMD instructions to use +// them. For stress-testing the transformation function. +static cl::opt<bool> +TransformAll("aarch64-simd-scalar-force-all", + cl::desc("Force use of AdvSIMD scalar instructions everywhere"), + cl::init(false), cl::Hidden); + +STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); +STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); +STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); + +namespace llvm { +void initializeAArch64AdvSIMDScalarPass(PassRegistry &); +} + +#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization" + +namespace { +class AArch64AdvSIMDScalar : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; + +private: + // isProfitableToTransform - Predicate function to determine whether an + // instruction should be transformed to its equivalent AdvSIMD scalar + // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. + bool isProfitableToTransform(const MachineInstr *MI) const; + + // transformInstruction - Perform the transformation of an instruction + // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs + // to be the correct register class, minimizing cross-class copies. + void transformInstruction(MachineInstr *MI); + + // processMachineBasicBlock - Main optimzation loop. + bool processMachineBasicBlock(MachineBasicBlock *MBB); + +public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) { + initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { + return AARCH64_ADVSIMD_NAME; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64AdvSIMDScalar::ID = 0; +} // end anonymous namespace + +INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar", + AARCH64_ADVSIMD_NAME, false, false) + +static bool isGPR64(unsigned Reg, unsigned SubReg, + const MachineRegisterInfo *MRI) { + if (SubReg) + return false; + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass); + return AArch64::GPR64RegClass.contains(Reg); +} + +static bool isFPR64(unsigned Reg, unsigned SubReg, + const MachineRegisterInfo *MRI) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) && + SubReg == 0) || + (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) && + SubReg == AArch64::dsub); + // Physical register references just check the register class directly. + return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) || + (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub); +} + +// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 +// copy instruction. Return zero_reg if the instruction is not a copy. +static unsigned getSrcFromCopy(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &SubReg) { + SubReg = 0; + // The "FMOV Xd, Dn" instruction is the typical form. + if (MI->getOpcode() == AArch64::FMOVDXr || + MI->getOpcode() == AArch64::FMOVXDr) + return MI->getOperand(1).getReg(); + // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see + // these at this stage, but it's easy to check for. + if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { + SubReg = AArch64::dsub; + return MI->getOperand(1).getReg(); + } + // Or just a plain COPY instruction. This can be directly to/from FPR64, + // or it can be a dsub subreg reference to an FPR128. + if (MI->getOpcode() == AArch64::COPY) { + if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), + MRI) && + isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) + return MI->getOperand(1).getReg(); + if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), + MRI) && + isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), + MRI)) { + SubReg = MI->getOperand(1).getSubReg(); + return MI->getOperand(1).getReg(); + } + } + + // Otherwise, this is some other kind of instruction. + return 0; +} + +// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent +// that we're considering transforming to, return that AdvSIMD opcode. For all +// others, return the original opcode. +static unsigned getTransformOpcode(unsigned Opc) { + switch (Opc) { + default: + break; + // FIXME: Lots more possibilities. + case AArch64::ADDXrr: + return AArch64::ADDv1i64; + case AArch64::SUBXrr: + return AArch64::SUBv1i64; + case AArch64::ANDXrr: + return AArch64::ANDv8i8; + case AArch64::EORXrr: + return AArch64::EORv8i8; + case AArch64::ORRXrr: + return AArch64::ORRv8i8; + } + // No AdvSIMD equivalent, so just return the original opcode. 
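+  // For example, getTransformOpcode(AArch64::ADDXrr) returns
+  // AArch64::ADDv1i64, the "add Dd, Dn, Dm" form, while an opcode that is not
+  // listed in the switch comes back unchanged, which is how isTransformable()
+  // below detects the untransformable cases.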
+ return Opc; +} + +static bool isTransformable(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + return Opc != getTransformOpcode(Opc); +} + +// isProfitableToTransform - Predicate function to determine whether an +// instruction should be transformed to its equivalent AdvSIMD scalar +// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. +bool +AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { + // If this instruction isn't eligible to be transformed (no SIMD equivalent), + // early exit since that's the common case. + if (!isTransformable(MI)) + return false; + + // Count the number of copies we'll need to add and approximate the number + // of copies that a transform will enable us to remove. + unsigned NumNewCopies = 3; + unsigned NumRemovableCopies = 0; + + unsigned OrigSrc0 = MI->getOperand(1).getReg(); + unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned Src0 = 0, SubReg0; + unsigned Src1 = 0, SubReg1; + if (!MRI->def_empty(OrigSrc0)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc0); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + // If the source was from a copy, we don't need to insert a new copy. + if (Src0) + --NumNewCopies; + // If there are no other users of the original source, we can delete + // that instruction. + if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) + ++NumRemovableCopies; + } + if (!MRI->def_empty(OrigSrc1)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc1); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + if (Src1) + --NumNewCopies; + // If there are no other users of the original source, we can delete + // that instruction. + if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) + ++NumRemovableCopies; + } + + // If any of the uses of the original instructions is a cross class copy, + // that's a copy that will be removable if we transform. Likewise, if + // any of the uses is a transformable instruction, it's likely the tranforms + // will chain, enabling us to save a copy there, too. This is an aggressive + // heuristic that approximates the graph based cost analysis described above. + unsigned Dst = MI->getOperand(0).getReg(); + bool AllUsesAreCopies = true; + for (MachineRegisterInfo::use_instr_nodbg_iterator + Use = MRI->use_instr_nodbg_begin(Dst), + E = MRI->use_instr_nodbg_end(); + Use != E; ++Use) { + unsigned SubReg; + if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) + ++NumRemovableCopies; + // If the use is an INSERT_SUBREG, that's still something that can + // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's + // preferable to have it use the FPR64 in most cases, as if the source + // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely. + // Ditto for a lane insert. + else if (Use->getOpcode() == AArch64::INSERT_SUBREG || + Use->getOpcode() == AArch64::INSvi64gpr) + ; + else + AllUsesAreCopies = false; + } + // If all of the uses of the original destination register are copies to + // FPR64, then we won't end up having a new copy back to GPR64 either. + if (AllUsesAreCopies) + --NumNewCopies; + + // If a transform will not increase the number of cross-class copies required, + // return true. 
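+  // Worked example: for "add x0, x1, x2" where x1 and x2 are each defined by
+  // an FMOV from a D register with no other users, and the only use of x0 is
+  // a copy back into an FPR64, NumNewCopies drops from 3 to 0 and
+  // NumRemovableCopies reaches 3, so the comparison below accepts the
+  // transform.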
+ if (NumNewCopies <= NumRemovableCopies) + return true; + + // Finally, even if we otherwise wouldn't transform, check if we're forcing + // transformation of everything. + return TransformAll; +} + +static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI, + unsigned Dst, unsigned Src, bool IsKill) { + MachineInstrBuilder MIB = + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), + Dst) + .addReg(Src, getKillRegState(IsKill)); + DEBUG(dbgs() << " adding copy: " << *MIB); + ++NumCopiesInserted; + return MIB; +} + +// transformInstruction - Perform the transformation of an instruction +// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs +// to be the correct register class, minimizing cross-class copies. +void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { + DEBUG(dbgs() << "Scalar transform: " << *MI); + + MachineBasicBlock *MBB = MI->getParent(); + unsigned OldOpc = MI->getOpcode(); + unsigned NewOpc = getTransformOpcode(OldOpc); + assert(OldOpc != NewOpc && "transform an instruction to itself?!"); + + // Check if we need a copy for the source registers. + unsigned OrigSrc0 = MI->getOperand(1).getReg(); + unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned Src0 = 0, SubReg0; + unsigned Src1 = 0, SubReg1; + if (!MRI->def_empty(OrigSrc0)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc0); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + // If there are no other users of the original source, we can delete + // that instruction. + if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { + assert(Src0 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } + } + if (!MRI->def_empty(OrigSrc1)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc1); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + // If there are no other users of the original source, we can delete + // that instruction. + if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { + assert(Src1 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } + } + // If we weren't able to reference the original source directly, create a + // copy. + if (!Src0) { + SubReg0 = 0; + Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + insertCopy(TII, MI, Src0, OrigSrc0, true); + } + if (!Src1) { + SubReg1 = 0; + Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + insertCopy(TII, MI, Src1, OrigSrc1, true); + } + + // Create a vreg for the destination. + // FIXME: No need to do this if the ultimate user expects an FPR64. + // Check for that and avoid the copy if possible. + unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + + // For now, all of the new instructions have the same simple three-register + // form, so no need to special case based on what instruction we're + // building. + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) + .addReg(Src0, getKillRegState(true), SubReg0) + .addReg(Src1, getKillRegState(true), SubReg1); + + // Now copy the result back out to a GPR. + // FIXME: Try to avoid this if all uses could actually just use the FPR64 + // directly. + insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); + + // Erase the old instruction. 
+ MI->eraseFromParent(); + + ++NumScalarInsnsUsed; +} + +// processMachineBasicBlock - Main optimization loop. +bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { + bool Changed = false; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr *MI = I; + ++I; + if (isProfitableToTransform(MI)) { + transformInstruction(MI); + Changed = true; + } + } + return Changed; +} + +// runOnMachineFunction - Pass entry point from PassManager. +bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { + bool Changed = false; + DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); + + MRI = &mf.getRegInfo(); + TII = mf.getSubtarget().getInstrInfo(); + + // Just check things on a one-block-at-a-time basis. + for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) + if (processMachineBasicBlock(&*I)) + Changed = true; + return Changed; +} + +// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine +// to add the pass to the PassManager. +FunctionPass *llvm::createAArch64AdvSIMDScalar() { + return new AArch64AdvSIMDScalar(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp new file mode 100644 index 0000000..ada995b --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -0,0 +1,556 @@ +//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to the AArch64 assembly language.
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "AArch64.h" +#include "AArch64MCInstLower.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" +#include "InstPrinter/AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +namespace { + +class AArch64AsmPrinter : public AsmPrinter { + AArch64MCInstLower MCInstLowering; + StackMaps SM; + +public: + AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), + SM(*this), AArch64FI(nullptr) {} + + const char *getPassName() const override { + return "AArch64 Assembly Printer"; + } + + /// \brief Wrapper for MCInstLowering.lowerOperand() for the + /// tblgen'erated pseudo lowering. + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { + return MCInstLowering.lowerOperand(MO, MCOp); + } + + void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + /// \brief tblgen'erated driver function for lowering simple MI->MC + /// pseudo instructions. + bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); + + void EmitInstruction(const MachineInstr *MI) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AsmPrinter::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + + bool runOnMachineFunction(MachineFunction &F) override { + AArch64FI = F.getInfo<AArch64FunctionInfo>(); + return AsmPrinter::runOnMachineFunction(F); + } + +private: + MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); + bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); + bool printAsmRegInClass(const MachineOperand &MO, + const TargetRegisterClass *RC, bool isVector, + raw_ostream &O); + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + + void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + void EmitFunctionBodyEnd() override; + + MCSymbol *GetCPISymbol(unsigned CPID) const override; + void EmitEndOfAsmFile(Module &M) override; + AArch64FunctionInfo *AArch64FI; + + /// \brief Emit the LOHs contained in AArch64FI. 
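+  /// Each hint (e.g. an AdrpLdr kind) refers to the instructions it covers,
+  /// so EmitLOHs() maps every argument back to the temporary label recorded
+  /// in LOHInstToLabel by EmitInstruction() before emitting the directive.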
+ void EmitLOHs(); + + typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol; + MInstToMCSymbol LOHInstToLabel; +}; + +} // end of anonymous namespace + +//===----------------------------------------------------------------------===// + +void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { + const Triple &TT = TM.getTargetTriple(); + if (TT.isOSBinFormatMachO()) { + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + SM.serializeToStackMapSection(); + } +} + +MachineLocation +AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. + if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} + +void AArch64AsmPrinter::EmitLOHs() { + SmallVector<MCSymbol *, 3> MCArgs; + + for (const auto &D : AArch64FI->getLOHContainer()) { + for (const MachineInstr *MI : D.getArgs()) { + MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); + assert(LabelIt != LOHInstToLabel.end() && + "Label hasn't been inserted for LOH related instruction"); + MCArgs.push_back(LabelIt->second); + } + OutStreamer->EmitLOHDirective(D.getKind(), MCArgs); + MCArgs.clear(); + } +} + +void AArch64AsmPrinter::EmitFunctionBodyEnd() { + if (!AArch64FI->getLOHRelated().empty()) + EmitLOHs(); +} + +/// GetCPISymbol - Return the symbol for the specified constant pool entry. +MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const { + // Darwin uses a linker-private symbol name for constant-pools (to + // avoid addends on the relocation?), ELF has no such concept and + // uses a normal private symbol. + if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) + return OutContext.getOrCreateSymbol( + Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + + Twine(getFunctionNumber()) + "_" + Twine(CPID)); + + return OutContext.getOrCreateSymbol( + Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" + + Twine(getFunctionNumber()) + "_" + Twine(CPID)); +} + +void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + switch (MO.getType()) { + default: + llvm_unreachable("<unknown operand type>"); + case MachineOperand::MO_Register: { + unsigned Reg = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + O << AArch64InstPrinter::getRegisterName(Reg); + break; + } + case MachineOperand::MO_Immediate: { + int64_t Imm = MO.getImm(); + O << '#' << Imm; + break; + } + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + MCSymbol *Sym = getSymbol(GV); + + // FIXME: Can we get anything other than a plain symbol here? 
+ assert(!MO.getTargetFlags() && "Unknown operand target flag!"); + + Sym->print(O, MAI); + printOffset(MO.getOffset(), O); + break; + } + } +} + +bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, + raw_ostream &O) { + unsigned Reg = MO.getReg(); + switch (Mode) { + default: + return true; // Unknown mode. + case 'w': + Reg = getWRegFromXReg(Reg); + break; + case 'x': + Reg = getXRegFromWReg(Reg); + break; + } + + O << AArch64InstPrinter::getRegisterName(Reg); + return false; +} + +// Prints the register in MO using class RC using the offset in the +// new register class. This should not be used for cross class +// printing. +bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, + const TargetRegisterClass *RC, + bool isVector, raw_ostream &O) { + assert(MO.isReg() && "Should only get here with a register!"); + const AArch64RegisterInfo *RI = + MF->getSubtarget<AArch64Subtarget>().getRegisterInfo(); + unsigned Reg = MO.getReg(); + unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); + assert(RI->regsOverlap(RegToPrint, Reg)); + O << AArch64InstPrinter::getRegisterName( + RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName); + return false; +} + +bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + + // First try the generic code, which knows about modifiers like 'c' and 'n'. + if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O)) + return false; + + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + return true; // Unknown modifier. + case 'w': // Print W register + case 'x': // Print X register + if (MO.isReg()) + return printAsmMRegister(MO, ExtraCode[0], O); + if (MO.isImm() && MO.getImm() == 0) { + unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR; + O << AArch64InstPrinter::getRegisterName(Reg); + return false; + } + printOperand(MI, OpNum, O); + return false; + case 'b': // Print B register. + case 'h': // Print H register. + case 's': // Print S register. + case 'd': // Print D register. + case 'q': // Print Q register. + if (MO.isReg()) { + const TargetRegisterClass *RC; + switch (ExtraCode[0]) { + case 'b': + RC = &AArch64::FPR8RegClass; + break; + case 'h': + RC = &AArch64::FPR16RegClass; + break; + case 's': + RC = &AArch64::FPR32RegClass; + break; + case 'd': + RC = &AArch64::FPR64RegClass; + break; + case 'q': + RC = &AArch64::FPR128RegClass; + break; + default: + return true; + } + return printAsmRegInClass(MO, RC, false /* vector */, O); + } + printOperand(MI, OpNum, O); + return false; + } + } + + // According to ARM, we should emit x and v registers unless we have a + // modifier. + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + + // If this is a w or x register, print an x register. + if (AArch64::GPR32allRegClass.contains(Reg) || + AArch64::GPR64allRegClass.contains(Reg)) + return printAsmMRegister(MO, 'x', O); + + // If this is a b, h, s, d, or q register, print it as a v register. 
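+  // For example, in inline assembly "%w0" prints the W view of a GPR operand
+  // and "%s0" or "%d0" the S or D view of an FPR operand; an unmodified "%0"
+  // takes this default path, printing X names for GPRs and, via the return
+  // below, V names for FPRs.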
+ return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */, + O); + } + + printOperand(MI, OpNum, O); + return false; +} + +bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isReg() && "unexpected inline asm memory operand"); + O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]"; + return false; +} + +void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &OS) { + unsigned NOps = MI->getNumOperands(); + assert(NOps == 4); + OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + OS << cast<DILocalVariable>(MI->getOperand(NOps - 2).getMetadata()) + ->getName(); + OS << " <- "; + // Frame address. Currently handles register +- offset only. + assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + OS << '['; + printOperand(MI, 0, OS); + OS << '+'; + printOperand(MI, 1, OS); + OS << ']'; + OS << "+"; + printOperand(MI, NOps - 2, OS); +} + +void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + unsigned NumNOPBytes = MI.getOperand(1).getImm(); + + SM.recordStackMap(MI); + assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + + // Scan ahead to trim the shadow. + const MachineBasicBlock &MBB = *MI.getParent(); + MachineBasicBlock::const_iterator MII(MI); + ++MII; + while (NumNOPBytes > 0) { + if (MII == MBB.end() || MII->isCall() || + MII->getOpcode() == AArch64::DBG_VALUE || + MII->getOpcode() == TargetOpcode::PATCHPOINT || + MII->getOpcode() == TargetOpcode::STACKMAP) + break; + ++MII; + NumNOPBytes -= 4; + } + + // Emit nops. + for (unsigned i = 0; i < NumNOPBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); +} + +// Lower a patchpoint of the form: +// [<def>], <id>, <numBytes>, <target>, <numArgs> +void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + SM.recordPatchPoint(MI); + + PatchPointOpers Opers(&MI); + + int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); + unsigned EncodedBytes = 0; + if (CallTarget) { + assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && + "High 16 bits of call target should be zero."); + unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + EncodedBytes = 16; + // Materialize the jump address: + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi) + .addReg(ScratchReg) + .addImm((CallTarget >> 32) & 0xFFFF) + .addImm(32)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm((CallTarget >> 16) & 0xFFFF) + .addImm(16)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(CallTarget & 0xFFFF) + .addImm(0)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg)); + } + // Emit padding. 
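+  // Worked example: a patchpoint declared with <numBytes> = 32 and a non-zero
+  // call target spends EncodedBytes = 16 on the MOVZ/MOVK/MOVK/BLR sequence
+  // above, so the loop below emits (32 - 16) / 4 = 4 NOPs (HINT #0) of
+  // padding.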
+ unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + assert(NumBytes >= EncodedBytes && + "Patchpoint can't request size less than the length of a call."); + assert((NumBytes - EncodedBytes) % 4 == 0 && + "Invalid number of NOP bytes requested!"); + for (unsigned i = EncodedBytes; i < NumBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); +} + +// Simple pseudo-instructions have their lowering (with expansion to real +// instructions) auto-generated. +#include "AArch64GenMCPseudoLowering.inc" + +void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { + // Do any auto-generated pseudo lowerings. + if (emitPseudoExpansionLowering(*OutStreamer, MI)) + return; + + if (AArch64FI->getLOHRelated().count(MI)) { + // Generate a label for LOH related instruction + MCSymbol *LOHLabel = createTempSymbol("loh"); + // Associate the instruction with the label + LOHInstToLabel[MI] = LOHLabel; + OutStreamer->EmitLabel(LOHLabel); + } + + // Do any manual lowerings. + switch (MI->getOpcode()) { + default: + break; + case AArch64::DBG_VALUE: { + if (isVerbose() && OutStreamer->hasRawTextSupport()) { + SmallString<128> TmpStr; + raw_svector_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer->EmitRawText(StringRef(OS.str())); + } + return; + } + + // Tail calls use pseudo instructions so they have the proper code-gen + // attributes (isCall, isReturn, etc.). We lower them to the real + // instruction here. + case AArch64::TCRETURNri: { + MCInst TmpInst; + TmpInst.setOpcode(AArch64::BR); + TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case AArch64::TCRETURNdi: { + MCOperand Dest; + MCInstLowering.lowerOperand(MI->getOperand(0), Dest); + MCInst TmpInst; + TmpInst.setOpcode(AArch64::B); + TmpInst.addOperand(Dest); + EmitToStreamer(*OutStreamer, TmpInst); + return; + } + case AArch64::TLSDESC_CALLSEQ: { + /// lower this to: + /// adrp x0, :tlsdesc:var + /// ldr x1, [x0, #:tlsdesc_lo12:var] + /// add x0, x0, #:tlsdesc_lo12:var + /// .tlsdesccall var + /// blr x1 + /// (TPIDR_EL0 offset now in x0) + const MachineOperand &MO_Sym = MI->getOperand(0); + MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym); + MCOperand Sym, SymTLSDescLo12, SymTLSDesc; + MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE); + MCInstLowering.lowerOperand(MO_Sym, Sym); + MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12); + MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc); + + MCInst Adrp; + Adrp.setOpcode(AArch64::ADRP); + Adrp.addOperand(MCOperand::createReg(AArch64::X0)); + Adrp.addOperand(SymTLSDesc); + EmitToStreamer(*OutStreamer, Adrp); + + MCInst Ldr; + Ldr.setOpcode(AArch64::LDRXui); + Ldr.addOperand(MCOperand::createReg(AArch64::X1)); + Ldr.addOperand(MCOperand::createReg(AArch64::X0)); + Ldr.addOperand(SymTLSDescLo12); + Ldr.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Ldr); + + MCInst Add; + Add.setOpcode(AArch64::ADDXri); + Add.addOperand(MCOperand::createReg(AArch64::X0)); + Add.addOperand(MCOperand::createReg(AArch64::X0)); + Add.addOperand(SymTLSDescLo12); + Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0))); + EmitToStreamer(*OutStreamer, Add); + + // Emit a relocation-annotation. This expands to no code, but requests + // the following instruction gets an R_AARCH64_TLSDESC_CALL. 
+ MCInst TLSDescCall; + TLSDescCall.setOpcode(AArch64::TLSDESCCALL); + TLSDescCall.addOperand(Sym); + EmitToStreamer(*OutStreamer, TLSDescCall); + + MCInst Blr; + Blr.setOpcode(AArch64::BLR); + Blr.addOperand(MCOperand::createReg(AArch64::X1)); + EmitToStreamer(*OutStreamer, Blr); + + return; + } + + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(*OutStreamer, SM, *MI); + + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(*OutStreamer, SM, *MI); + } + + // Finally, do the automated lowerings for everything else. + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); +} + +// Force static initialization. +extern "C" void LLVMInitializeAArch64AsmPrinter() { + RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget); + RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget); + RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64Target); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp new file mode 100644 index 0000000..a614f55 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -0,0 +1,520 @@ +//===-- AArch64BranchRelaxation.cpp - AArch64 branch relaxation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-branch-relax" + +static cl::opt<bool> +BranchRelaxation("aarch64-branch-relax", cl::Hidden, cl::init(true), + cl::desc("Relax out of range conditional branches")); + +static cl::opt<unsigned> +TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), + cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); + +static cl::opt<unsigned> +CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), + cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); + +static cl::opt<unsigned> +BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), + cl::desc("Restrict range of Bcc instructions (DEBUG)")); + +STATISTIC(NumSplit, "Number of basic blocks split"); +STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); + +namespace llvm { +void initializeAArch64BranchRelaxationPass(PassRegistry &); +} + +#define AARCH64_BR_RELAX_NAME "AArch64 branch relaxation pass" + +namespace { +class AArch64BranchRelaxation : public MachineFunctionPass { + /// BasicBlockInfo - Information about the offset and size of a single + /// basic block. + struct BasicBlockInfo { + /// Offset - Distance from the beginning of the function to the beginning + /// of this basic block. + /// + /// The offset is always aligned as required by the basic block. + unsigned Offset; + + /// Size - Size of the basic block in bytes. 
If the block contains + /// inline assembly, this is a worst case estimate. + /// + /// The size does not include any alignment padding whether from the + /// beginning of the block, or from an aligned jump table at the end. + unsigned Size; + + BasicBlockInfo() : Offset(0), Size(0) {} + + /// Compute the offset immediately following this block. If LogAlign is + /// specified, return the offset the successor block will get if it has + /// this alignment. + unsigned postOffset(unsigned LogAlign = 0) const { + unsigned PO = Offset + Size; + unsigned Align = 1 << LogAlign; + return (PO + Align - 1) / Align * Align; + } + }; + + SmallVector<BasicBlockInfo, 16> BlockInfo; + + MachineFunction *MF; + const AArch64InstrInfo *TII; + + bool relaxBranchInstructions(); + void scanFunction(); + MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); + void adjustBlockOffsets(MachineBasicBlock &MBB); + bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); + bool fixupConditionalBranch(MachineInstr *MI); + void computeBlockSize(const MachineBasicBlock &MBB); + unsigned getInstrOffset(MachineInstr *MI) const; + void dumpBBs(); + void verify(); + +public: + static char ID; + AArch64BranchRelaxation() : MachineFunctionPass(ID) { + initializeAArch64BranchRelaxationPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return AARCH64_BR_RELAX_NAME; + } +}; +char AArch64BranchRelaxation::ID = 0; +} + +INITIALIZE_PASS(AArch64BranchRelaxation, "aarch64-branch-relax", + AARCH64_BR_RELAX_NAME, false, false) + +/// verify - check BBOffsets, BBSizes, alignment of islands +void AArch64BranchRelaxation::verify() { +#ifndef NDEBUG + unsigned PrevNum = MF->begin()->getNumber(); + for (MachineBasicBlock &MBB : *MF) { + unsigned Align = MBB.getAlignment(); + unsigned Num = MBB.getNumber(); + assert(BlockInfo[Num].Offset % (1u << Align) == 0); + assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset); + PrevNum = Num; + } +#endif +} + +/// print block size and offset information - debugging +void AArch64BranchRelaxation::dumpBBs() { + for (auto &MBB : *MF) { + const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; + dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) + << format("size=%#x\n", BBI.Size); + } +} + +/// BBHasFallthrough - Return true if the specified basic block can fallthrough +/// into the block immediately after it. +static bool BBHasFallthrough(MachineBasicBlock *MBB) { + // Get the next machine basic block in the function. + MachineFunction::iterator MBBI(MBB); + // Can't fall off end of function. + auto NextBB = std::next(MBBI); + if (NextBB == MBB->getParent()->end()) + return false; + + for (MachineBasicBlock *S : MBB->successors()) + if (S == &*NextBB) + return true; + + return false; +} + +/// scanFunction - Do the initial scan of the function, building up +/// information about each block. +void AArch64BranchRelaxation::scanFunction() { + BlockInfo.clear(); + BlockInfo.resize(MF->getNumBlockIDs()); + + // First thing, compute the size of all basic blocks, and see if the function + // has any inline assembly in it. If so, we have to be conservative about + // alignment assumptions, as we don't know for sure the size of any + // instructions in the inline assembly. + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(MBB); + + // Compute block offsets and known bits. 
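+  // adjustBlockOffsets() rounds each block's start up using postOffset(); for
+  // example, a predecessor ending at offset 36 yields postOffset() == 36 for
+  // an unaligned successor but postOffset(4) == 48 for one requesting 16-byte
+  // (2^4) alignment, per (PO + Align - 1) / Align * Align.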
+ adjustBlockOffsets(*MF->begin()); +} + +/// computeBlockSize - Compute the size for MBB. +/// This function updates BlockInfo directly. +void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { + unsigned Size = 0; + for (const MachineInstr &MI : MBB) + Size += TII->GetInstSizeInBytes(&MI); + BlockInfo[MBB.getNumber()].Size = Size; +} + +/// getInstrOffset - Return the current offset of the specified machine +/// instruction from the start of the function. This offset changes as stuff is +/// moved around inside the function. +unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { + MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BlockInfo[MBB->getNumber()].Offset; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + Offset += TII->GetInstSizeInBytes(I); + } + return Offset; +} + +void AArch64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { + unsigned PrevNum = Start.getNumber(); + for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) { + unsigned Num = MBB.getNumber(); + if (!Num) // block zero is never changed from offset zero. + continue; + // Get the offset and known bits at the end of the layout predecessor. + // Include the alignment of the current block. + unsigned LogAlign = MBB.getAlignment(); + BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign); + PrevNum = Num; + } +} + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +/// NOTE: Successor list of the original BB is out of date after this function, +/// and must be updated by the caller! Other transforms follow using this +/// utility function, so no point updating now rather than waiting. +MachineBasicBlock * +AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { + MachineBasicBlock *OrigBB = MI->getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MF->insert(++OrigBB->getIterator(), NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::B)).addMBB(NewBB); + + // Insert an entry into BlockInfo to align it properly with the block numbers. + BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. (It should be possible to do this without + // recounting everything, but it's very confusing, and this is rarely + // executed.) + computeBlockSize(*OrigBB); + + // Figure out how large the NewMBB is. As the second half of the original + // block, it may contain a tablejump. 
+ computeBlockSize(*NewBB); + + // All BBOffsets following these blocks must be modified. + adjustBlockOffsets(*OrigBB); + + ++NumSplit; + + return NewBB; +} + +/// isBlockInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. +bool AArch64BranchRelaxation::isBlockInRange(MachineInstr *MI, + MachineBasicBlock *DestBB, + unsigned Bits) { + unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2; + unsigned BrOffset = getInstrOffset(MI); + unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset; + + DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " max delta=" << MaxOffs << " from " << getInstrOffset(MI) + << " to " << DestOffset << " offset " + << int(DestOffset - BrOffset) << "\t" << *MI); + + // Branch before the Dest. + if (BrOffset <= DestOffset) + return (DestOffset - BrOffset <= MaxOffs); + return (BrOffset - DestOffset <= MaxOffs); +} + +static bool isConditionalBranch(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: + return true; + } +} + +static MachineBasicBlock *getDestBlock(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("unexpected opcode!"); + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + return MI->getOperand(2).getMBB(); + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: + return MI->getOperand(1).getMBB(); + } +} + +static unsigned getOppositeConditionOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("unexpected opcode!"); + case AArch64::TBNZW: return AArch64::TBZW; + case AArch64::TBNZX: return AArch64::TBZX; + case AArch64::TBZW: return AArch64::TBNZW; + case AArch64::TBZX: return AArch64::TBNZX; + case AArch64::CBNZW: return AArch64::CBZW; + case AArch64::CBNZX: return AArch64::CBZX; + case AArch64::CBZW: return AArch64::CBNZW; + case AArch64::CBZX: return AArch64::CBNZX; + case AArch64::Bcc: return AArch64::Bcc; // Condition is an operand for Bcc. + } +} + +static unsigned getBranchDisplacementBits(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("unexpected opcode!"); + case AArch64::TBNZW: + case AArch64::TBZW: + case AArch64::TBNZX: + case AArch64::TBZX: + return TBZDisplacementBits; + case AArch64::CBNZW: + case AArch64::CBZW: + case AArch64::CBNZX: + case AArch64::CBZX: + return CBZDisplacementBits; + case AArch64::Bcc: + return BCCDisplacementBits; + } +} + +static inline void invertBccCondition(MachineInstr *MI) { + assert(MI->getOpcode() == AArch64::Bcc && "Unexpected opcode!"); + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(0).getImm(); + CC = AArch64CC::getInvertedCondCode(CC); + MI->getOperand(0).setImm((int64_t)CC); +} + +/// fixupConditionalBranch - Fix up a conditional branch whose destination is +/// too far away to fit in its displacement field. It is converted to an inverse +/// conditional branch + an unconditional branch to the destination. 
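+/// With the default displacement widths, isBlockInRange() allows roughly
+/// +/-32 KiB for TB[N]Z (14 bits, scaled by 4: ((1 << 13) - 1) << 2 = 32764
+/// bytes) and roughly +/-1 MiB for CB[N]Z and Bcc (19 bits); this routine is
+/// only reached once that check has failed.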
+bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { + MachineBasicBlock *DestBB = getDestBlock(MI); + + // Add an unconditional branch to the destination and invert the branch + // condition to jump over it: + // tbz L1 + // => + // tbnz L2 + // b L1 + // L2: + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through block. Otherwise, + // split the MBB before the next instruction. + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); + + if (BMI != MI) { + if (std::next(MachineBasicBlock::iterator(MI)) == + std::prev(MBB->getLastNonDebugInstr()) && + BMI->getOpcode() == AArch64::B) { + // Last MI in the BB is an unconditional branch. Can we simply invert the + // condition and swap destinations: + // beq L1 + // b L2 + // => + // bne L2 + // b L1 + MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); + if (isBlockInRange(MI, NewDest, + getBranchDisplacementBits(MI->getOpcode()))) { + DEBUG(dbgs() << " Invert condition and swap its destination with " + << *BMI); + BMI->getOperand(0).setMBB(DestBB); + unsigned OpNum = (MI->getOpcode() == AArch64::TBZW || + MI->getOpcode() == AArch64::TBNZW || + MI->getOpcode() == AArch64::TBZX || + MI->getOpcode() == AArch64::TBNZX) + ? 2 + : 1; + MI->getOperand(OpNum).setMBB(NewDest); + MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode()))); + if (MI->getOpcode() == AArch64::Bcc) + invertBccCondition(MI); + return true; + } + } + } + + if (NeedSplit) { + // Analyze the branch so we know how to update the successor lists. + MachineBasicBlock *TBB, *FBB; + SmallVector<MachineOperand, 2> Cond; + TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); + + MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); + // No need for the branch to the next block. We're adding an unconditional + // branch to the destination. + int delta = TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size -= delta; + MBB->back().eraseFromParent(); + // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below + + // Update the successor lists according to the transformation to follow. + // Do it here since if there's no split, no update is needed. + MBB->replaceSuccessor(FBB, NewBB); + NewBB->addSuccessor(FBB); + } + MachineBasicBlock *NextBB = &*std::next(MachineFunction::iterator(MBB)); + + DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() + << ", invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"); + + // Insert a new conditional branch and a new unconditional branch. + MachineInstrBuilder MIB = BuildMI( + MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode()))) + .addOperand(MI->getOperand(0)); + if (MI->getOpcode() == AArch64::TBZW || MI->getOpcode() == AArch64::TBNZW || + MI->getOpcode() == AArch64::TBZX || MI->getOpcode() == AArch64::TBNZX) + MIB.addOperand(MI->getOperand(1)); + if (MI->getOpcode() == AArch64::Bcc) + invertBccCondition(MIB); + MIB.addMBB(NextBB); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + + // Remove the old conditional branch. It may or may not still be in MBB. + BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); + MI->eraseFromParent(); + + // Finally, keep the block offsets up to date. 
+ adjustBlockOffsets(*MBB); + return true; +} + +bool AArch64BranchRelaxation::relaxBranchInstructions() { + bool Changed = false; + // Relaxing branches involves creating new basic blocks, so re-eval + // end() for termination. + for (auto &MBB : *MF) { + MachineInstr *MI = MBB.getFirstTerminator(); + if (isConditionalBranch(MI->getOpcode()) && + !isBlockInRange(MI, getDestBlock(MI), + getBranchDisplacementBits(MI->getOpcode()))) { + fixupConditionalBranch(MI); + ++NumRelaxed; + Changed = true; + } + } + return Changed; +} + +bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + + // If the pass is disabled, just bail early. + if (!BranchRelaxation) + return false; + + DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n"); + + TII = (const AArch64InstrInfo *)MF->getSubtarget().getInstrInfo(); + + // Renumber all of the machine basic blocks in the function, guaranteeing that + // the numbers agree with the position of the block in the function. + MF->RenumberBlocks(); + + // Do the initial scan of the function, building up information about the + // sizes of each block. + scanFunction(); + + DEBUG(dbgs() << " Basic blocks before relaxation\n"); + DEBUG(dumpBBs()); + + bool MadeChange = false; + while (relaxBranchInstructions()) + MadeChange = true; + + // After a while, this might be made debug-only, but it is not expensive. + verify(); + + DEBUG(dbgs() << " Basic blocks after relaxation\n"); + DEBUG(dbgs() << '\n'; dumpBBs()); + + BlockInfo.clear(); + + return MadeChange; +} + +/// createAArch64BranchRelaxation - returns an instance of the constpool +/// island pass. +FunctionPass *llvm::createAArch64BranchRelaxation() { + return new AArch64BranchRelaxation(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h new file mode 100644 index 0000000..bc44bc5 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -0,0 +1,139 @@ +//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the AArch64 Calling Convention +// that aren't done by tablegen. 
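+// Specifically, the CC_AArch64_Custom_Block and CC_AArch64_Custom_Stack_Block
+// handlers below, which the generated calling conventions invoke through
+// CCCustom to place an [N x Ty] block in consecutive registers or on the
+// stack.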
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace { +using namespace llvm; + +static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; + +static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, + MVT LocVT, ISD::ArgFlagsTy &ArgFlags, + CCState &State, unsigned SlotAlign) { + unsigned Size = LocVT.getSizeInBits() / 8; + unsigned StackAlign = + State.getMachineFunction().getDataLayout().getStackAlignment(); + unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); + + for (auto &It : PendingMembers) { + It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign))); + State.addLoc(It); + SlotAlign = 1; + } + + // All pending members have now been allocated + PendingMembers.clear(); + return true; +} + +/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An +/// [N x Ty] type must still be contiguous in memory though. +static bool CC_AArch64_Custom_Stack_Block( + unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // Add the argument to the list to be allocated once we know the size of the + // block. + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8); +} + +/// Given an [N x Ty] block, it should be passed in a consecutive sequence of +/// registers. If no such sequence is available, mark the rest of the registers +/// of that type as used and place the argument on the stack. +static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + // Try to allocate a contiguous block of registers, each of the correct + // size to hold one member. + ArrayRef<MCPhysReg> RegList; + if (LocVT.SimpleTy == MVT::i64) + RegList = XRegList; + else if (LocVT.SimpleTy == MVT::f16) + RegList = HRegList; + else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector()) + RegList = SRegList; + else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector()) + RegList = DRegList; + else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector()) + RegList = QRegList; + else { + // Not an array we want to split up after all. 
+ return false; + } + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // Add the argument to the list to be allocated once we know the size of the + // block. + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); + if (RegResult) { + for (auto &It : PendingMembers) { + It.convertToReg(RegResult); + State.addLoc(It); + ++RegResult; + } + PendingMembers.clear(); + return true; + } + + // Mark all regs in the class as unavailable + for (auto Reg : RegList) + State.AllocateReg(Reg); + + const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( + State.getMachineFunction().getSubtarget()); + unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; + + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); +} + +} + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td new file mode 100644 index 0000000..388d64e --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -0,0 +1,312 @@ +//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for AArch64 architecture. +// +//===----------------------------------------------------------------------===// + +/// CCIfAlign - Match of the original alignment of the arg +class CCIfAlign<string Align, CCAction A> : + CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; +/// CCIfBigEndian - Match only if we're in big endian mode. +class CCIfBigEndian<CCAction A> : + CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS64 Calling Convention +//===----------------------------------------------------------------------===// + +def CC_AArch64_AAPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType<v2i32>>, + CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8], + CCBitConvertToType<f64>>>, + CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8], + CCBitConvertToType<f128>>>, + + // An SRet is passed in X8, not X0 like a normal pointer parameter. + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal<CCPassByVal<8, 8>>, + + // The 'nest' parameter, if any, is passed in X18. + // Darwin uses X18 as the platform register and hence 'nest' isn't currently + // supported there. + CCIfNest<CCAssignToReg<[X18]>>, + + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. 
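+  // For example, f(int a, double d, int b) assigns a to W0, d to D0 and b to
+  // W1: the GPR and FPR counters advance independently, and each assignment
+  // shadows the overlapping registers listed next to it.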
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6], + [X0, X1, X3, X5]>>>, + + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>, + CCIfType<[i32, f32], CCAssignToStack<8, 8>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], + CCAssignToStack<8, 8>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToStack<16, 16>> +]>; + +def RetCC_AArch64_AAPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType<v2i32>>, + CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8], + CCBitConvertToType<f64>>>, + CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8], + CCBitConvertToType<f128>>>, + + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + + +// Darwin uses a calling convention which differs in only two ways +// from the standard one at this level: +// + i128s (i.e. split i64s) don't need even registers. +// + Stack slots are sized as needed rather than being at least 64-bit. +def CC_AArch64_DarwinPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType<v2i32>>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>, + + // An SRet is passed in X8, not X0 like a normal pointer parameter. + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. 
+ CCIfByVal<CCPassByVal<8, 8>>, + + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], + CCIfSplit<CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6], + [W0, W1, W2, W3, W4, W5, W6]>>>, + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, + CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToStack<16, 16>> +]>; + +def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType<v2i32>>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>, + + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>, + + // Handle all scalar types as either i64 or f64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + CCIfType<[f16, f32], CCPromoteToType<f64>>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToStack<16, 16>> +]>; + +// The WebKit_JS calling convention only passes the first argument (the callee) +// in register and the remaining arguments on stack. We allow 32bit stack slots, +// so that WebKit can write partial values in the stack and define the other +// 32bit quantity as undef. +def CC_AArch64_WebKit_JS : CallingConv<[ + // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, + + // Pass the remaining arguments on the stack instead. 
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +def RetCC_AArch64_WebKit_JS : CallingConv<[ + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM64 Calling Convention for GHC +//===----------------------------------------------------------------------===// + +// This calling convention is specific to the Glasgow Haskell Compiler. +// The only documentation is the GHC source code, specifically the C header +// file: +// +// https://github.com/ghc/ghc/blob/master/includes/stg/MachRegs.h +// +// which defines the registers for the Spineless Tagless G-Machine (STG) that +// GHC uses to implement lazy evaluation. The generic STG machine has a set of +// registers which are mapped to appropriate set of architecture specific +// registers for each CPU architecture. +// +// The STG Machine is documented here: +// +// https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/GeneratedCode +// +// The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI +// register mapping". + +def CC_AArch64_GHC : CallingConv<[ + // Handle all vector types as either f64 or v2f64. + CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType<v2f64>>, + + CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>, + CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>, + CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>, + + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim + CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>> +]>; + +// FIXME: LR is only callee-saved in the sense that *we* preserve it and are +// presumably a callee to someone. External functions may not do so, but this +// is currently safe since BL has LR as an implicit-def and what happens after a +// tail call doesn't matter. +// +// It would be better to model its preservation semantics properly (create a +// vreg on entry, use it in RET & tail call generation; make that vreg def if we +// end up saving LR as part of a call frame). Watch this space... +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since +// 'this' and the pointer return value are both passed in X0 in these cases, +// this can be partially modelled by treating X0 as a callee-saved register; +// only the resulting RegMask is used; the SaveList is ignored +// +// (For generic ARM 64-bit ABI code, clang will not generate constructors or +// destructors with 'this' returns, so this RegMask will not be used in that +// case) +def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// guarantees more than a normal AAPCS function. 
x16 and x17 are used on the +// fast path for calculation, but other registers except X0 (argument/return) +// and LR (it is a call, after all) are preserved. +def CSR_AArch64_TLS_Darwin + : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), + FP, + (sequence "Q%u", 0, 31))>; + +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_AArch64_TLS_Darwin, +// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. +def CSR_AArch64_CXX_TLS_Darwin + : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_AArch64_CXX_TLS_Darwin_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_AArch64_CXX_TLS_Darwin_ViaCopy + : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + +// The ELF stub used for TLS-descriptor access saves every feasible +// register. Only X0 and LR are clobbered. +def CSR_AArch64_TLS_ELF + : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, + (sequence "Q%u", 0, 31))>; + +def CSR_AArch64_AllRegs + : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, + (sequence "X%u", 0, 28), FP, LR, SP, + (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), + (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), + (sequence "Q%u", 0, 31))>; + +def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp new file mode 100644 index 0000000..9310ac4 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -0,0 +1,143 @@ +//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Local-dynamic access to thread-local variables proceeds in three stages. +// +// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated +// in much the same way as a general-dynamic TLS-descriptor access against +// the special symbol _TLS_MODULE_BASE. +// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using +// instructions with "dtprel" modifiers. +// 3. These two are added, together with TPIDR_EL0, to obtain the variable's +// true address. +// +// This is only better than general-dynamic access to the variable if two or +// more of the first stage TLS-descriptor calculations can be combined. This +// pass looks through a function and performs such combinations. 
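To make the three stages above concrete, here is a minimal source-level sketch (the names are invented, and the exact TLS model chosen depends on compiler flags): when a shared object is built with -fPIC and the local-dynamic model (for example via -ftls-model=local-dynamic), both accesses below need the module's TLS base from stage 1. Before this pass each access issues its own _TLS_MODULE_BASE_ descriptor call sequence; after it, the base computed for the first access is kept in a virtual register and copied for the second.

static __thread int tls_a;   // hypothetical thread-local data
static __thread int tls_b;

int sum_thread_locals(void) {
  // Two local-dynamic accesses: only one TLSDESC_CALLSEQ for
  // _TLS_MODULE_BASE_ should remain once this pass has run.
  return tls_a + tls_b;
}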
+// +//===----------------------------------------------------------------------===// +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + +namespace { +struct LDTLSCleanup : public MachineFunctionPass { + static char ID; + LDTLSCleanup() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + if (AFI->getNumLocalDynamicTLSAccesses() < 2) { + // No point folding accesses if there isn't at least two. + return false; + } + + MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + return VisitNode(DT->getRootNode(), 0); + } + + // Visit the dominator subtree rooted at Node in pre-order. + // If TLSBaseAddrReg is non-null, then use that to replace any + // TLS_base_addr instructions. Otherwise, create the register + // when the first such instruction is seen, and then use it + // as we encounter more instructions. + bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { + MachineBasicBlock *BB = Node->getBlock(); + bool Changed = false; + + // Traverse the current block. + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + ++I) { + switch (I->getOpcode()) { + case AArch64::TLSDESC_CALLSEQ: + // Make sure it's a local dynamic access. + if (!I->getOperand(0).isSymbol() || + strcmp(I->getOperand(0).getSymbolName(), "_TLS_MODULE_BASE_")) + break; + + if (TLSBaseAddrReg) + I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); + else + I = setRegister(I, &TLSBaseAddrReg); + Changed = true; + break; + default: + break; + } + } + + // Visit the children of this block in the dominator tree. + for (MachineDomTreeNode *N : *Node) { + Changed |= VisitNode(N, TLSBaseAddrReg); + } + + return Changed; + } + + // Replace the TLS_base_addr instruction I with a copy from + // TLSBaseAddrReg, returning the new instruction. + MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, + unsigned TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + + // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the + // code sequence assumes the address will be. + MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + AArch64::X0).addReg(TLSBaseAddrReg); + + // Erase the TLS_base_addr instruction. + I->eraseFromParent(); + + return Copy; + } + + // Create a virtal register in *TLSBaseAddrReg, and populate it by + // inserting a copy instruction after I. Returns the new instruction. + MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + + // Create a virtual register for the TLS base address. + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); + + // Insert a copy from X0 to TLSBaseAddrReg for later. 
+ MachineInstr *Copy = + BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(), + TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) + .addReg(AArch64::X0); + + return Copy; + } + + const char *getPassName() const override { + return "Local Dynamic TLS Access Clean-up"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} + +char LDTLSCleanup::ID = 0; +FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() { + return new LDTLSCleanup(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp new file mode 100644 index 0000000..78c239b --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -0,0 +1,1109 @@ +//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that collect the Linker Optimization Hint (LOH). +// This pass should be run at the very end of the compilation flow, just before +// assembly printer. +// To be useful for the linker, the LOH must be printed into the assembly file. +// +// A LOH describes a sequence of instructions that may be optimized by the +// linker. +// This same sequence cannot be optimized by the compiler because some of +// the information will be known at link time. +// For instance, consider the following sequence: +// L1: adrp xA, sym@PAGE +// L2: add xB, xA, sym@PAGEOFF +// L3: ldr xC, [xB, #imm] +// This sequence can be turned into: +// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB: +// L3: ldr xC, sym+#imm +// It may also be turned into either the following more efficient +// code sequences: +// - If sym@PAGEOFF + #imm fits the encoding space of L3. +// L1: adrp xA, sym@PAGE +// L3: ldr xC, [xB, sym@PAGEOFF + #imm] +// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB: +// L1: adr xA, sym +// L3: ldr xC, [xB, #imm] +// +// To be valid a LOH must meet all the requirements needed by all the related +// possible linker transformations. +// For instance, using the running example, the constraints to emit +// ".loh AdrpAddLdr" are: +// - L1, L2, and L3 instructions are of the expected type, i.e., +// respectively ADRP, ADD (immediate), and LD. +// - The result of L1 is used only by L2. +// - The register argument (xA) used in the ADD instruction is defined +// only by L1. +// - The result of L2 is used only by L3. +// - The base address (xB) in L3 is defined only L2. 
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using +// @PAGE/@PAGEOFF with no additional constants +// +// Currently supported LOHs are: +// * So called non-ADRP-related: +// - .loh AdrpAddLdr L1, L2, L3: +// L1: adrp xA, sym@PAGE +// L2: add xB, xA, sym@PAGEOFF +// L3: ldr xC, [xB, #imm] +// - .loh AdrpLdrGotLdr L1, L2, L3: +// L1: adrp xA, sym@GOTPAGE +// L2: ldr xB, [xA, sym@GOTPAGEOFF] +// L3: ldr xC, [xB, #imm] +// - .loh AdrpLdr L1, L3: +// L1: adrp xA, sym@PAGE +// L3: ldr xC, [xA, sym@PAGEOFF] +// - .loh AdrpAddStr L1, L2, L3: +// L1: adrp xA, sym@PAGE +// L2: add xB, xA, sym@PAGEOFF +// L3: str xC, [xB, #imm] +// - .loh AdrpLdrGotStr L1, L2, L3: +// L1: adrp xA, sym@GOTPAGE +// L2: ldr xB, [xA, sym@GOTPAGEOFF] +// L3: str xC, [xB, #imm] +// - .loh AdrpAdd L1, L2: +// L1: adrp xA, sym@PAGE +// L2: add xB, xA, sym@PAGEOFF +// For all these LOHs, L1, L2, L3 form a simple chain: +// L1 result is used only by L2 and L2 result by L3. +// L3 LOH-related argument is defined only by L2 and L2 LOH-related argument +// by L1. +// All these LOHs aim at using more efficient load/store patterns by folding +// some instructions used to compute the address directly into the load/store. +// +// * So called ADRP-related: +// - .loh AdrpAdrp L2, L1: +// L2: ADRP xA, sym1@PAGE +// L1: ADRP xA, sym2@PAGE +// L2 dominates L1 and xA is not redifined between L2 and L1 +// This LOH aims at getting rid of redundant ADRP instructions. +// +// The overall design for emitting the LOHs is: +// 1. AArch64CollectLOH (this pass) records the LOHs in the AArch64FunctionInfo. +// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and it: +// 1. Associates them a label. +// 2. Emits them in a MCStreamer (EmitLOHDirective). +// - The MCMachOStreamer records them into the MCAssembler. +// - The MCAsmStreamer prints them. +// - Other MCStreamers ignore them. +// 3. Closes the MCStreamer: +// - The MachObjectWriter gets them from the MCAssembler and writes +// them in the object file. +// - Other ObjectWriters ignore them. 
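As a rough source-level illustration (the symbol names are invented, and the exact lowering depends on the target and optimization level): on a Mach-O AArch64 target, a load of a global defined in the same translation unit is typically addressed page-wise, which is exactly the kind of instruction pair this pass annotates.

int counter;                 // hypothetical global defined in this unit

int read_counter(void) {
  // Typically lowers to:
  //   adrp x8, _counter@PAGE
  //   ldr  w0, [x8, _counter@PAGEOFF]
  // The pass would record an AdrpLdr LOH over that pair so the linker may
  // turn it into a single literal load when _counter ends up within the
  // 1MB range described above.
  return counter;
}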
+//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-collect-loh" + +static cl::opt<bool> +PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden, + cl::desc("Restrict analysis to registers invovled" + " in LOHs"), + cl::init(true)); + +static cl::opt<bool> +BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden, + cl::desc("Restrict analysis at basic block scope"), + cl::init(true)); + +STATISTIC(NumADRPSimpleCandidate, + "Number of simplifiable ADRP dominate by another"); +STATISTIC(NumADRPComplexCandidate2, + "Number of simplifiable ADRP reachable by 2 defs"); +STATISTIC(NumADRPComplexCandidate3, + "Number of simplifiable ADRP reachable by 3 defs"); +STATISTIC(NumADRPComplexCandidateOther, + "Number of simplifiable ADRP reachable by 4 or more defs"); +STATISTIC(NumADDToSTRWithImm, + "Number of simplifiable STR with imm reachable by ADD"); +STATISTIC(NumLDRToSTRWithImm, + "Number of simplifiable STR with imm reachable by LDR"); +STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD"); +STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR"); +STATISTIC(NumADDToLDRWithImm, + "Number of simplifiable LDR with imm reachable by ADD"); +STATISTIC(NumLDRToLDRWithImm, + "Number of simplifiable LDR with imm reachable by LDR"); +STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD"); +STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR"); +STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP"); +STATISTIC(NumCplxLvl1, "Number of complex case of level 1"); +STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1"); +STATISTIC(NumCplxLvl2, "Number of complex case of level 2"); +STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2"); +STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD"); +STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD"); + +namespace llvm { +void initializeAArch64CollectLOHPass(PassRegistry &); +} + +#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)" + +namespace { +struct AArch64CollectLOH : public MachineFunctionPass { + static char ID; + AArch64CollectLOH() : MachineFunctionPass(ID) { + initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return AARCH64_COLLECT_LOH_NAME; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + 
AU.addRequired<MachineDominatorTree>(); + } + +private: +}; + +/// A set of MachineInstruction. +typedef SetVector<const MachineInstr *> SetOfMachineInstr; +/// Map a basic block to a set of instructions per register. +/// This is used to represent the exposed uses of a basic block +/// per register. +typedef MapVector<const MachineBasicBlock *, + std::unique_ptr<SetOfMachineInstr[]>> +BlockToSetOfInstrsPerColor; +/// Map a basic block to an instruction per register. +/// This is used to represent the live-out definitions of a basic block +/// per register. +typedef MapVector<const MachineBasicBlock *, + std::unique_ptr<const MachineInstr *[]>> +BlockToInstrPerColor; +/// Map an instruction to a set of instructions. Used to represent the +/// mapping def to reachable uses or use to definitions. +typedef MapVector<const MachineInstr *, SetOfMachineInstr> InstrToInstrs; +/// Map a basic block to a BitVector. +/// This is used to record the kill registers per basic block. +typedef MapVector<const MachineBasicBlock *, BitVector> BlockToRegSet; + +/// Map a register to a dense id. +typedef DenseMap<unsigned, unsigned> MapRegToId; +/// Map a dense id to a register. Used for debug purposes. +typedef SmallVector<unsigned, 32> MapIdToReg; +} // end anonymous namespace. + +char AArch64CollectLOH::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", + AARCH64_COLLECT_LOH_NAME, false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", + AARCH64_COLLECT_LOH_NAME, false, false) + +/// Given a couple (MBB, reg) get the corresponding set of instruction from +/// the given "sets". +/// If this couple does not reference any set, an empty set is added to "sets" +/// for this couple and returned. +/// \param nbRegs is used internally allocate some memory. It must be consistent +/// with the way sets is used. +static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, + const MachineBasicBlock &MBB, unsigned reg, + unsigned nbRegs) { + SetOfMachineInstr *result; + BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); + if (it != sets.end()) + result = it->second.get(); + else + result = (sets[&MBB] = make_unique<SetOfMachineInstr[]>(nbRegs)).get(); + + return result[reg]; +} + +/// Given a couple (reg, MI) get the corresponding set of instructions from the +/// the given "sets". +/// This is used to get the uses record in sets of a definition identified by +/// MI and reg, i.e., MI defines reg. +/// If the couple does not reference anything, an empty set is added to +/// "sets[reg]". +/// \pre set[reg] is valid. +static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg, + const MachineInstr &MI) { + return sets[reg][&MI]; +} + +/// Same as getUses but does not modify the input map: sets. +/// \return NULL if the couple (reg, MI) is not in sets. +static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, + const MachineInstr &MI) { + InstrToInstrs::const_iterator Res = sets[reg].find(&MI); + if (Res != sets[reg].end()) + return &(Res->second); + return nullptr; +} + +/// Initialize the reaching definition algorithm: +/// For each basic block BB in MF, record: +/// - its kill set. +/// - its reachable uses (uses that are exposed to BB's predecessors). +/// - its the generated definitions. +/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to +/// the list of uses of exposed defintions. 
+/// \param ADRPMode specifies to only consider ADRP instructions for generated +/// definition. It also consider definitions of ADRP instructions as uses and +/// ignore other uses. The ADRPMode is used to collect the information for LHO +/// that involve ADRP operation only. +static void initReachingDef(const MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + BlockToInstrPerColor &Gen, BlockToRegSet &Kill, + BlockToSetOfInstrsPerColor &ReachableUses, + const MapRegToId &RegToId, + const MachineInstr *DummyOp, bool ADRPMode) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + unsigned NbReg = RegToId.size(); + + for (const MachineBasicBlock &MBB : MF) { + auto &BBGen = Gen[&MBB]; + BBGen = make_unique<const MachineInstr *[]>(NbReg); + std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr); + + BitVector &BBKillSet = Kill[&MBB]; + BBKillSet.resize(NbReg); + for (const MachineInstr &MI : MBB) { + bool IsADRP = MI.getOpcode() == AArch64::ADRP; + + // Process uses first. + if (IsADRP || !ADRPMode) + for (const MachineOperand &MO : MI.operands()) { + // Treat ADRP def as use, as the goal of the analysis is to find + // ADRP defs reached by other ADRP defs. + if (!MO.isReg() || (!ADRPMode && !MO.isUse()) || + (ADRPMode && (!IsADRP || !MO.isDef()))) + continue; + unsigned CurReg = MO.getReg(); + MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); + if (ItCurRegId == RegToId.end()) + continue; + CurReg = ItCurRegId->second; + + // if CurReg has not been defined, this use is reachable. + if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) + getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI); + // current basic block definition for this color, if any, is in Gen. + if (BBGen[CurReg]) + getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI); + } + + // Process clobbers. + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isRegMask()) + continue; + // Clobbers kill the related colors. + const uint32_t *PreservedRegs = MO.getRegMask(); + + // Set generated regs. + for (const auto &Entry : RegToId) { + unsigned Reg = Entry.second; + // Use the global register ID when querying APIs external to this + // pass. + if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { + // Do not register clobbered definition for no ADRP. + // This definition is not used anyway (otherwise register + // allocation is wrong). + BBGen[Reg] = ADRPMode ? &MI : nullptr; + BBKillSet.set(Reg); + } + } + } + + // Process register defs. + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned CurReg = MO.getReg(); + MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); + if (ItCurRegId == RegToId.end()) + continue; + + for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { + MapRegToId::const_iterator ItRegId = RegToId.find(*AI); + // If this alias has not been recorded, then it is not interesting + // for the current analysis. + // We can end up in this situation because of tuple registers. + // E.g., Let say we are interested in S1. When we register + // S1, we will also register its aliases and in particular + // the tuple Q1_Q2. + // Now, when we encounter Q1_Q2, we will look through its aliases + // and will find that S2 is not registered. 
+ if (ItRegId == RegToId.end()) + continue; + + BBKillSet.set(ItRegId->second); + BBGen[ItRegId->second] = &MI; + } + BBGen[ItCurRegId->second] = &MI; + } + } + + // If we restrict our analysis to basic block scope, conservatively add a + // dummy + // use for each generated value. + if (!ADRPMode && DummyOp && !MBB.succ_empty()) + for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) + if (BBGen[CurReg]) + getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp); + } +} + +/// Reaching def core algorithm: +/// while an Out has changed +/// for each bb +/// for each color +/// In[bb][color] = U Out[bb.predecessors][color] +/// insert reachableUses[bb][color] in each in[bb][color] +/// op.reachedUses +/// +/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) +static void reachingDefAlgorithm(const MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + BlockToSetOfInstrsPerColor &In, + BlockToSetOfInstrsPerColor &Out, + BlockToInstrPerColor &Gen, BlockToRegSet &Kill, + BlockToSetOfInstrsPerColor &ReachableUses, + unsigned NbReg) { + bool HasChanged; + do { + HasChanged = false; + for (const MachineBasicBlock &MBB : MF) { + unsigned CurReg; + for (CurReg = 0; CurReg < NbReg; ++CurReg) { + SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); + SetOfMachineInstr &BBReachableUses = + getSet(ReachableUses, MBB, CurReg, NbReg); + SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); + unsigned Size = BBOutSet.size(); + // In[bb][color] = U Out[bb.predecessors][color] + for (const MachineBasicBlock *PredMBB : MBB.predecessors()) { + SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); + BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); + } + // insert reachableUses[bb][color] in each in[bb][color] op.reachedses + for (const MachineInstr *MI : BBInSet) { + SetOfMachineInstr &OpReachedUses = + getUses(ColorOpToReachedUses, CurReg, *MI); + OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); + } + // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) + if (!Kill[&MBB].test(CurReg)) + BBOutSet.insert(BBInSet.begin(), BBInSet.end()); + if (Gen[&MBB][CurReg]) + BBOutSet.insert(Gen[&MBB][CurReg]); + HasChanged |= BBOutSet.size() != Size; + } + } + } while (HasChanged); +} + +/// Reaching definition algorithm. +/// \param MF function on which the algorithm will operate. +/// \param[out] ColorOpToReachedUses will contain the result of the reaching +/// def algorithm. +/// \param ADRPMode specify whether the reaching def algorithm should be tuned +/// for ADRP optimization. \see initReachingDef for more details. +/// \param DummyOp if not NULL, the algorithm will work at +/// basic block scope and will set for every exposed definition a use to +/// @p DummyOp. +/// \pre ColorOpToReachedUses is an array of at least number of registers of +/// InstrToInstrs. +static void reachingDef(const MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + const MapRegToId &RegToId, bool ADRPMode = false, + const MachineInstr *DummyOp = nullptr) { + // structures: + // For each basic block. + // Out: a set per color of definitions that reach the + // out boundary of this block. + // In: Same as Out but for in boundary. + // Gen: generated color in this block (one operation per color). + // Kill: register set of killed color in this block. + // ReachableUses: a set per color of uses (operation) reachable + // for "In" definitions. 
+ BlockToSetOfInstrsPerColor Out, In, ReachableUses; + BlockToInstrPerColor Gen; + BlockToRegSet Kill; + + // Initialize Gen, kill and reachableUses. + initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, + DummyOp, ADRPMode); + + // Algo. + if (!DummyOp) + reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, + ReachableUses, RegToId.size()); +} + +#ifndef NDEBUG +/// print the result of the reaching definition algorithm. +static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, + unsigned NbReg, const TargetRegisterInfo *TRI, + const MapIdToReg &IdToReg) { + unsigned CurReg; + for (CurReg = 0; CurReg < NbReg; ++CurReg) { + if (ColorOpToReachedUses[CurReg].empty()) + continue; + DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); + + for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { + DEBUG(dbgs() << "Def:\n"); + DEBUG(DefsIt.first->print(dbgs())); + DEBUG(dbgs() << "Reachable uses:\n"); + for (const MachineInstr *MI : DefsIt.second) { + DEBUG(MI->print(dbgs())); + } + } + } +} +#endif // NDEBUG + +/// Answer the following question: Can Def be one of the definition +/// involved in a part of a LOH? +static bool canDefBePartOfLOH(const MachineInstr *Def) { + unsigned Opc = Def->getOpcode(); + // Accept ADRP, ADDLow and LOADGot. + switch (Opc) { + default: + return false; + case AArch64::ADRP: + return true; + case AArch64::ADDXri: + // Check immediate to see if the immediate is an address. + switch (Def->getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_BlockAddress: + return true; + } + case AArch64::LDRXui: + // Check immediate to see if the immediate is an address. + switch (Def->getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + return true; + } + } + // Unreachable. + return false; +} + +/// Check whether the given instruction can the end of a LOH chain involving a +/// store. +static bool isCandidateStore(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::STRBBui: + case AArch64::STRHHui: + case AArch64::STRBui: + case AArch64::STRHui: + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + // In case we have str xA, [xA, #imm], this is two different uses + // of xA and we cannot fold, otherwise the xA stored may be wrong, + // even if #imm == 0. + if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg()) + return true; + } + return false; +} + +/// Given the result of a reaching definition algorithm in ColorOpToReachedUses, +/// Build the Use to Defs information and filter out obvious non-LOH candidates. +/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions. +/// In non-ADRPMode, non-LOH candidates are "uses" with several definition, +/// i.e., no simple chain. +/// \param ADRPMode -- \see initReachingDef. +static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, + const InstrToInstrs *ColorOpToReachedUses, + const MapRegToId &RegToId, + bool ADRPMode = false) { + + SetOfMachineInstr NotCandidate; + unsigned NbReg = RegToId.size(); + MapRegToId::const_iterator EndIt = RegToId.end(); + for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) { + // If this color is never defined, continue. 
+ if (ColorOpToReachedUses[CurReg].empty()) + continue; + + for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { + for (const MachineInstr *MI : DefsIt.second) { + const MachineInstr *Def = DefsIt.first; + MapRegToId::const_iterator It; + // if all the reaching defs are not adrp, this use will not be + // simplifiable. + if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) || + (!ADRPMode && !canDefBePartOfLOH(Def)) || + (!ADRPMode && isCandidateStore(MI) && + // store are LOH candidate iff the end of the chain is used as + // base. + ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt || + It->second != CurReg))) { + NotCandidate.insert(MI); + continue; + } + // Do not consider self reaching as a simplifiable case for ADRP. + if (!ADRPMode || MI != DefsIt.first) { + UseToReachingDefs[MI].insert(DefsIt.first); + // If UsesIt has several reaching definitions, it is not + // candidate for simplificaton in non-ADRPMode. + if (!ADRPMode && UseToReachingDefs[MI].size() > 1) + NotCandidate.insert(MI); + } + } + } + } + for (const MachineInstr *Elem : NotCandidate) { + DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n"); + // It would have been better if we could just remove the entry + // from the map. Because of that, we have to filter the garbage + // (second.empty) in the subsequence analysis. + UseToReachingDefs[Elem].clear(); + } +} + +/// Based on the use to defs information (in ADRPMode), compute the +/// opportunities of LOH ADRP-related. +static void computeADRP(const InstrToInstrs &UseToDefs, + AArch64FunctionInfo &AArch64FI, + const MachineDominatorTree *MDT) { + DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); + for (const auto &Entry : UseToDefs) { + unsigned Size = Entry.second.size(); + if (Size == 0) + continue; + if (Size == 1) { + const MachineInstr *L2 = *Entry.second.begin(); + const MachineInstr *L1 = Entry.first; + if (!MDT->dominates(L2, L1)) { + DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1 + << '\n'); + continue; + } + DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); + SmallVector<const MachineInstr *, 2> Args; + Args.push_back(L2); + Args.push_back(L1); + AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); + ++NumADRPSimpleCandidate; + } +#ifdef DEBUG + else if (Size == 2) + ++NumADRPComplexCandidate2; + else if (Size == 3) + ++NumADRPComplexCandidate3; + else + ++NumADRPComplexCandidateOther; +#endif + // if Size < 1, the use should have been removed from the candidates + assert(Size >= 1 && "No reaching defs for that use!"); + } +} + +/// Check whether the given instruction can be the end of a LOH chain +/// involving a load. +static bool isCandidateLoad(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::LDRSBWui: + case AArch64::LDRSBXui: + case AArch64::LDRSHWui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::LDRBui: + case AArch64::LDRHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT) + return false; + return true; + } + // Unreachable. + return false; +} + +/// Check whether the given instruction can load a litteral. 
+static bool supportLoadFromLiteral(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::LDRSWui: + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + return true; + } + // Unreachable. + return false; +} + +/// Check whether the given instruction is a LOH candidate. +/// \param UseToDefs is used to check that Instr is at the end of LOH supported +/// chain. +/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are +/// already been filtered out. +static bool isCandidate(const MachineInstr *Instr, + const InstrToInstrs &UseToDefs, + const MachineDominatorTree *MDT) { + if (!isCandidateLoad(Instr) && !isCandidateStore(Instr)) + return false; + + const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); + if (Def->getOpcode() != AArch64::ADRP) { + // At this point, Def is ADDXri or LDRXui of the right type of + // symbol, because we filtered out the uses that were not defined + // by these kind of instructions (+ ADRP). + + // Check if this forms a simple chain: each intermediate node must + // dominates the next one. + if (!MDT->dominates(Def, Instr)) + return false; + // Move one node up in the simple chain. + if (UseToDefs.find(Def) == + UseToDefs.end() + // The map may contain garbage we have to ignore. + || + UseToDefs.find(Def)->second.empty()) + return false; + Instr = Def; + Def = *UseToDefs.find(Def)->second.begin(); + } + // Check if we reached the top of the simple chain: + // - top is ADRP. + // - check the simple chain property: each intermediate node must + // dominates the next one. + if (Def->getOpcode() == AArch64::ADRP) + return MDT->dominates(Def, Instr); + return false; +} + +static bool registerADRCandidate(const MachineInstr &Use, + const InstrToInstrs &UseToDefs, + const InstrToInstrs *DefsPerColorToUses, + AArch64FunctionInfo &AArch64FI, + SetOfMachineInstr *InvolvedInLOHs, + const MapRegToId &RegToId) { + // Look for opportunities to turn ADRP -> ADD or + // ADRP -> LDR GOTPAGEOFF into ADR. + // If ADRP has more than one use. Give up. + if (Use.getOpcode() != AArch64::ADDXri && + (Use.getOpcode() != AArch64::LDRXui || + !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT))) + return false; + InstrToInstrs::const_iterator It = UseToDefs.find(&Use); + // The map may contain garbage that we need to ignore. + if (It == UseToDefs.end() || It->second.empty()) + return false; + const MachineInstr &Def = **It->second.begin(); + if (Def.getOpcode() != AArch64::ADRP) + return false; + // Check the number of users of ADRP. + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def.getOperand(0).getReg())->second, Def); + if (Users->size() > 1) { + ++NumADRComplexCandidate; + return false; + } + ++NumADRSimpleCandidate; + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) && + "ADRP already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) && + "ADD already involved in LOH."); + DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); + + SmallVector<const MachineInstr *, 2> Args; + Args.push_back(&Def); + Args.push_back(&Use); + + AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? 
MCLOH_AdrpAdd + : MCLOH_AdrpLdrGot, + Args); + return true; +} + +/// Based on the use to defs information (in non-ADRPMode), compute the +/// opportunities of LOH non-ADRP-related +static void computeOthers(const InstrToInstrs &UseToDefs, + const InstrToInstrs *DefsPerColorToUses, + AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId, + const MachineDominatorTree *MDT) { + SetOfMachineInstr *InvolvedInLOHs = nullptr; +#ifdef DEBUG + SetOfMachineInstr InvolvedInLOHsStorage; + InvolvedInLOHs = &InvolvedInLOHsStorage; +#endif // DEBUG + DEBUG(dbgs() << "*** Compute LOH for Others\n"); + // ADRP -> ADD/LDR -> LDR/STR pattern. + // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. + + // FIXME: When the statistics are not important, + // This initial filtering loop can be merged into the next loop. + // Currently, we didn't do it to have the same code for both DEBUG and + // NDEBUG builds. Indeed, the iterator of the second loop would need + // to be changed. + SetOfMachineInstr PotentialCandidates; + SetOfMachineInstr PotentialADROpportunities; + for (auto &Use : UseToDefs) { + // If no definition is available, this is a non candidate. + if (Use.second.empty()) + continue; + // Keep only instructions that are load or store and at the end of + // a ADRP -> ADD/LDR/Nothing chain. + // We already filtered out the no-chain cases. + if (!isCandidate(Use.first, UseToDefs, MDT)) { + PotentialADROpportunities.insert(Use.first); + continue; + } + PotentialCandidates.insert(Use.first); + } + + // Make the following distinctions for statistics as the linker does + // know how to decode instructions: + // - ADD/LDR/Nothing make there different patterns. + // - LDR/STR make two different patterns. + // Hence, 6 - 1 base patterns. + // (because ADRP-> Nothing -> STR is not simplifiable) + + // The linker is only able to have a simple semantic, i.e., if pattern A + // do B. + // However, we want to see the opportunity we may miss if we were able to + // catch more complex cases. + + // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> + // A potential candidate becomes a candidate, if its current immediate + // operand is zero and all nodes of the chain have respectively only one user +#ifdef DEBUG + SetOfMachineInstr DefsOfPotentialCandidates; +#endif + for (const MachineInstr *Candidate : PotentialCandidates) { + // Get the definition of the candidate i.e., ADD or LDR. + const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); + // Record the elements of the chain. + const MachineInstr *L1 = Def; + const MachineInstr *L2 = nullptr; + unsigned ImmediateDefOpc = Def->getOpcode(); + if (Def->getOpcode() != AArch64::ADRP) { + // Check the number of users of this node. + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def->getOperand(0).getReg())->second, *Def); + if (Users->size() > 1) { +#ifdef DEBUG + // if all the uses of this def are in potential candidate, this is + // a complex candidate of level 2. + bool IsLevel2 = true; + for (const MachineInstr *MI : *Users) { + if (!PotentialCandidates.count(MI)) { + ++NumTooCplxLvl2; + IsLevel2 = false; + break; + } + } + if (IsLevel2) + ++NumCplxLvl2; +#endif // DEBUG + PotentialADROpportunities.insert(Def); + continue; + } + L2 = Def; + Def = *UseToDefs.find(Def)->second.begin(); + L1 = Def; + } // else the element in the middle of the chain is nothing, thus + // Def already contains the first element of the chain. 
+ + // Check the number of users of the first node in the chain, i.e., ADRP + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def->getOperand(0).getReg())->second, *Def); + if (Users->size() > 1) { +#ifdef DEBUG + // if all the uses of this def are in the defs of the potential candidate, + // this is a complex candidate of level 1 + if (DefsOfPotentialCandidates.empty()) { + // lazy init + DefsOfPotentialCandidates = PotentialCandidates; + for (const MachineInstr *Candidate : PotentialCandidates) { + if (!UseToDefs.find(Candidate)->second.empty()) + DefsOfPotentialCandidates.insert( + *UseToDefs.find(Candidate)->second.begin()); + } + } + bool Found = false; + for (auto &Use : *Users) { + if (!DefsOfPotentialCandidates.count(Use)) { + ++NumTooCplxLvl1; + Found = true; + break; + } + } + if (!Found) + ++NumCplxLvl1; +#endif // DEBUG + continue; + } + + bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); + // If the chain is three instructions long and ldr is the second element, + // then this ldr must load form GOT, otherwise this is not a correct chain. + if (L2 && !IsL2Add && + !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)) + continue; + SmallVector<const MachineInstr *, 3> Args; + MCLOHType Kind; + if (isCandidateLoad(Candidate)) { + if (!L2) { + // At this point, the candidate LOH indicates that the ldr instruction + // may use a direct access to the symbol. There is not such encoding + // for loads of byte and half. + if (!supportLoadFromLiteral(Candidate)) + continue; + + DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate + << '\n'); + Kind = MCLOH_AdrpLdr; + Args.push_back(L1); + Args.push_back(Candidate); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); + ++NumADRPToLDR; + } else { + DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") + << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate + << '\n'); + + Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; + Args.push_back(L1); + Args.push_back(L2); + Args.push_back(Candidate); + + PotentialADROpportunities.remove(L2); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && + "L2 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); +#ifdef DEBUG + // get the immediate of the load + if (Candidate->getOperand(2).getImm() == 0) + if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToLDR; + else + ++NumLDRToLDR; + else if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToLDRWithImm; + else + ++NumLDRToLDRWithImm; +#endif // DEBUG + } + } else { + if (ImmediateDefOpc == AArch64::ADRP) + continue; + else { + + DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") + << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate + << '\n'); + + Kind = IsL2Add ? 
MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; + Args.push_back(L1); + Args.push_back(L2); + Args.push_back(Candidate); + + PotentialADROpportunities.remove(L2); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && + "L2 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); +#ifdef DEBUG + // get the immediate of the store + if (Candidate->getOperand(2).getImm() == 0) + if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToSTR; + else + ++NumLDRToSTR; + else if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToSTRWithImm; + else + ++NumLDRToSTRWithImm; +#endif // DEBUG + } + } + AArch64FI.addLOHDirective(Kind, Args); + } + + // Now, we grabbed all the big patterns, check ADR opportunities. + for (const MachineInstr *Candidate : PotentialADROpportunities) + registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI, + InvolvedInLOHs, RegToId); +} + +/// Look for every register defined by potential LOHs candidates. +/// Map these registers with dense id in @p RegToId and vice-versa in +/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. +static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, + MapIdToReg &IdToReg, + const TargetRegisterInfo *TRI) { + unsigned CurRegId = 0; + if (!PreCollectRegister) { + unsigned NbReg = TRI->getNumRegs(); + for (; CurRegId < NbReg; ++CurRegId) { + RegToId[CurRegId] = CurRegId; + DEBUG(IdToReg.push_back(CurRegId)); + DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); + } + return; + } + + DEBUG(dbgs() << "** Collect Involved Register\n"); + for (const auto &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (!canDefBePartOfLOH(&MI) && + !isCandidateLoad(&MI) && !isCandidateStore(&MI)) + continue; + + // Process defs + for (MachineInstr::const_mop_iterator IO = MI.operands_begin(), + IOEnd = MI.operands_end(); + IO != IOEnd; ++IO) { + if (!IO->isReg() || !IO->isDef()) + continue; + unsigned CurReg = IO->getReg(); + for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) + if (RegToId.find(*AI) == RegToId.end()) { + DEBUG(IdToReg.push_back(*AI); + assert(IdToReg[CurRegId] == *AI && + "Reg index mismatches insertion index.")); + RegToId[*AI] = CurRegId++; + DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); + } + } + } + } +} + +bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); + + MapRegToId RegToId; + MapIdToReg IdToReg; + AArch64FunctionInfo *AArch64FI = MF.getInfo<AArch64FunctionInfo>(); + assert(AArch64FI && "No MachineFunctionInfo for this function!"); + + DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); + + collectInvolvedReg(MF, RegToId, IdToReg, TRI); + if (RegToId.empty()) + return false; + + MachineInstr *DummyOp = nullptr; + if (BasicBlockScopeOnly) { + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + // For local analysis, create a dummy operation to record uses that are not + // local. + DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); + } + + unsigned NbReg = RegToId.size(); + bool Modified = false; + + // Start with ADRP. 
+ InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; + + // Compute the reaching def in ADRP mode, meaning ADRP definitions + // are first considered as uses. + reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp); + DEBUG(dbgs() << "ADRP reaching defs\n"); + DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); + + // Translate the definition to uses map into a use to definitions map to ease + // statistic computation. + InstrToInstrs ADRPToReachingDefs; + reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); + + // Compute LOH for ADRP. + computeADRP(ADRPToReachingDefs, *AArch64FI, MDT); + delete[] ColorOpToReachedUses; + + // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. + ColorOpToReachedUses = new InstrToInstrs[NbReg]; + + // first perform a regular reaching def analysis. + reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp); + DEBUG(dbgs() << "All reaching defs\n"); + DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); + + // Turn that into a use to defs to ease statistic computation. + InstrToInstrs UsesToReachingDefs; + reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); + + // Compute other than AdrpAdrp LOH. + computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId, + MDT); + delete[] ColorOpToReachedUses; + + if (BasicBlockScopeOnly) + MF.DeleteMachineInstr(DummyOp); + + return Modified; +} + +/// createAArch64CollectLOHPass - returns an instance of the Statistic for +/// linker optimization pass. +FunctionPass *llvm::createAArch64CollectLOHPass() { + return new AArch64CollectLOH(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp new file mode 100644 index 0000000..fc27bfe --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -0,0 +1,430 @@ +//=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to make consecutive compares of values use same operands to +// allow CSE pass to remove duplicated instructions. For this it analyzes +// branches and adjusts comparisons with immediate values by converting: +// * GE -> GT +// * GT -> GE +// * LT -> LE +// * LE -> LT +// and adjusting immediate values appropriately. It basically corrects two +// immediate values towards each other to make them equal. +// +// Consider the following example in C: +// +// if ((a < 5 && ...) || (a > 5 && ...)) { +// ~~~~~ ~~~~~ +// ^ ^ +// x y +// +// Here both "x" and "y" expressions compare "a" with "5". When "x" evaluates +// to "false", "y" can just check flags set by the first comparison. As a +// result of the canonicalization employed by +// SelectionDAGBuilder::visitSwitchCase, DAGCombine, and other target-specific +// code, assembly ends up in the form that is not CSE friendly: +// +// ... +// cmp w8, #4 +// b.gt .LBB0_3 +// ... +// .LBB0_3: +// cmp w8, #6 +// b.lt .LBB0_6 +// ... +// +// Same assembly after the pass: +// +// ... +// cmp w8, #5 +// b.ge .LBB0_3 +// ... +// .LBB0_3: +// cmp w8, #5 // <-- CSE pass removes this instruction +// b.le .LBB0_6 +// ... +// +// Currently only SUBS and ADDS followed by b.?? are supported. 
+// +// TODO: maybe handle TBNZ/TBZ the same way as CMP when used instead for "a < 0" +// TODO: handle other conditional instructions (e.g. CSET) +// TODO: allow second branching to be anything if it doesn't require adjusting +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cstdlib> +#include <tuple> + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-condopt" + +STATISTIC(NumConditionsAdjusted, "Number of conditions adjusted"); + +namespace { +class AArch64ConditionOptimizer : public MachineFunctionPass { + const TargetInstrInfo *TII; + MachineDominatorTree *DomTree; + const MachineRegisterInfo *MRI; + +public: + // Stores immediate, compare instruction opcode and branch condition (in this + // order) of adjusted comparison. + typedef std::tuple<int, unsigned, AArch64CC::CondCode> CmpInfo; + + static char ID; + AArch64ConditionOptimizer() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + MachineInstr *findSuitableCompare(MachineBasicBlock *MBB); + CmpInfo adjustCmp(MachineInstr *CmpMI, AArch64CC::CondCode Cmp); + void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info); + bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To, + int ToImm); + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { + return "AArch64 Condition Optimizer"; + } +}; +} // end anonymous namespace + +char AArch64ConditionOptimizer::ID = 0; + +namespace llvm { +void initializeAArch64ConditionOptimizerPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt", + "AArch64 CondOpt Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt", + "AArch64 CondOpt Pass", false, false) + +FunctionPass *llvm::createAArch64ConditionOptimizerPass() { + return new AArch64ConditionOptimizer(); +} + +void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +// Finds compare instruction that corresponds to supported types of branching. +// Returns the instruction or nullptr on failures or detecting unsupported +// instructions. +MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( + MachineBasicBlock *MBB) { + MachineBasicBlock::iterator I = MBB->getFirstTerminator(); + if (I == MBB->end()) + return nullptr; + + if (I->getOpcode() != AArch64::Bcc) + return nullptr; + + // Now find the instruction controlling the terminator. 
+ for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { + --I; + assert(!I->isTerminator() && "Spurious terminator"); + switch (I->getOpcode()) { + // cmp is an alias for subs with a dead destination register. + case AArch64::SUBSWri: + case AArch64::SUBSXri: + // cmn is an alias for adds with a dead destination register. + case AArch64::ADDSWri: + case AArch64::ADDSXri: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm()); + if (!I->getOperand(2).isImm()) { + DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n'); + return nullptr; + } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) { + DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n'); + return nullptr; + } else if (!MRI->use_empty(I->getOperand(0).getReg())) { + DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); + return nullptr; + } + return I; + } + // Prevent false positive case like: + // cmp w19, #0 + // cinc w0, w19, gt + // ... + // fcmp d8, #0.0 + // b.gt .LBB0_5 + case AArch64::FCMPDri: + case AArch64::FCMPSri: + case AArch64::FCMPESri: + case AArch64::FCMPEDri: + + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + case AArch64::ADDSWrr: + case AArch64::ADDSXrr: + case AArch64::FCMPSrr: + case AArch64::FCMPDrr: + case AArch64::FCMPESrr: + case AArch64::FCMPEDrr: + // Skip comparison instructions without immediate operands. + return nullptr; + } + } + DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); + return nullptr; +} + +// Changes opcode adds <-> subs considering register operand width. +static int getComplementOpc(int Opc) { + switch (Opc) { + case AArch64::ADDSWri: return AArch64::SUBSWri; + case AArch64::ADDSXri: return AArch64::SUBSXri; + case AArch64::SUBSWri: return AArch64::ADDSWri; + case AArch64::SUBSXri: return AArch64::ADDSXri; + default: + llvm_unreachable("Unexpected opcode"); + } +} + +// Changes form of comparison inclusive <-> exclusive. +static AArch64CC::CondCode getAdjustedCmp(AArch64CC::CondCode Cmp) { + switch (Cmp) { + case AArch64CC::GT: return AArch64CC::GE; + case AArch64CC::GE: return AArch64CC::GT; + case AArch64CC::LT: return AArch64CC::LE; + case AArch64CC::LE: return AArch64CC::LT; + default: + llvm_unreachable("Unexpected condition code"); + } +} + +// Transforms GT -> GE, GE -> GT, LT -> LE, LE -> LT by updating comparison +// operator and condition code. +AArch64ConditionOptimizer::CmpInfo AArch64ConditionOptimizer::adjustCmp( + MachineInstr *CmpMI, AArch64CC::CondCode Cmp) { + unsigned Opc = CmpMI->getOpcode(); + + // CMN (compare with negative immediate) is an alias to ADDS (as + // "operand - negative" == "operand + positive") + bool Negative = (Opc == AArch64::ADDSWri || Opc == AArch64::ADDSXri); + + int Correction = (Cmp == AArch64CC::GT) ? 1 : -1; + // Negate Correction value for comparison with negative immediate (CMN). + if (Negative) { + Correction = -Correction; + } + + const int OldImm = (int)CmpMI->getOperand(2).getImm(); + const int NewImm = std::abs(OldImm + Correction); + + // Handle +0 -> -1 and -0 -> +1 (CMN with 0 immediate) transitions by + // adjusting compare instruction opcode. + if (OldImm == 0 && ((Negative && Correction == 1) || + (!Negative && Correction == -1))) { + Opc = getComplementOpc(Opc); + } + + return CmpInfo(NewImm, Opc, getAdjustedCmp(Cmp)); +} + +// Applies changes to comparison instruction suggested by adjustCmp(). 
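// For illustration, reusing the numbers from the file header comment: adjustCmp on
// the SUBSWri behind "cmp w8, #4" with condition GT yields the tuple
// (5, AArch64::SUBSWri, AArch64CC::GE), and modifyCmp then rewrites the pair to
//
//   cmp w8, #5
//   b.ge .LBB0_3
//
// which is what allows the later CSE pass to drop the duplicated compare.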
+void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, + const CmpInfo &Info) { + int Imm; + unsigned Opc; + AArch64CC::CondCode Cmp; + std::tie(Imm, Opc, Cmp) = Info; + + MachineBasicBlock *const MBB = CmpMI->getParent(); + + // Change immediate in comparison instruction (ADDS or SUBS). + BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc)) + .addOperand(CmpMI->getOperand(0)) + .addOperand(CmpMI->getOperand(1)) + .addImm(Imm) + .addOperand(CmpMI->getOperand(3)); + CmpMI->eraseFromParent(); + + // The fact that this comparison was picked ensures that it's related to the + // first terminator instruction. + MachineInstr *BrMI = MBB->getFirstTerminator(); + + // Change condition in branch instruction. + BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(Cmp) + .addOperand(BrMI->getOperand(1)); + BrMI->eraseFromParent(); + + MBB->updateTerminator(); + + ++NumConditionsAdjusted; +} + +// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// corresponding to TBB. +// Returns true if parsing was successful, otherwise false is returned. +static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) { + // A normal br.cond simply has the condition code. + if (Cond[0].getImm() != -1) { + assert(Cond.size() == 1 && "Unknown Cond array format"); + CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + return true; + } + return false; +} + +// Adjusts one cmp instruction to another one if result of adjustment will allow +// CSE. Returns true if compare instruction was changed, otherwise false is +// returned. +bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI, + AArch64CC::CondCode Cmp, MachineInstr *To, int ToImm) +{ + CmpInfo Info = adjustCmp(CmpMI, Cmp); + if (std::get<0>(Info) == ToImm && std::get<1>(Info) == To->getOpcode()) { + modifyCmp(CmpMI, Info); + return true; + } + return false; +} + +bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" + << "********** Function: " << MF.getName() << '\n'); + TII = MF.getSubtarget().getInstrInfo(); + DomTree = &getAnalysis<MachineDominatorTree>(); + MRI = &MF.getRegInfo(); + + bool Changed = false; + + // Visit blocks in dominator tree pre-order. The pre-order enables multiple + // cmp-conversions from the same head block. + // Note that updateDomTree() modifies the children of the DomTree node + // currently being visited. The df_iterator supports that; it doesn't look at + // child_begin() / child_end() until after a node has been visited. + for (MachineDomTreeNode *I : depth_first(DomTree)) { + MachineBasicBlock *HBB = I->getBlock(); + + SmallVector<MachineOperand, 4> HeadCond; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) { + continue; + } + + // Equivalence check is to skip loops. 
+ if (!TBB || TBB == HBB) { + continue; + } + + SmallVector<MachineOperand, 4> TrueCond; + MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr; + if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { + continue; + } + + MachineInstr *HeadCmpMI = findSuitableCompare(HBB); + if (!HeadCmpMI) { + continue; + } + + MachineInstr *TrueCmpMI = findSuitableCompare(TBB); + if (!TrueCmpMI) { + continue; + } + + AArch64CC::CondCode HeadCmp; + if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) { + continue; + } + + AArch64CC::CondCode TrueCmp; + if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) { + continue; + } + + const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm(); + const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm(); + + DEBUG(dbgs() << "Head branch:\n"); + DEBUG(dbgs() << "\tcondition: " + << AArch64CC::getCondCodeName(HeadCmp) << '\n'); + DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n'); + + DEBUG(dbgs() << "True branch:\n"); + DEBUG(dbgs() << "\tcondition: " + << AArch64CC::getCondCodeName(TrueCmp) << '\n'); + DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n'); + + if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) || + (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) && + std::abs(TrueImm - HeadImm) == 2) { + // This branch transforms machine instructions that correspond to + // + // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...) + // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...) + // + // into + // + // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...) + // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...) + + CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp); + CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp); + if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) && + std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) { + modifyCmp(HeadCmpMI, HeadCmpInfo); + modifyCmp(TrueCmpMI, TrueCmpInfo); + Changed = true; + } + } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) || + (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) && + std::abs(TrueImm - HeadImm) == 1) { + // This branch transforms machine instructions that correspond to + // + // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...) + // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...) + // + // into + // + // 1) (a <= {NewImm} && ...) || (a > {NewImm} && ...) + // 2) (a < {NewImm} && ...) || (a >= {NewImm} && ...) + + // GT -> GE transformation increases immediate value, so picking the + // smaller one; LT -> LE decreases immediate value so invert the choice. + bool adjustHeadCond = (HeadImm < TrueImm); + if (HeadCmp == AArch64CC::LT) { + adjustHeadCond = !adjustHeadCond; + } + + if (adjustHeadCond) { + Changed |= adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm); + } else { + Changed |= adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm); + } + } + // Other transformation cases almost never occur due to generation of < or > + // comparisons instead of <= and >=. + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp new file mode 100644 index 0000000..df1320f --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -0,0 +1,917 @@ +//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64ConditionalCompares pass which reduces +// branching and code size by using the conditional compare instructions CCMP, +// CCMN, and FCMP. +// +// The CFG transformations for forming conditional compares are very similar to +// if-conversion, and this pass should run immediately before the early +// if-conversion pass. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SparseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-ccmp" + +// Absolute maximum number of instructions allowed per speculated block. +// This bypasses all other heuristics, so it should be set fairly high. +static cl::opt<unsigned> BlockInstrLimit( + "aarch64-ccmp-limit", cl::init(30), cl::Hidden, + cl::desc("Maximum number of instructions per speculated block.")); + +// Stress testing mode - disable heuristics. +static cl::opt<bool> Stress("aarch64-stress-ccmp", cl::Hidden, + cl::desc("Turn all knobs to 11")); + +STATISTIC(NumConsidered, "Number of ccmps considered"); +STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)"); +STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)"); +STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)"); +STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)"); +STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)"); +STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)"); +STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)"); +STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)"); +STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)"); +STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)"); + +STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)"); + +STATISTIC(NumConverted, "Number of ccmp instructions created"); +STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted"); + +//===----------------------------------------------------------------------===// +// SSACCmpConv +//===----------------------------------------------------------------------===// +// +// The SSACCmpConv class performs ccmp-conversion on SSA form machine code +// after determining if it is possible. The class contains no heuristics; +// external code should be used to determine when ccmp-conversion is a good +// idea. 
+// +// CCmp-formation works on a CFG representing chained conditions, typically +// from C's short-circuit || and && operators: +// +// From: Head To: Head +// / | CmpBB +// / | / | +// | CmpBB / | +// | / | Tail | +// | / | | | +// Tail | | | +// | | | | +// ... ... ... ... +// +// The Head block is terminated by a br.cond instruction, and the CmpBB block +// contains compare + br.cond. Tail must be a successor of both. +// +// The cmp-conversion turns the compare instruction in CmpBB into a conditional +// compare, and merges CmpBB into Head, speculatively executing its +// instructions. The AArch64 conditional compare instructions have an immediate +// operand that specifies the NZCV flag values when the condition is false and +// the compare isn't executed. This makes it possible to chain compares with +// different condition codes. +// +// Example: +// +// if (a == 5 || b == 17) +// foo(); +// +// Head: +// cmp w0, #5 +// b.eq Tail +// CmpBB: +// cmp w1, #17 +// b.eq Tail +// ... +// Tail: +// bl _foo +// +// Becomes: +// +// Head: +// cmp w0, #5 +// ccmp w1, #17, 4, ne ; 4 = nZcv +// b.eq Tail +// ... +// Tail: +// bl _foo +// +// The ccmp condition code is the one that would cause the Head terminator to +// branch to CmpBB. +// +// FIXME: It should also be possible to speculate a block on the critical edge +// between Head and Tail, just like if-converting a diamond. +// +// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion). + +namespace { +class SSACCmpConv { + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + +public: + /// The first block containing a conditional branch, dominating everything + /// else. + MachineBasicBlock *Head; + + /// The block containing cmp+br.cond with a successor shared with Head. + MachineBasicBlock *CmpBB; + + /// The common successor for Head and CmpBB. + MachineBasicBlock *Tail; + + /// The compare instruction in CmpBB that can be converted to a ccmp. + MachineInstr *CmpMI; + +private: + /// The branch condition in Head as determined by AnalyzeBranch. + SmallVector<MachineOperand, 4> HeadCond; + + /// The condition code that makes Head branch to CmpBB. + AArch64CC::CondCode HeadCmpBBCC; + + /// The branch condition in CmpBB. + SmallVector<MachineOperand, 4> CmpBBCond; + + /// The condition code that makes CmpBB branch to Tail. + AArch64CC::CondCode CmpBBTailCC; + + /// Check if the Tail PHIs are trivially convertible. + bool trivialTailPHIs(); + + /// Remove CmpBB from the Tail PHIs. + void updateTailPHIs(); + + /// Check if an operand defining DstReg is dead. + bool isDeadDef(unsigned DstReg); + + /// Find the compare instruction in MBB that controls the conditional branch. + /// Return NULL if a convertible instruction can't be found. + MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB); + + /// Return true if all non-terminator instructions in MBB can be safely + /// speculated. + bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI); + +public: + /// runOnMachineFunction - Initialize per-function data structures. + void runOnMachineFunction(MachineFunction &MF) { + this->MF = &MF; + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + } + + /// If the sub-CFG headed by MBB can be cmp-converted, initialize the + /// internal state, and return true. 
+ bool canConvert(MachineBasicBlock *MBB); + + /// Cmo-convert the last block passed to canConvertCmp(), assuming + /// it is possible. Add any erased blocks to RemovedBlocks. + void convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks); + + /// Return the expected code size delta if the conversion into a + /// conditional compare is performed. + int expectedCodeSizeDelta() const; +}; +} // end anonymous namespace + +// Check that all PHIs in Tail are selecting the same value from Head and CmpBB. +// This means that no if-conversion is required when merging CmpBB into Head. +bool SSACCmpConv::trivialTailPHIs() { + for (auto &I : *Tail) { + if (!I.isPHI()) + break; + unsigned HeadReg = 0, CmpBBReg = 0; + // PHI operands come in (VReg, MBB) pairs. + for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { + MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB(); + unsigned Reg = I.getOperand(oi).getReg(); + if (MBB == Head) { + assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); + HeadReg = Reg; + } + if (MBB == CmpBB) { + assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands"); + CmpBBReg = Reg; + } + } + if (HeadReg != CmpBBReg) + return false; + } + return true; +} + +// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply +// removing the CmpBB operands. The Head operands will be identical. +void SSACCmpConv::updateTailPHIs() { + for (auto &I : *Tail) { + if (!I.isPHI()) + break; + // I is a PHI. It can have multiple entries for CmpBB. + for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) { + // PHI operands are (Reg, MBB) at (oi-2, oi-1). + if (I.getOperand(oi - 1).getMBB() == CmpBB) { + I.RemoveOperand(oi - 1); + I.RemoveOperand(oi - 2); + } + } + } +} + +// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares +// are still writing virtual registers without any uses. +bool SSACCmpConv::isDeadDef(unsigned DstReg) { + // Writes to the zero register are dead. + if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) + return true; + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + // A virtual register def without any uses will be marked dead later, and + // eventually replaced by the zero register. + return MRI->use_nodbg_empty(DstReg); +} + +// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// corresponding to TBB. +// Return +static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) { + // A normal br.cond simply has the condition code. + if (Cond[0].getImm() != -1) { + assert(Cond.size() == 1 && "Unknown Cond array format"); + CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + return true; + } + // For tbz and cbz instruction, the opcode is next. + switch (Cond[1].getImm()) { + default: + // This includes tbz / tbnz branches which can't be converted to + // ccmp + br.cond. + return false; + case AArch64::CBZW: + case AArch64::CBZX: + assert(Cond.size() == 3 && "Unknown Cond array format"); + CC = AArch64CC::EQ; + return true; + case AArch64::CBNZW: + case AArch64::CBNZX: + assert(Cond.size() == 3 && "Unknown Cond array format"); + CC = AArch64CC::NE; + return true; + } +} + +MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { + MachineBasicBlock::iterator I = MBB->getFirstTerminator(); + if (I == MBB->end()) + return nullptr; + // The terminator must be controlled by the flags. 
+ if (!I->readsRegister(AArch64::NZCV)) { + switch (I->getOpcode()) { + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + // These can be converted into a ccmp against #0. + return I; + } + ++NumCmpTermRejs; + DEBUG(dbgs() << "Flags not used by terminator: " << *I); + return nullptr; + } + + // Now find the instruction controlling the terminator. + for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { + --I; + assert(!I->isTerminator() && "Spurious terminator"); + switch (I->getOpcode()) { + // cmp is an alias for subs with a dead destination register. + case AArch64::SUBSWri: + case AArch64::SUBSXri: + // cmn is an alias for adds with a dead destination register. + case AArch64::ADDSWri: + case AArch64::ADDSXri: + // Check that the immediate operand is within range, ccmp wants a uimm5. + // Rd = SUBSri Rn, imm, shift + if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) { + DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I); + ++NumImmRangeRejs; + return nullptr; + } + // Fall through. + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + case AArch64::ADDSWrr: + case AArch64::ADDSXrr: + if (isDeadDef(I->getOperand(0).getReg())) + return I; + DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); + ++NumLiveDstRejs; + return nullptr; + case AArch64::FCMPSrr: + case AArch64::FCMPDrr: + case AArch64::FCMPESrr: + case AArch64::FCMPEDrr: + return I; + } + + // Check for flag reads and clobbers. + MIOperands::PhysRegInfo PRI = + MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); + + if (PRI.Read) { + // The ccmp doesn't produce exactly the same flags as the original + // compare, so reject the transform if there are uses of the flags + // besides the terminators. + DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I); + ++NumMultNZCVUses; + return nullptr; + } + + if (PRI.Defined || PRI.Clobbered) { + DEBUG(dbgs() << "Not convertible compare: " << *I); + ++NumUnknNZCVDefs; + return nullptr; + } + } + DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); + return nullptr; +} + +/// Determine if all the instructions in MBB can safely +/// be speculated. The terminators are not considered. +/// +/// Only CmpMI is allowed to clobber the flags. +/// +bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, + const MachineInstr *CmpMI) { + // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to + // get right. + if (!MBB->livein_empty()) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); + return false; + } + + unsigned InstrCount = 0; + + // Check all instructions, except the terminators. It is assumed that + // terminators never have side effects or define any used register values. + for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) { + if (I.isDebugValue()) + continue; + + if (++InstrCount > BlockInstrLimit && !Stress) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " + << BlockInstrLimit << " instructions.\n"); + return false; + } + + // There shouldn't normally be any phis in a single-predecessor block. + if (I.isPHI()) { + DEBUG(dbgs() << "Can't hoist: " << I); + return false; + } + + // Don't speculate loads. Note that it may be possible and desirable to + // speculate GOT or constant pool loads that are guaranteed not to trap, + // but we don't support that for now. 
+ if (I.mayLoad()) { + DEBUG(dbgs() << "Won't speculate load: " << I); + return false; + } + + // We never speculate stores, so an AA pointer isn't necessary. + bool DontMoveAcrossStore = true; + if (!I.isSafeToMove(nullptr, DontMoveAcrossStore)) { + DEBUG(dbgs() << "Can't speculate: " << I); + return false; + } + + // Only CmpMI is allowed to clobber the flags. + if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) { + DEBUG(dbgs() << "Clobbers flags: " << I); + return false; + } + } + return true; +} + +/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential +/// candidate for cmp-conversion. Fill out the internal state. +/// +bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { + Head = MBB; + Tail = CmpBB = nullptr; + + if (Head->succ_size() != 2) + return false; + MachineBasicBlock *Succ0 = Head->succ_begin()[0]; + MachineBasicBlock *Succ1 = Head->succ_begin()[1]; + + // CmpBB can only have a single predecessor. Tail is allowed many. + if (Succ0->pred_size() != 1) + std::swap(Succ0, Succ1); + + // Succ0 is our candidate for CmpBB. + if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2) + return false; + + CmpBB = Succ0; + Tail = Succ1; + + if (!CmpBB->isSuccessor(Tail)) + return false; + + // The CFG topology checks out. + DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#" + << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n'); + ++NumConsidered; + + // Tail is allowed to have many predecessors, but we can't handle PHIs yet. + // + // FIXME: Real PHIs could be if-converted as long as the CmpBB values are + // defined before The CmpBB cmp clobbers the flags. Alternatively, it should + // always be safe to sink the ccmp down to immediately before the CmpBB + // terminators. + if (!trivialTailPHIs()) { + DEBUG(dbgs() << "Can't handle phis in Tail.\n"); + ++NumPhiRejs; + return false; + } + + if (!Tail->livein_empty()) { + DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n"); + ++NumPhysRejs; + return false; + } + + // CmpBB should never have PHIs since Head is its only predecessor. + // FIXME: Clean them up if it happens. + if (!CmpBB->empty() && CmpBB->front().isPHI()) { + DEBUG(dbgs() << "Can't handle phis in CmpBB.\n"); + ++NumPhi2Rejs; + return false; + } + + if (!CmpBB->livein_empty()) { + DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n"); + ++NumPhysRejs; + return false; + } + + // The branch we're looking to eliminate must be analyzable. + HeadCond.clear(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { + DEBUG(dbgs() << "Head branch not analyzable.\n"); + ++NumHeadBranchRejs; + return false; + } + + // This is weird, probably some sort of degenerate CFG, or an edge to a + // landing pad. + if (!TBB || HeadCond.empty()) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n"); + ++NumHeadBranchRejs; + return false; + } + + if (!parseCond(HeadCond, HeadCmpBBCC)) { + DEBUG(dbgs() << "Unsupported branch type on Head\n"); + ++NumHeadBranchRejs; + return false; + } + + // Make sure the branch direction is right. 
+ if (TBB != CmpBB) { + assert(TBB == Tail && "Unexpected TBB"); + HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC); + } + + CmpBBCond.clear(); + TBB = FBB = nullptr; + if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { + DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); + ++NumCmpBranchRejs; + return false; + } + + if (!TBB || CmpBBCond.empty()) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n"); + ++NumCmpBranchRejs; + return false; + } + + if (!parseCond(CmpBBCond, CmpBBTailCC)) { + DEBUG(dbgs() << "Unsupported branch type on CmpBB\n"); + ++NumCmpBranchRejs; + return false; + } + + if (TBB != Tail) + CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC); + + DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC) + << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC) + << '\n'); + + CmpMI = findConvertibleCompare(CmpBB); + if (!CmpMI) + return false; + + if (!canSpeculateInstrs(CmpBB, CmpMI)) { + ++NumSpeculateRejs; + return false; + } + return true; +} + +void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { + DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#" + << Head->getNumber() << ":\n" << *CmpBB); + + // All CmpBB instructions are moved into Head, and CmpBB is deleted. + // Update the CFG first. + updateTailPHIs(); + Head->removeSuccessor(CmpBB, true); + CmpBB->removeSuccessor(Tail, true); + Head->transferSuccessorsAndUpdatePHIs(CmpBB); + DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); + TII->RemoveBranch(*Head); + + // If the Head terminator was one of the cbz / tbz branches with built-in + // compare, we need to insert an explicit compare instruction in its place. + if (HeadCond[0].getImm() == -1) { + ++NumCompBranches; + unsigned Opc = 0; + switch (HeadCond[1].getImm()) { + case AArch64::CBZW: + case AArch64::CBNZW: + Opc = AArch64::SUBSWri; + break; + case AArch64::CBZX: + case AArch64::CBNZX: + Opc = AArch64::SUBSXri; + break; + default: + llvm_unreachable("Cannot convert Head branch"); + } + const MCInstrDesc &MCID = TII->get(Opc); + // Create a dummy virtual register for the SUBS def. + unsigned DestReg = + MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); + // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. + BuildMI(*Head, Head->end(), TermDL, MCID) + .addReg(DestReg, RegState::Define | RegState::Dead) + .addOperand(HeadCond[2]) + .addImm(0) + .addImm(0); + // SUBS uses the GPR*sp register classes. + MRI->constrainRegClass(HeadCond[2].getReg(), + TII->getRegClass(MCID, 1, TRI, *MF)); + } + + Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); + + // Now replace CmpMI with a ccmp instruction that also considers the incoming + // flags. + unsigned Opc = 0; + unsigned FirstOp = 1; // First CmpMI operand to copy. + bool isZBranch = false; // CmpMI is a cbz/cbnz instruction. 
+ switch (CmpMI->getOpcode()) { + default: + llvm_unreachable("Unknown compare opcode"); + case AArch64::SUBSWri: Opc = AArch64::CCMPWi; break; + case AArch64::SUBSWrr: Opc = AArch64::CCMPWr; break; + case AArch64::SUBSXri: Opc = AArch64::CCMPXi; break; + case AArch64::SUBSXrr: Opc = AArch64::CCMPXr; break; + case AArch64::ADDSWri: Opc = AArch64::CCMNWi; break; + case AArch64::ADDSWrr: Opc = AArch64::CCMNWr; break; + case AArch64::ADDSXri: Opc = AArch64::CCMNXi; break; + case AArch64::ADDSXrr: Opc = AArch64::CCMNXr; break; + case AArch64::FCMPSrr: Opc = AArch64::FCCMPSrr; FirstOp = 0; break; + case AArch64::FCMPDrr: Opc = AArch64::FCCMPDrr; FirstOp = 0; break; + case AArch64::FCMPESrr: Opc = AArch64::FCCMPESrr; FirstOp = 0; break; + case AArch64::FCMPEDrr: Opc = AArch64::FCCMPEDrr; FirstOp = 0; break; + case AArch64::CBZW: + case AArch64::CBNZW: + Opc = AArch64::CCMPWi; + FirstOp = 0; + isZBranch = true; + break; + case AArch64::CBZX: + case AArch64::CBNZX: + Opc = AArch64::CCMPXi; + FirstOp = 0; + isZBranch = true; + break; + } + + // The ccmp instruction should set the flags according to the comparison when + // Head would have branched to CmpBB. + // The NZCV immediate operand should provide flags for the case where Head + // would have branched to Tail. These flags should cause the new Head + // terminator to branch to tail. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); + const MCInstrDesc &MCID = TII->get(Opc); + MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), + TII->getRegClass(MCID, 0, TRI, *MF)); + if (CmpMI->getOperand(FirstOp + 1).isReg()) + MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), + TII->getRegClass(MCID, 1, TRI, *MF)); + MachineInstrBuilder MIB = + BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) + .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn + if (isZBranch) + MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0 + else + MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate + MIB.addImm(NZCV).addImm(HeadCmpBBCC); + + // If CmpMI was a terminator, we need a new conditional branch to replace it. + // This now becomes a Head terminator. + if (isZBranch) { + bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW || + CmpMI->getOpcode() == AArch64::CBNZX; + BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ) + .addOperand(CmpMI->getOperand(1)); // Branch target. + } + CmpMI->eraseFromParent(); + Head->updateTerminator(); + + RemovedBlocks.push_back(CmpBB); + CmpBB->eraseFromParent(); + DEBUG(dbgs() << "Result:\n" << *Head); + ++NumConverted; +} + +int SSACCmpConv::expectedCodeSizeDelta() const { + int delta = 0; + // If the Head terminator was one of the cbz / tbz branches with built-in + // compare, we need to insert an explicit compare instruction in its place + // plus a branch instruction. + if (HeadCond[0].getImm() == -1) { + switch (HeadCond[1].getImm()) { + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + // Therefore delta += 1 + delta = 1; + break; + default: + llvm_unreachable("Cannot convert Head branch"); + } + } + // If the Cmp terminator was one of the cbz / tbz branches with + // built-in compare, it will be turned into a compare instruction + // into Head, but we do not save any instruction. + // Otherwise, we save the branch instruction. 
+ switch (CmpMI->getOpcode()) { + default: + --delta; + break; + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + break; + } + return delta; +} + +//===----------------------------------------------------------------------===// +// AArch64ConditionalCompares Pass +//===----------------------------------------------------------------------===// + +namespace { +class AArch64ConditionalCompares : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MCSchedModel SchedModel; + // Does the proceeded function has Oz attribute. + bool MinSize; + MachineRegisterInfo *MRI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + SSACCmpConv CmpConv; + +public: + static char ID; + AArch64ConditionalCompares() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { + return "AArch64 Conditional Compares"; + } + +private: + bool tryConvert(MachineBasicBlock *); + void updateDomTree(ArrayRef<MachineBasicBlock *> Removed); + void updateLoops(ArrayRef<MachineBasicBlock *> Removed); + void invalidateTraces(); + bool shouldConvert(); +}; +} // end anonymous namespace + +char AArch64ConditionalCompares::ID = 0; + +namespace llvm { +void initializeAArch64ConditionalComparesPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) + +FunctionPass *llvm::createAArch64ConditionalCompares() { + return new AArch64ConditionalCompares(); +} + +void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + AU.addRequired<MachineTraceMetrics>(); + AU.addPreserved<MachineTraceMetrics>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Update the dominator tree after if-conversion erased some blocks. +void AArch64ConditionalCompares::updateDomTree( + ArrayRef<MachineBasicBlock *> Removed) { + // convert() removes CmpBB which was previously dominated by Head. + // CmpBB children should be transferred to Head. + MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); + for (MachineBasicBlock *RemovedMBB : Removed) { + MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB); + assert(Node != HeadNode && "Cannot erase the head node"); + assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); + while (Node->getNumChildren()) + DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + DomTree->eraseNode(RemovedMBB); + } +} + +/// Update LoopInfo after if-conversion. +void +AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) { + if (!Loops) + return; + for (MachineBasicBlock *RemovedMBB : Removed) + Loops->removeBlock(RemovedMBB); +} + +/// Invalidate MachineTraceMetrics before if-conversion. 
+void AArch64ConditionalCompares::invalidateTraces() { + Traces->invalidate(CmpConv.Head); + Traces->invalidate(CmpConv.CmpBB); +} + +/// Apply cost model and heuristics to the if-conversion in IfConv. +/// Return true if the conversion is a good idea. +/// +bool AArch64ConditionalCompares::shouldConvert() { + // Stress testing mode disables all cost considerations. + if (Stress) + return true; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + // Head dominates CmpBB, so it is always included in its trace. + MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB); + + // If code size is the main concern + if (MinSize) { + int CodeSizeDelta = CmpConv.expectedCodeSizeDelta(); + DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n'); + // If we are minimizing the code size, do the conversion whatever + // the cost is. + if (CodeSizeDelta < 0) + return true; + if (CodeSizeDelta > 0) { + DEBUG(dbgs() << "Code size is increasing, give up on this one.\n"); + return false; + } + // CodeSizeDelta == 0, continue with the regular heuristics + } + + // Heuristic: The compare conversion delays the execution of the branch + // instruction because we must wait for the inputs to the second compare as + // well. The branch has no dependent instructions, but delaying it increases + // the cost of a misprediction. + // + // Set a limit on the delay we will accept. + unsigned DelayLimit = SchedModel.MispredictPenalty * 3 / 4; + + // Instruction depths can be computed for all trace instructions above CmpBB. + unsigned HeadDepth = + Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; + unsigned CmpBBDepth = + Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; + DEBUG(dbgs() << "Head depth: " << HeadDepth + << "\nCmpBB depth: " << CmpBBDepth << '\n'); + if (CmpBBDepth > HeadDepth + DelayLimit) { + DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit + << " cycles.\n"); + return false; + } + + // Check the resource depth at the bottom of CmpBB - these instructions will + // be speculated. + unsigned ResDepth = Trace.getResourceDepth(true); + DEBUG(dbgs() << "Resources: " << ResDepth << '\n'); + + // Heuristic: The speculatively executed instructions must all be able to + // merge into the Head block. The Head critical path should dominate the + // resource cost of the speculated instructions. 
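// Worked example of the branch-delay check above (numbers are illustrative):
// with MispredictPenalty = 16, DelayLimit = 16 * 3 / 4 = 12 cycles, so a CmpBB
// whose compare sits at depth 25 below a Head terminator at depth 10 is
// rejected, since 25 > 10 + 12.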
+ if (ResDepth > HeadDepth) { + DEBUG(dbgs() << "Too many instructions to speculate.\n"); + return false; + } + return true; +} + +bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { + bool Changed = false; + while (CmpConv.canConvert(MBB) && shouldConvert()) { + invalidateTraces(); + SmallVector<MachineBasicBlock *, 4> RemovedBlocks; + CmpConv.convert(RemovedBlocks); + Changed = true; + updateDomTree(RemovedBlocks); + updateLoops(RemovedBlocks); + } + return Changed; +} + +bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" + << "********** Function: " << MF.getName() << '\n'); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); + SchedModel = MF.getSubtarget().getSchedModel(); + MRI = &MF.getRegInfo(); + DomTree = &getAnalysis<MachineDominatorTree>(); + Loops = getAnalysisIfAvailable<MachineLoopInfo>(); + Traces = &getAnalysis<MachineTraceMetrics>(); + MinInstr = nullptr; + MinSize = MF.getFunction()->optForMinSize(); + + bool Changed = false; + CmpConv.runOnMachineFunction(MF); + + // Visit blocks in dominator tree pre-order. The pre-order enables multiple + // cmp-conversions from the same head block. + // Note that updateDomTree() modifies the children of the DomTree node + // currently being visited. The df_iterator supports that; it doesn't look at + // child_begin() / child_end() until after a node has been visited. + for (auto *I : depth_first(DomTree)) + if (tryConvert(I->getBlock())) + Changed = true; + + return Changed; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp new file mode 100644 index 0000000..576cf4a --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -0,0 +1,147 @@ +//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// When allowed by the instruction, replace a dead definition of a GPR with +// the zero register. This makes the code a bit friendlier towards the +// hardware's register renamer. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetSubtargetInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-dead-defs" + +STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); + +namespace llvm { +void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry &); +} + +#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions" + +namespace { +class AArch64DeadRegisterDefinitions : public MachineFunctionPass { +private: + const TargetRegisterInfo *TRI; + bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI); + bool processMachineBasicBlock(MachineBasicBlock &MBB); + bool usesFrameIndex(const MachineInstr &MI); +public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) { + initializeAArch64DeadRegisterDefinitionsPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64DeadRegisterDefinitions::ID = 0; +} // end anonymous namespace + +INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs", + AARCH64_DEAD_REG_DEF_NAME, false, false) + +bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( + unsigned Reg, const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) + if (MO.isReg() && MO.isDef()) + if (TRI->regsOverlap(Reg, MO.getReg())) + return true; + return false; +} + +bool AArch64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) { + for (const MachineOperand &Op : MI.uses()) + if (Op.isFI()) + return true; + return false; +} + +bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( + MachineBasicBlock &MBB) { + bool Changed = false; + for (MachineInstr &MI : MBB) { + if (usesFrameIndex(MI)) { + // We need to skip this instruction because while it appears to have a + // dead def it uses a frame index which might expand into a multi + // instruction sequence during EPI. + DEBUG(dbgs() << " Ignoring, operand is frame index\n"); + continue; + } + for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); + if (MO.isReg() && MO.isDead() && MO.isDef()) { + assert(!MO.isImplicit() && "Unexpected implicit def!"); + DEBUG(dbgs() << " Dead def operand #" << i << " in:\n "; + MI.print(dbgs())); + // Be careful not to change the register if it's a tied operand. + if (MI.isRegTiedToUseOperand(i)) { + DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); + continue; + } + // Don't change the register if there's an implicit def of a subreg or + // supperreg. + if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { + DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); + continue; + } + // Make sure the instruction take a register class that contains + // the zero register and replace it if so. + unsigned NewReg; + switch (MI.getDesc().OpInfo[i].RegClass) { + default: + DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); + continue; + case AArch64::GPR32RegClassID: + NewReg = AArch64::WZR; + break; + case AArch64::GPR64RegClassID: + NewReg = AArch64::XZR; + break; + } + DEBUG(dbgs() << " Replacing with zero register. New:\n "); + MO.setReg(NewReg); + DEBUG(MI.print(dbgs())); + ++NumDeadDefsReplaced; + } + } + } + return Changed; +} + +// Scan the function for instructions that have a dead definition of a +// register. Replace that register with the zero register when possible. 
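// For example (an illustrative case, not taken from this diff): a subtraction
//
//   subs w8, w0, w1      // w8 is never read again, so the def is dead
//
// has a dead GPR32 definition, so the pass rewrites it as
//
//   subs wzr, w0, w1     // i.e. cmp w0, w1
//
// keeping the flag update while sparing the register renamer from tracking w8.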
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { + TRI = MF.getSubtarget().getRegisterInfo(); + bool Changed = false; + DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); + + for (auto &MBB : MF) + if (processMachineBasicBlock(MBB)) + Changed = true; + return Changed; +} + +FunctionPass *llvm::createAArch64DeadRegisterDefinitions() { + return new AArch64DeadRegisterDefinitions(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp new file mode 100644 index 0000000..d24e42a --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -0,0 +1,751 @@ +//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling and other late optimizations. This +// pass should be run after register allocation but before the post-regalloc +// scheduling pass. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/MathExtras.h" +using namespace llvm; + +namespace llvm { +void initializeAArch64ExpandPseudoPass(PassRegistry &); +} + +#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass" + +namespace { +class AArch64ExpandPseudo : public MachineFunctionPass { +public: + static char ID; + AArch64ExpandPseudo() : MachineFunctionPass(ID) { + initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry()); + } + + const AArch64InstrInfo *TII; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return AARCH64_EXPAND_PSEUDO_NAME; + } + +private: + bool expandMBB(MachineBasicBlock &MBB); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned BitSize); +}; +char AArch64ExpandPseudo::ID = 0; +} + +INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", + AARCH64_EXPAND_PSEUDO_NAME, false, false) + +/// \brief Transfer implicit operands on the pseudo instruction to the +/// instructions created from the expansion. +static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, + MachineInstrBuilder &DefMI) { + const MCInstrDesc &Desc = OldMI.getDesc(); + for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; + ++i) { + const MachineOperand &MO = OldMI.getOperand(i); + assert(MO.isReg() && MO.getReg()); + if (MO.isUse()) + UseMI.addOperand(MO); + else + DefMI.addOperand(MO); + } +} + +/// \brief Helper function which extracts the specified 16-bit chunk from a +/// 64-bit value. +static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + + return (Imm >> (ChunkIdx * 16)) & 0xFFFF; +} + +/// \brief Helper function which replicates a 16-bit chunk within a 64-bit +/// value. Indices correspond to element numbers in a v4i16. 
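// For instance (illustrative value): for Imm = 0x00FF123400FF00FF,
// getChunk(Imm, 2) == 0x1234, and replicateChunk(Imm, /*FromIdx=*/0, /*ToIdx=*/2)
// copies chunk 0 over chunk 2, giving 0x00FF00FF00FF00FF.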
+static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { + assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!"); + const unsigned ShiftAmt = ToIdx * 16; + + // Replicate the source chunk to the destination position. + const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt; + // Clear the destination chunk. + Imm &= ~(0xFFFFLL << ShiftAmt); + // Insert the replicated chunk. + return Imm | Chunk; +} + +/// \brief Helper function which tries to materialize a 64-bit value with an +/// ORR + MOVK instruction sequence. +static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + const unsigned ShiftAmt = ChunkIdx * 16; + + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { + // Create the ORR-immediate instruction. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + // Create the MOVK instruction. + const unsigned Imm16 = getChunk(UImm, ChunkIdx); + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + return false; +} + +/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width +/// can be materialized with an ORR instruction. +static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { + Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; + + return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding); +} + +/// \brief Check for identical 16-bit chunks within the constant and if so +/// materialize them with a single ORR instruction. The remaining one or two +/// 16-bit chunks will be materialized with MOVK instructions. +/// +/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order +/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with +/// an ORR instruction. +/// +static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII) { + typedef DenseMap<uint64_t, unsigned> CountMap; + CountMap Counts; + + // Scan the constant and count how often every chunk occurs. + for (unsigned Idx = 0; Idx < 4; ++Idx) + ++Counts[getChunk(UImm, Idx)]; + + // Traverse the chunks to find one which occurs more than once. + for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); + Chunk != End; ++Chunk) { + const uint64_t ChunkVal = Chunk->first; + const unsigned Count = Chunk->second; + + uint64_t Encoding = 0; + + // We are looking for chunks which have two or three instances and can be + // materialized with an ORR instruction. + if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) + continue; + + const bool CountThree = Count == 3; + // Create the ORR-immediate instruction. 
+ MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + + unsigned ShiftAmt = 0; + uint64_t Imm16 = 0; + // Find the first chunk not materialized with the ORR instruction. + for (; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the first MOVK instruction. + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, + RegState::Define | getDeadRegState(DstIsDead && CountThree)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + // In case we have three instances the whole constant is now materialized + // and we can exit. + if (CountThree) { + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + // Find the remaining chunk which needs to be materialized. + for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the second MOVK instruction. + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + transferImpOps(MI, MIB, MIB2); + MI.eraseFromParent(); + return true; + } + + return false; +} + +/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern +/// starts a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isStartChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == UINT64_MAX) + return false; + + return isMask_64(~Chunk); +} + +/// \brief Check whether this chunk matches the pattern '0...1...' This pattern +/// ends a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isEndChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == UINT64_MAX) + return false; + + return isMask_64(Chunk); +} + +/// \brief Clear or set all bits in the chunk at the given index. +static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { + const uint64_t Mask = 0xFFFF; + + if (Clear) + // Clear chunk in the immediate. + Imm &= ~(Mask << (Idx * 16)); + else + // Set all bits in the immediate for the particular chunk. + Imm |= Mask << (Idx * 16); + + return Imm; +} + +/// \brief Check whether the constant contains a sequence of contiguous ones, +/// which might be interrupted by one or two chunks. If so, materialize the +/// sequence of contiguous ones with an ORR instruction. +/// Materialize the chunks which are either interrupting the sequence or outside +/// of the sequence with a MOVK instruction. +/// +/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk +/// which ends the sequence (0...1...). Then we are looking for constants which +/// contain at least one S and E chunk. +/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. +/// +/// We are also looking for constants like |S|A|B|E| where the contiguous +/// sequence of ones wraps around the MSB into the LSB. 
+/// +static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII) { + const int NotSet = -1; + const uint64_t Mask = 0xFFFF; + + int StartIdx = NotSet; + int EndIdx = NotSet; + // Try to find the chunks which start/end a contiguous sequence of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + int64_t Chunk = getChunk(UImm, Idx); + // Sign extend the 16-bit chunk to 64-bit. + Chunk = (Chunk << 48) >> 48; + + if (isStartChunk(Chunk)) + StartIdx = Idx; + else if (isEndChunk(Chunk)) + EndIdx = Idx; + } + + // Early exit in case we can't find a start/end chunk. + if (StartIdx == NotSet || EndIdx == NotSet) + return false; + + // Outside of the contiguous sequence of ones everything needs to be zero. + uint64_t Outside = 0; + // Chunks between the start and end chunk need to have all their bits set. + uint64_t Inside = Mask; + + // If our contiguous sequence of ones wraps around from the MSB into the LSB, + // just swap indices and pretend we are materializing a contiguous sequence + // of zeros surrounded by a contiguous sequence of ones. + if (StartIdx > EndIdx) { + std::swap(StartIdx, EndIdx); + std::swap(Outside, Inside); + } + + uint64_t OrrImm = UImm; + int FirstMovkIdx = NotSet; + int SecondMovkIdx = NotSet; + + // Find out which chunks we need to patch up to obtain a contiguous sequence + // of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + const uint64_t Chunk = getChunk(UImm, Idx); + + // Check whether we are looking at a chunk which is not part of the + // contiguous sequence of ones. + if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { + OrrImm = updateImm(OrrImm, Idx, Outside == 0); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + + // Check whether we are looking a chunk which is part of the contiguous + // sequence of ones. + } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { + OrrImm = updateImm(OrrImm, Idx, Inside != Mask); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + } + } + assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); + + // Create the ORR-immediate instruction. + uint64_t Encoding = 0; + AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + + const bool SingleMovk = SecondMovkIdx == NotSet; + // Create the first MOVK instruction. + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, + RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) + .addReg(DstReg) + .addImm(getChunk(UImm, FirstMovkIdx)) + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16)); + + // Early exit in case we only need to emit a single MOVK instruction. + if (SingleMovk) { + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + // Create the second MOVK instruction. 
+ MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(getChunk(UImm, SecondMovkIdx)) + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16)); + + transferImpOps(MI, MIB, MIB2); + MI.eraseFromParent(); + return true; +} + +/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more +/// real move-immediate instructions to synthesize the immediate. +bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned BitSize) { + MachineInstr &MI = *MBBI; + uint64_t Imm = MI.getOperand(1).getImm(); + const unsigned Mask = 0xFFFF; + + // Try a MOVI instruction (aka ORR-immediate with the zero register). + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addOperand(MI.getOperand(0)) + .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR) + .addImm(Encoding); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + + // Scan the immediate and count the number of 16-bit chunks which are either + // all ones or all zeros. + unsigned OneChunks = 0; + unsigned ZeroChunks = 0; + for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { + const unsigned Chunk = (Imm >> Shift) & Mask; + if (Chunk == Mask) + OneChunks++; + else if (Chunk == 0) + ZeroChunks++; + } + + // Since we can't materialize the constant with a single ORR instruction, + // let's see whether we can materialize 3/4 of the constant with an ORR + // instruction and use an additional MOVK instruction to materialize the + // remaining 1/4. + // + // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|. + // + // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR, + // we would create the following instruction sequence: + // + // ORR x0, xzr, |A|X|A|X| + // MOVK x0, |B|, LSL #16 + // + // Only look at 64-bit constants which can't be materialized with a single + // instruction e.g. which have less than either three all zero or all one + // chunks. + // + // Ignore 32-bit constants here, they always can be materialized with a + // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized + // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair. + // Thus we fall back to the default code below which in the best case creates + // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one). + // + if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) { + // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2 + // identical? + if (getChunk(UImm, 0) == getChunk(UImm, 2)) { + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 3 into element 1. + uint64_t OrrImm = replicateChunk(UImm, 3, 1); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1)) + return true; + + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 1 into element 3. + OrrImm = replicateChunk(UImm, 1, 3); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3)) + return true; + + // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3 + // identical? 
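As a concrete instance of the element-0-and-2 branch just above (the symmetric element-1-and-3 branch follows below), here is the replication arithmetic on a made-up constant; the helpers mirror getChunk/replicateChunk but are standalone:

#include <cstdint>
#include <cstdio>

static uint64_t getChunk16(uint64_t Imm, unsigned Idx) {
  return (Imm >> (Idx * 16)) & 0xFFFFULL;
}
static uint64_t replicateChunk16(uint64_t Imm, unsigned From, unsigned To) {
  const uint64_t Chunk = getChunk16(Imm, From) << (To * 16);
  Imm &= ~(0xFFFFULL << (To * 16));   // clear the destination chunk
  return Imm | Chunk;                 // insert the replicated chunk
}

int main() {
  // Chunks (idx3..idx0): |0003|FFFC|ABCD|FFFC| -- elements 0 and 2 match.
  const uint64_t UImm = 0x0003FFFCABCDFFFCULL;
  // Copying element 3 into element 1 yields 0x0003FFFC0003FFFC, the 32-bit
  // pattern 0x0003FFFC repeated.  That pattern is a contiguous run of ones,
  // so it is a valid ORR logical immediate; a single MOVK #0xABCD, LSL #16
  // then restores the original constant.
  const uint64_t OrrImm = replicateChunk16(UImm, 3, 1);
  std::printf("UImm   = 0x%016llx\nOrrImm = 0x%016llx\n",
              (unsigned long long)UImm, (unsigned long long)OrrImm);
  return 0;
}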
+ } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) { + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 2 into element 0. + uint64_t OrrImm = replicateChunk(UImm, 2, 0); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0)) + return true; + + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 1 into element 3. + OrrImm = replicateChunk(UImm, 0, 2); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2)) + return true; + } + } + + // Check for identical 16-bit chunks within the constant and if so materialize + // them with a single ORR instruction. The remaining one or two 16-bit chunks + // will be materialized with MOVK instructions. + if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)) + return true; + + // Check whether the constant contains a sequence of contiguous ones, which + // might be interrupted by one or two chunks. If so, materialize the sequence + // of contiguous ones with an ORR instruction. Materialize the chunks which + // are either interrupting the sequence or outside of the sequence with a + // MOVK instruction. + if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)) + return true; + + // Use a MOVZ or MOVN instruction to set the high bits, followed by one or + // more MOVK instructions to insert additional 16-bit portions into the + // lower bits. + bool isNeg = false; + + // Use MOVN to materialize the high bits if we have more all one chunks + // than all zero chunks. + if (OneChunks > ZeroChunks) { + isNeg = true; + Imm = ~Imm; + } + + unsigned FirstOpc; + if (BitSize == 32) { + Imm &= (1LL << 32) - 1; + FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi); + } else { + FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi); + } + unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN + unsigned LastShift = 0; // LSL amount for last MOVK + if (Imm != 0) { + unsigned LZ = countLeadingZeros(Imm); + unsigned TZ = countTrailingZeros(Imm); + Shift = ((63 - LZ) / 16) * 16; + LastShift = (TZ / 16) * 16; + } + unsigned Imm16 = (Imm >> Shift) & Mask; + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) + .addReg(DstReg, RegState::Define | + getDeadRegState(DstIsDead && Shift == LastShift)) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + + // If a MOVN was used for the high bits of a negative value, flip the rest + // of the bits back for use with MOVK. + if (isNeg) + Imm = ~Imm; + + if (Shift == LastShift) { + transferImpOps(MI, MIB1, MIB1); + MI.eraseFromParent(); + return true; + } + + MachineInstrBuilder MIB2; + unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi); + while (Shift != LastShift) { + Shift -= 16; + Imm16 = (Imm >> Shift) & Mask; + if (Imm16 == (isNeg ? Mask : 0)) + continue; // This 16-bit portion is already set correctly. + MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(DstIsDead && Shift == LastShift)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + } + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; +} + +/// \brief If MBBI references a pseudo instruction that should be expanded here, +/// do the expansion and return true. Otherwise return false. 
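For constants that fall through to the MOVZ/MOVN + MOVK path just shown, the bookkeeping can be restated compactly. A standalone sketch that prints the move-wide sequence the fallback would pick (C++20 for <bit>; the mnemonic strings and the x0 register are illustrative output only, and the real pass only reaches this path after the ORR-based strategies above have failed):

#include <bit>      // std::countl_zero / std::countr_zero (C++20)
#include <cstdint>
#include <cstdio>

// Mirror of the 64-bit fallback: choose MOVZ or MOVN by which fill value
// (all-zero or all-one chunks) is more common, set the highest interesting
// chunk first, then MOVK every remaining chunk that is not already correct.
static void expandMovImm64(uint64_t Imm) {
  const uint64_t Mask = 0xFFFF;
  unsigned OneChunks = 0, ZeroChunks = 0;
  for (unsigned Shift = 0; Shift < 64; Shift += 16) {
    const uint64_t C = (Imm >> Shift) & Mask;
    OneChunks  += (C == Mask);
    ZeroChunks += (C == 0);
  }
  const bool UseMovn = OneChunks > ZeroChunks; // more 0xFFFF chunks -> MOVN
  const uint64_t Work = UseMovn ? ~Imm : Imm;
  unsigned Shift = 0, LastShift = 0;
  if (Work != 0) {
    Shift = ((63 - std::countl_zero(Work)) / 16) * 16;
    LastShift = (std::countr_zero(Work) / 16) * 16;
  }
  std::printf("%s x0, #0x%llx, lsl #%u\n", UseMovn ? "movn" : "movz",
              (unsigned long long)((Work >> Shift) & Mask), Shift);
  while (Shift != LastShift) {
    Shift -= 16;
    const uint64_t C = (Imm >> Shift) & Mask;  // MOVK always inserts real bits
    if (C == (UseMovn ? Mask : 0))
      continue;                                // chunk already holds the fill
    std::printf("movk x0, #0x%llx, lsl #%u\n", (unsigned long long)C, Shift);
  }
}

int main() {
  expandMovImm64(0x0000123400005678ULL); // movz + movk
  expandMovImm64(0xFFFFFFFF1234FFFFULL); // single movn
  return 0;
}

Skipping chunks that already hold the fill value is what keeps the worst case at one MOVZ/MOVN plus at most three MOVKs.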
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + + case AArch64::ADDWrr: + case AArch64::SUBWrr: + case AArch64::ADDXrr: + case AArch64::SUBXrr: + case AArch64::ADDSWrr: + case AArch64::SUBSWrr: + case AArch64::ADDSXrr: + case AArch64::SUBSXrr: + case AArch64::ANDWrr: + case AArch64::ANDXrr: + case AArch64::BICWrr: + case AArch64::BICXrr: + case AArch64::ANDSWrr: + case AArch64::ANDSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::EONWrr: + case AArch64::EONXrr: + case AArch64::EORWrr: + case AArch64::EORXrr: + case AArch64::ORNWrr: + case AArch64::ORNXrr: + case AArch64::ORRWrr: + case AArch64::ORRXrr: { + unsigned Opcode; + switch (MI.getOpcode()) { + default: + return false; + case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break; + case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break; + case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break; + case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break; + case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break; + case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break; + case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break; + case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break; + case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break; + case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break; + case AArch64::BICWrr: Opcode = AArch64::BICWrs; break; + case AArch64::BICXrr: Opcode = AArch64::BICXrs; break; + case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break; + case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break; + case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break; + case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break; + case AArch64::EONWrr: Opcode = AArch64::EONWrs; break; + case AArch64::EONXrr: Opcode = AArch64::EONXrs; break; + case AArch64::EORWrr: Opcode = AArch64::EORWrs; break; + case AArch64::EORXrr: Opcode = AArch64::EORXrs; break; + case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break; + case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break; + case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break; + case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break; + } + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), + MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + transferImpOps(MI, MIB1, MIB1); + MI.eraseFromParent(); + return true; + } + + case AArch64::LOADgot: { + // Expand into ADRP + LDR. 
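Before the expansion body, a note on the MO_PAGE / MO_PAGEOFF flags it attaches: ADRP materializes the 4 KiB-aligned page of the symbol, and the companion LDR or ADD supplies the low 12 bits. The split itself is plain masking (the address below is made up):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t SymAddr = 0x0000000012345678ULL;      // hypothetical symbol
  const uint64_t Page    = SymAddr & ~uint64_t(0xFFF); // 4 KiB page (MO_PAGE)
  const uint64_t Lo12    = SymAddr & 0xFFF;            // low 12 bits (MO_PAGEOFF)
  std::printf("page = 0x%llx, lo12 = 0x%llx\n",
              (unsigned long long)Page, (unsigned long long)Lo12);
  // 0x12345000 + 0x678 reconstructs 0x12345678.
  return 0;
}

ADRP itself is PC-relative, so the linker fills in the page delta; only the page/offset split is illustrated here.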
+ unsigned DstReg = MI.getOperand(0).getReg(); + const MachineOperand &MO1 = MI.getOperand(1); + unsigned Flags = MO1.getTargetFlags(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) + .addOperand(MI.getOperand(0)) + .addReg(DstReg); + + if (MO1.isGlobal()) { + MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); + MIB2.addGlobalAddress(MO1.getGlobal(), 0, + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else if (MO1.isSymbol()) { + MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE); + MIB2.addExternalSymbol(MO1.getSymbolName(), + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else { + assert(MO1.isCPI() && + "Only expect globals, externalsymbols, or constant pools"); + MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), + Flags | AArch64II::MO_PAGE); + MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), + Flags | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case AArch64::MOVaddr: + case AArch64::MOVaddrJT: + case AArch64::MOVaddrCP: + case AArch64::MOVaddrBA: + case AArch64::MOVaddrTLS: + case AArch64::MOVaddrEXT: { + // Expand into ADRP + ADD. + unsigned DstReg = MI.getOperand(0).getReg(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) + .addOperand(MI.getOperand(1)); + + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) + .addOperand(MI.getOperand(0)) + .addReg(DstReg) + .addOperand(MI.getOperand(2)) + .addImm(0); + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case AArch64::MOVi32imm: + return expandMOVImm(MBB, MBBI, 32); + case AArch64::MOVi64imm: + return expandMOVImm(MBB, MBBI, 64); + case AArch64::RET_ReallyLR: { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET)) + .addReg(AArch64::LR); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + } + return false; +} + +/// \brief Iterate over the instructions in basic block MBB and expand any +/// pseudo instructions. Return true if anything was modified. +bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= expandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + + bool Modified = false; + for (auto &MBB : MF) + Modified |= expandMBB(MBB); + return Modified; +} + +/// \brief Returns an instance of the pseudo instruction expansion pass. +FunctionPass *llvm::createAArch64ExpandPseudoPass() { + return new AArch64ExpandPseudo(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp new file mode 100644 index 0000000..0ac4b39 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -0,0 +1,4963 @@ +//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AArch64-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// AArch64GenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64CallingConvention.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +namespace { + +class AArch64FastISel final : public FastISel { + class Address { + public: + typedef enum { + RegBase, + FrameIndexBase + } BaseKind; + + private: + BaseKind Kind; + AArch64_AM::ShiftExtendType ExtType; + union { + unsigned Reg; + int FI; + } Base; + unsigned OffsetReg; + unsigned Shift; + int64_t Offset; + const GlobalValue *GV; + + public: + Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend), + OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; } + void setKind(BaseKind K) { Kind = K; } + BaseKind getKind() const { return Kind; } + void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; } + AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; } + bool isRegBase() const { return Kind == RegBase; } + bool isFIBase() const { return Kind == FrameIndexBase; } + void setReg(unsigned Reg) { + assert(isRegBase() && "Invalid base register access!"); + Base.Reg = Reg; + } + unsigned getReg() const { + assert(isRegBase() && "Invalid base register access!"); + return Base.Reg; + } + void setOffsetReg(unsigned Reg) { + OffsetReg = Reg; + } + unsigned getOffsetReg() const { + return OffsetReg; + } + void setFI(unsigned FI) { + assert(isFIBase() && "Invalid base frame index access!"); + Base.FI = FI; + } + unsigned getFI() const { + assert(isFIBase() && "Invalid base frame index access!"); + return Base.FI; + } + void setOffset(int64_t O) { Offset = O; } + int64_t getOffset() { return Offset; } + void setShift(unsigned S) { Shift = S; } + unsigned getShift() { return Shift; } + + void setGlobalValue(const GlobalValue *G) { GV = G; } + const GlobalValue *getGlobalValue() { return GV; } + }; + + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + LLVMContext *Context; + + bool fastLowerArguments() override; + bool fastLowerCall(CallLoweringInfo &CLI) override; + bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; + +private: + // Selection routines. 
+ bool selectAddSub(const Instruction *I); + bool selectLogicalOp(const Instruction *I); + bool selectLoad(const Instruction *I); + bool selectStore(const Instruction *I); + bool selectBranch(const Instruction *I); + bool selectIndirectBr(const Instruction *I); + bool selectCmp(const Instruction *I); + bool selectSelect(const Instruction *I); + bool selectFPExt(const Instruction *I); + bool selectFPTrunc(const Instruction *I); + bool selectFPToInt(const Instruction *I, bool Signed); + bool selectIntToFP(const Instruction *I, bool Signed); + bool selectRem(const Instruction *I, unsigned ISDOpcode); + bool selectRet(const Instruction *I); + bool selectTrunc(const Instruction *I); + bool selectIntExt(const Instruction *I); + bool selectMul(const Instruction *I); + bool selectShift(const Instruction *I); + bool selectBitCast(const Instruction *I); + bool selectFRem(const Instruction *I); + bool selectSDiv(const Instruction *I); + bool selectGetElementPtr(const Instruction *I); + + // Utility helper routines. + bool isTypeLegal(Type *Ty, MVT &VT); + bool isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed = false); + bool isValueAvailable(const Value *V) const; + bool computeAddress(const Value *Obj, Address &Addr, Type *Ty = nullptr); + bool computeCallAddress(const Value *V, Address &Addr); + bool simplifyAddress(Address &Addr, MVT VT); + void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, + unsigned Flags, unsigned ScaleFactor, + MachineMemOperand *MMO); + bool isMemCpySmall(uint64_t Len, unsigned Alignment); + bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, + unsigned Alignment); + bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I, + const Value *Cond); + bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT); + bool optimizeSelect(const SelectInst *SI); + std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx); + + // Emit helper routines. + unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, + const Value *RHS, bool SetFlags = false, + bool WantResult = true, bool IsZExt = false); + unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, + bool SetFlags = false, bool WantResult = true); + unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, uint64_t Imm, bool SetFlags = false, + bool WantResult = true); + unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, + AArch64_AM::ShiftExtendType ShiftType, + uint64_t ShiftImm, bool SetFlags = false, + bool WantResult = true); + unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, + AArch64_AM::ShiftExtendType ExtType, + uint64_t ShiftImm, bool SetFlags = false, + bool WantResult = true); + + // Emit functions. 
+ bool emitCompareAndBranch(const BranchInst *BI); + bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt); + bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt); + bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); + bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS); + unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true, + MachineMemOperand *MMO = nullptr); + bool emitStore(MVT VT, unsigned SrcReg, Address Addr, + MachineMemOperand *MMO = nullptr); + unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, + bool SetFlags = false, bool WantResult = true, + bool IsZExt = false); + unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm); + unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS, + bool SetFlags = false, bool WantResult = true, + bool IsZExt = false); + unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill, + unsigned RHSReg, bool RHSIsKill, bool WantResult = true); + unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill, + unsigned RHSReg, bool RHSIsKill, + AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, + bool WantResult = true); + unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS, + const Value *RHS); + unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, uint64_t Imm); + unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, + uint64_t ShiftImm); + unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); + unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill); + unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, + uint64_t Imm, bool IsZExt = true); + unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill); + unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, + uint64_t Imm, bool IsZExt = true); + unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill); + unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, + uint64_t Imm, bool IsZExt = false); + + unsigned materializeInt(const ConstantInt *CI, MVT VT); + unsigned materializeFP(const ConstantFP *CFP, MVT VT); + unsigned materializeGV(const GlobalValue *GV); + + // Call handling routines. +private: + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; + bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs, + unsigned &NumBytes); + bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes); + +public: + // Backend specific FastISel code. 
+ unsigned fastMaterializeAlloca(const AllocaInst *AI) override; + unsigned fastMaterializeConstant(const Constant *C) override; + unsigned fastMaterializeFloatZero(const ConstantFP* CF) override; + + explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) + : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) { + Subtarget = + &static_cast<const AArch64Subtarget &>(FuncInfo.MF->getSubtarget()); + Context = &FuncInfo.Fn->getContext(); + } + + bool fastSelectInstruction(const Instruction *I) override; + +#include "AArch64GenFastISel.inc" +}; + +} // end anonymous namespace + +#include "AArch64GenCallingConv.inc" + +/// \brief Check if the sign-/zero-extend will be a noop. +static bool isIntExtFree(const Instruction *I) { + assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) && + "Unexpected integer extend instruction."); + assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() && + "Unexpected value type."); + bool IsZExt = isa<ZExtInst>(I); + + if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0))) + if (LI->hasOneUse()) + return true; + + if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) + if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) + return true; + + return false; +} + +/// \brief Determine the implicit scale factor that is applied by a memory +/// operation for a given value type. +static unsigned getImplicitScaleFactor(MVT VT) { + switch (VT.SimpleTy) { + default: + return 0; // invalid + case MVT::i1: // fall-through + case MVT::i8: + return 1; + case MVT::i16: + return 2; + case MVT::i32: // fall-through + case MVT::f32: + return 4; + case MVT::i64: // fall-through + case MVT::f64: + return 8; + } +} + +CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { + if (CC == CallingConv::WebKit_JS) + return CC_AArch64_WebKit_JS; + if (CC == CallingConv::GHC) + return CC_AArch64_GHC; + return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; +} + +unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) { + assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i64 && + "Alloca should always return a pointer."); + + // Don't handle dynamic allocas. + if (!FuncInfo.StaticAllocaMap.count(AI)) + return 0; + + DenseMap<const AllocaInst *, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + if (SI != FuncInfo.StaticAllocaMap.end()) { + unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addFrameIndex(SI->second) + .addImm(0) + .addImm(0); + return ResultReg; + } + + return 0; +} + +unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) { + if (VT > MVT::i64) + return 0; + + if (!CI->isZero()) + return fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue()); + + // Create a copy from the zero register to materialize a "0" value. + const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass; + unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), + ResultReg).addReg(ZeroReg, getKillRegState(true)); + return ResultReg; +} + +unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { + // Positive zero (+0.0) has to be materialized with a fmov from the zero + // register, because the immediate version of fmov cannot encode zero. 
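To expand on the comment above: the FMOV (scalar, immediate) form can only encode values of the shape ±(n/16) × 2^r with n in 16..31 and r in -3..4, which excludes +0.0 and most other doubles; that is what the isFPImmLegal check below decides, and why the remaining cases go through the constant pool. A brute-force standalone check of that set (my reading of the ARM encoding, not an LLVM helper):

#include <cmath>
#include <cstdio>

// True if V is in the FMOV-immediate set: +/- (n/16) * 2^r, n in [16,31],
// r in [-3,4].  +0.0 is not in the set, hence the fmov-from-zero-register path.
static bool isFMOVImmediate(double V) {
  for (int Sign = 0; Sign < 2; ++Sign)
    for (int N = 16; N <= 31; ++N)
      for (int R = -3; R <= 4; ++R) {
        const double Cand = std::ldexp(N / 16.0, R);
        if (V == (Sign ? -Cand : Cand))
          return true;
      }
  return false;
}

int main() {
  std::printf("0.0  -> %d\n", isFMOVImmediate(0.0));   // 0 (not encodable)
  std::printf("1.0  -> %d\n", isFMOVImmediate(1.0));   // 1
  std::printf("-2.5 -> %d\n", isFMOVImmediate(-2.5));  // 1
  std::printf("0.1  -> %d\n", isFMOVImmediate(0.1));   // 0
  return 0;
}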
+ if (CFP->isNullValue()) + return fastMaterializeFloatZero(CFP); + + if (VT != MVT::f32 && VT != MVT::f64) + return 0; + + const APFloat Val = CFP->getValueAPF(); + bool Is64Bit = (VT == MVT::f64); + // This checks to see if we can use FMOV instructions to materialize + // a constant, otherwise we have to materialize via the constant pool. + if (TLI.isFPImmLegal(Val, VT)) { + int Imm = + Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val); + assert((Imm != -1) && "Cannot encode floating-point constant."); + unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi; + return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); + } + + // For the MachO large code model materialize the FP constant in code. + if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm; + const TargetRegisterClass *RC = Is64Bit ? + &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + unsigned TmpReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg) + .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(TmpReg, getKillRegState(true)); + + return ResultReg; + } + + // Materialize via constant pool. MachineConstantPool wants an explicit + // alignment. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) + Align = DL.getTypeAllocSize(CFP->getType()); + + unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE); + + unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui; + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(ADRPReg) + .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + return ResultReg; +} + +unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { + // We can't handle thread-local variables quickly yet. + if (GV->isThreadLocal()) + return 0; + + // MachO still uses GOT for large code-model accesses, but ELF requires + // movz/movk sequences, which FastISel doesn't handle yet. + if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO()) + return 0; + + unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); + if (!DestEVT.isSimple()) + return 0; + + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + unsigned ResultReg; + + if (OpFlags & AArch64II::MO_GOT) { + // ADRP + LDRX + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE); + + ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + ResultReg) + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } else if (OpFlags & AArch64II::MO_CONSTPOOL) { + // We can't handle addresses loaded from a constant pool quickly yet. 
+ return 0; + } else { + // ADRP + ADDX + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_PAGE); + + ResultReg = createResultReg(&AArch64::GPR64spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC) + .addImm(0); + } + return ResultReg; +} + +unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(DL, C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + if (const auto *CI = dyn_cast<ConstantInt>(C)) + return materializeInt(CI, VT); + else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return materializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return materializeGV(GV); + + return 0; +} + +unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) { + assert(CFP->isNullValue() && + "Floating-point constant is not a positive zero."); + MVT VT; + if (!isTypeLegal(CFP->getType(), VT)) + return 0; + + if (VT != MVT::f32 && VT != MVT::f64) + return 0; + + bool Is64Bit = (VT == MVT::f64); + unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr; + return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true); +} + +/// \brief Check if the multiply is by a power-of-2 constant. +static bool isMulPowOf2(const Value *I) { + if (const auto *MI = dyn_cast<MulOperator>(I)) { + if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(0))) + if (C->getValue().isPowerOf2()) + return true; + if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(1))) + if (C->getValue().isPowerOf2()) + return true; + } + return false; +} + +// Computes the address to get to an object. +bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) +{ + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(Obj)) { + // Don't walk into other basic blocks unless the object is an alloca from + // another block, otherwise it may not have a virtual register assigned. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) { + Opcode = C->getOpcode(); + U = C; + } + + if (auto *Ty = dyn_cast<PointerType>(Obj->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: + break; + case Instruction::BitCast: { + // Look through bitcasts. + return computeAddress(U->getOperand(0), Addr, Ty); + } + case Instruction::IntToPtr: { + // Look past no-op inttoptrs. + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) + return computeAddress(U->getOperand(0), Addr, Ty); + break; + } + case Instruction::PtrToInt: { + // Look past no-op ptrtoints. + if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) + return computeAddress(U->getOperand(0), Addr, Ty); + break; + } + case Instruction::GetElementPtr: { + Address SavedAddr = Addr; + uint64_t TmpOffset = Addr.getOffset(); + + // Iterate through the GEP folding the constants into offsets where + // we can. 
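The folding loop that follows only accumulates byte offsets: a struct index contributes the field offset from the layout, and an array index contributes index times the element's allocation size. The same arithmetic in plain C++ on a made-up type, with the host ABI standing in for DataLayout:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// What "folding constants into offsets" amounts to for a GEP like &s->Buf[3]:
// struct field offset plus constant index * element size.
struct S {
  int64_t Tag;
  int32_t Buf[8];
};

int main() {
  const uint64_t FieldOff = offsetof(S, Buf);   // from the struct layout (8)
  const uint64_t ElemSize = sizeof(int32_t);    // allocation size of i32 (4)
  const uint64_t Index    = 3;                  // constant GEP index
  std::printf("total offset = %llu bytes\n",
              (unsigned long long)(FieldOff + Index * ElemSize)); // 8 + 12 = 20
  return 0;
}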
+ gep_type_iterator GTI = gep_type_begin(U); + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; + ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); + TmpOffset += SL->getElementOffset(Idx); + } else { + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + TmpOffset += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + TmpOffset += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } + } + } + + // Try to grab the base operand now. + Addr.setOffset(TmpOffset); + if (computeAddress(U->getOperand(0), Addr, Ty)) + return true; + + // We failed, restore everything and try the other options. + Addr = SavedAddr; + + unsupported_gep: + break; + } + case Instruction::Alloca: { + const AllocaInst *AI = cast<AllocaInst>(Obj); + DenseMap<const AllocaInst *, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.setKind(Address::FrameIndexBase); + Addr.setFI(SI->second); + return true; + } + break; + } + case Instruction::Add: { + // Adds of constants are common and easy enough. + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (isa<ConstantInt>(LHS)) + std::swap(LHS, RHS); + + if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) { + Addr.setOffset(Addr.getOffset() + CI->getSExtValue()); + return computeAddress(LHS, Addr, Ty); + } + + Address Backup = Addr; + if (computeAddress(LHS, Addr, Ty) && computeAddress(RHS, Addr, Ty)) + return true; + Addr = Backup; + + break; + } + case Instruction::Sub: { + // Subs of constants are common and easy enough. + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) { + Addr.setOffset(Addr.getOffset() - CI->getSExtValue()); + return computeAddress(LHS, Addr, Ty); + } + break; + } + case Instruction::Shl: { + if (Addr.getOffsetReg()) + break; + + const auto *CI = dyn_cast<ConstantInt>(U->getOperand(1)); + if (!CI) + break; + + unsigned Val = CI->getZExtValue(); + if (Val < 1 || Val > 3) + break; + + uint64_t NumBytes = 0; + if (Ty && Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (NumBytes != (1ULL << Val)) + break; + + Addr.setShift(Val); + Addr.setExtendType(AArch64_AM::LSL); + + const Value *Src = U->getOperand(0); + if (const auto *I = dyn_cast<Instruction>(Src)) { + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + // Fold the zext or sext when it won't become a noop. 
+ if (const auto *ZE = dyn_cast<ZExtInst>(I)) { + if (!isIntExtFree(ZE) && + ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast<SExtInst>(I)) { + if (!isIntExtFree(SE) && + SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } + } + } + + if (const auto *AI = dyn_cast<BinaryOperator>(Src)) + if (AI->getOpcode() == Instruction::And) { + const Value *LHS = AI->getOperand(0); + const Value *RHS = AI->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(LHS)) + if (C->getValue() == 0xffffffff) + std::swap(LHS, RHS); + + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 0xffffffff) { + Addr.setExtendType(AArch64_AM::UXTW); + unsigned Reg = getRegForValue(LHS); + if (!Reg) + return false; + bool RegIsKill = hasTrivialKill(LHS); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, + AArch64::sub_32); + Addr.setOffsetReg(Reg); + return true; + } + } + + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + case Instruction::Mul: { + if (Addr.getOffsetReg()) + break; + + if (!isMulPowOf2(U)) + break; + + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + // Canonicalize power-of-2 value to the RHS. + if (const auto *C = dyn_cast<ConstantInt>(LHS)) + if (C->getValue().isPowerOf2()) + std::swap(LHS, RHS); + + assert(isa<ConstantInt>(RHS) && "Expected an ConstantInt."); + const auto *C = cast<ConstantInt>(RHS); + unsigned Val = C->getValue().logBase2(); + if (Val < 1 || Val > 3) + break; + + uint64_t NumBytes = 0; + if (Ty && Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (NumBytes != (1ULL << Val)) + break; + + Addr.setShift(Val); + Addr.setExtendType(AArch64_AM::LSL); + + const Value *Src = LHS; + if (const auto *I = dyn_cast<Instruction>(Src)) { + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + // Fold the zext or sext when it won't become a noop. 
+ if (const auto *ZE = dyn_cast<ZExtInst>(I)) { + if (!isIntExtFree(ZE) && + ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast<SExtInst>(I)) { + if (!isIntExtFree(SE) && + SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } + } + } + + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + case Instruction::And: { + if (Addr.getOffsetReg()) + break; + + if (!Ty || DL.getTypeSizeInBits(Ty) != 8) + break; + + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(LHS)) + if (C->getValue() == 0xffffffff) + std::swap(LHS, RHS); + + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 0xffffffff) { + Addr.setShift(0); + Addr.setExtendType(AArch64_AM::LSL); + Addr.setExtendType(AArch64_AM::UXTW); + + unsigned Reg = getRegForValue(LHS); + if (!Reg) + return false; + bool RegIsKill = hasTrivialKill(LHS); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, + AArch64::sub_32); + Addr.setOffsetReg(Reg); + return true; + } + break; + } + case Instruction::SExt: + case Instruction::ZExt: { + if (!Addr.getReg() || Addr.getOffsetReg()) + break; + + const Value *Src = nullptr; + // Fold the zext or sext when it won't become a noop. + if (const auto *ZE = dyn_cast<ZExtInst>(U)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast<SExtInst>(U)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } + + if (!Src) + break; + + Addr.setShift(0); + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + } // end switch + + if (Addr.isRegBase() && !Addr.getReg()) { + unsigned Reg = getRegForValue(Obj); + if (!Reg) + return false; + Addr.setReg(Reg); + return true; + } + + if (!Addr.getOffsetReg()) { + unsigned Reg = getRegForValue(Obj); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + + return false; +} + +bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + bool InMBB = true; + + if (const auto *I = dyn_cast<Instruction>(V)) { + Opcode = I->getOpcode(); + U = I; + InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock(); + } else if (const auto *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts if its operand is in the same BB. + if (InMBB) + return computeCallAddress(U->getOperand(0), Addr); + break; + case Instruction::IntToPtr: + // Look past no-op inttoptrs if its operand is in the same BB. + if (InMBB && + TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) + return computeCallAddress(U->getOperand(0), Addr); + break; + case Instruction::PtrToInt: + // Look past no-op ptrtoints if its operand is in the same BB. 
+ if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) + return computeCallAddress(U->getOperand(0), Addr); + break; + } + + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + Addr.setGlobalValue(GV); + return true; + } + + // If all else fails, try to materialize the value in a register. + if (!Addr.getGlobalValue()) { + Addr.setReg(getRegForValue(V)); + return Addr.getReg() != 0; + } + + return false; +} + + +bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { + EVT evt = TLI.getValueType(DL, Ty, true); + + // Only handle simple types. + if (evt == MVT::Other || !evt.isSimple()) + return false; + VT = evt.getSimpleVT(); + + // This is a legal type, but it's not something we handle in fast-isel. + if (VT == MVT::f128) + return false; + + // Handle all other legal types, i.e. a register that will directly hold this + // value. + return TLI.isTypeLegal(VT); +} + +/// \brief Determine if the value type is supported by FastISel. +/// +/// FastISel for AArch64 can handle more value types than are legal. This adds +/// simple value type such as i1, i8, and i16. +bool AArch64FastISel::isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed) { + if (Ty->isVectorTy() && !IsVectorAllowed) + return false; + + if (isTypeLegal(Ty, VT)) + return true; + + // If this is a type than can be sign or zero-extended to a basic operation + // go ahead and accept it now. + if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) + return true; + + return false; +} + +bool AArch64FastISel::isValueAvailable(const Value *V) const { + if (!isa<Instruction>(V)) + return true; + + const auto *I = cast<Instruction>(V); + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) + return true; + + return false; +} + +bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + return false; + + bool ImmediateOffsetNeedsLowering = false; + bool RegisterOffsetNeedsLowering = false; + int64_t Offset = Addr.getOffset(); + if (((Offset < 0) || (Offset & (ScaleFactor - 1))) && !isInt<9>(Offset)) + ImmediateOffsetNeedsLowering = true; + else if (Offset > 0 && !(Offset & (ScaleFactor - 1)) && + !isUInt<12>(Offset / ScaleFactor)) + ImmediateOffsetNeedsLowering = true; + + // Cannot encode an offset register and an immediate offset in the same + // instruction. Fold the immediate offset into the load/store instruction and + // emit an additional add to take care of the offset register. + if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg()) + RegisterOffsetNeedsLowering = true; + + // Cannot encode zero register as base. + if (Addr.isRegBase() && Addr.getOffsetReg() && !Addr.getReg()) + RegisterOffsetNeedsLowering = true; + + // If this is a stack pointer and the offset needs to be simplified then put + // the alloca address into a register, set the base type back to register and + // continue. This should almost never happen. 
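The two immediate forms tested at the top of simplifyAddress are the unscaled signed 9-bit offset (LDUR/STUR) and the scaled unsigned 12-bit offset of the plain LDR/STR immediate form. A standalone restatement of that encodability check (helper names are mine):

#include <cstdint>
#include <cstdio>

// Which memory-offset form can encode a given byte offset for an access of
// ScaleBytes (1, 2, 4 or 8)?
static bool fitsUnscaled9(int64_t Off) {                     // LDUR/STUR
  return Off >= -256 && Off <= 255;
}
static bool fitsScaled12(int64_t Off, unsigned ScaleBytes) { // LDR/STR ui form
  return Off >= 0 && Off % ScaleBytes == 0 && (Off / ScaleBytes) < 4096;
}

int main() {
  struct { int64_t Off; unsigned Scale; } Tests[] = {
      {-16, 8}, {24, 8}, {3, 4}, {40000, 4}};
  for (const auto &T : Tests)
    std::printf("off=%6lld scale=%u  unscaled9=%d scaled12=%d\n",
                (long long)T.Off, T.Scale, fitsUnscaled9(T.Off),
                fitsScaled12(T.Off, T.Scale));
  return 0;
}

Offsets that fit neither form (such as 40000 above) are the ones the ImmediateOffsetNeedsLowering path folds into a separate add.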
+ if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase()) + { + unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addFrameIndex(Addr.getFI()) + .addImm(0) + .addImm(0); + Addr.setKind(Address::RegBase); + Addr.setReg(ResultReg); + } + + if (RegisterOffsetNeedsLowering) { + unsigned ResultReg = 0; + if (Addr.getReg()) { + if (Addr.getExtendType() == AArch64_AM::SXTW || + Addr.getExtendType() == AArch64_AM::UXTW ) + ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(), + /*TODO:IsKill=*/false, Addr.getOffsetReg(), + /*TODO:IsKill=*/false, Addr.getExtendType(), + Addr.getShift()); + else + ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(), + /*TODO:IsKill=*/false, Addr.getOffsetReg(), + /*TODO:IsKill=*/false, AArch64_AM::LSL, + Addr.getShift()); + } else { + if (Addr.getExtendType() == AArch64_AM::UXTW) + ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(), + /*Op0IsKill=*/false, Addr.getShift(), + /*IsZExt=*/true); + else if (Addr.getExtendType() == AArch64_AM::SXTW) + ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(), + /*Op0IsKill=*/false, Addr.getShift(), + /*IsZExt=*/false); + else + ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(), + /*Op0IsKill=*/false, Addr.getShift()); + } + if (!ResultReg) + return false; + + Addr.setReg(ResultReg); + Addr.setOffsetReg(0); + Addr.setShift(0); + Addr.setExtendType(AArch64_AM::InvalidShiftExtend); + } + + // Since the offset is too large for the load/store instruction get the + // reg+offset into a register. + if (ImmediateOffsetNeedsLowering) { + unsigned ResultReg; + if (Addr.getReg()) + // Try to fold the immediate into the add instruction. + ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset); + else + ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset); + + if (!ResultReg) + return false; + Addr.setReg(ResultReg); + Addr.setOffset(0); + } + return true; +} + +void AArch64FastISel::addLoadStoreOperands(Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags, + unsigned ScaleFactor, + MachineMemOperand *MMO) { + int64_t Offset = Addr.getOffset() / ScaleFactor; + // Frame base works a bit differently. Handle it separately. + if (Addr.isFIBase()) { + int FI = Addr.getFI(); + // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size + // and alignment should be based on the VT. + MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + // Now add the rest of the operands. + MIB.addFrameIndex(FI).addImm(Offset); + } else { + assert(Addr.isRegBase() && "Unexpected address kind."); + const MCInstrDesc &II = MIB->getDesc(); + unsigned Idx = (Flags & MachineMemOperand::MOStore) ? 
1 : 0; + Addr.setReg( + constrainOperandRegClass(II, Addr.getReg(), II.getNumDefs()+Idx)); + Addr.setOffsetReg( + constrainOperandRegClass(II, Addr.getOffsetReg(), II.getNumDefs()+Idx+1)); + if (Addr.getOffsetReg()) { + assert(Addr.getOffset() == 0 && "Unexpected offset"); + bool IsSigned = Addr.getExtendType() == AArch64_AM::SXTW || + Addr.getExtendType() == AArch64_AM::SXTX; + MIB.addReg(Addr.getReg()); + MIB.addReg(Addr.getOffsetReg()); + MIB.addImm(IsSigned); + MIB.addImm(Addr.getShift() != 0); + } else + MIB.addReg(Addr.getReg()).addImm(Offset); + } + + if (MMO) + MIB.addMemOperand(MMO); +} + +unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, + const Value *RHS, bool SetFlags, + bool WantResult, bool IsZExt) { + AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::InvalidShiftExtend; + bool NeedExtend = false; + switch (RetVT.SimpleTy) { + default: + return 0; + case MVT::i1: + NeedExtend = true; + break; + case MVT::i8: + NeedExtend = true; + ExtendType = IsZExt ? AArch64_AM::UXTB : AArch64_AM::SXTB; + break; + case MVT::i16: + NeedExtend = true; + ExtendType = IsZExt ? AArch64_AM::UXTH : AArch64_AM::SXTH; + break; + case MVT::i32: // fall-through + case MVT::i64: + break; + } + MVT SrcVT = RetVT; + RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32); + + // Canonicalize immediates to the RHS first. + if (UseAdd && isa<Constant>(LHS) && !isa<Constant>(RHS)) + std::swap(LHS, RHS); + + // Canonicalize mul by power of 2 to the RHS. + if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) + if (isMulPowOf2(LHS)) + std::swap(LHS, RHS); + + // Canonicalize shift immediate to the RHS. + if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) + if (const auto *SI = dyn_cast<BinaryOperator>(LHS)) + if (isa<ConstantInt>(SI->getOperand(1))) + if (SI->getOpcode() == Instruction::Shl || + SI->getOpcode() == Instruction::LShr || + SI->getOpcode() == Instruction::AShr ) + std::swap(LHS, RHS); + + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return 0; + bool LHSIsKill = hasTrivialKill(LHS); + + if (NeedExtend) + LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt); + + unsigned ResultReg = 0; + if (const auto *C = dyn_cast<ConstantInt>(RHS)) { + uint64_t Imm = IsZExt ? C->getZExtValue() : C->getSExtValue(); + if (C->isNegative()) + ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm, + SetFlags, WantResult); + else + ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags, + WantResult); + } else if (const auto *C = dyn_cast<Constant>(RHS)) + if (C->isNullValue()) + ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags, + WantResult); + + if (ResultReg) + return ResultReg; + + // Only extend the RHS within the instruction if there is a valid extend type. 
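The constant-RHS branch above ends up in emitAddSub_ri, whose immediates follow the usual ADD/SUB rule: a 12-bit value, optionally shifted left by 12, with negative addends handled as the opposite operation on -Imm. A standalone sketch of the encodability test (names are illustrative):

#include <cstdint>
#include <cstdio>

// Can a constant be used directly as an ADD/SUB immediate?  AArch64 allows a
// 12-bit value with an optional LSL #12.
static bool encodeAddSubImm(uint64_t Imm, unsigned &Imm12, unsigned &Shift) {
  if (Imm <= 0xFFF) {                 // plain 12-bit immediate
    Imm12 = (unsigned)Imm; Shift = 0; return true;
  }
  if ((Imm & ~0xFFF000ULL) == 0) {    // 12-bit immediate, LSL #12
    Imm12 = (unsigned)(Imm >> 12); Shift = 12; return true;
  }
  return false;
}

int main() {
  unsigned Imm12, Shift;
  for (uint64_t Imm : {0x7BULL, 0x123000ULL, 0x123456ULL}) {
    if (encodeAddSubImm(Imm, Imm12, Shift))
      std::printf("0x%llx -> #0x%x, lsl #%u\n", (unsigned long long)Imm, Imm12,
                  Shift);
    else
      std::printf("0x%llx -> needs a different strategy\n",
                  (unsigned long long)Imm);
  }
  return 0;
}

Anything that fails both forms (such as 0x123456) has to be materialized into a register first, which is what the register-register and shifted/extended register paths below take care of.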
+ if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() && + isValueAvailable(RHS)) { + if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) + if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) + if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { + unsigned RHSReg = getRegForValue(SI->getOperand(0)); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); + return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ExtendType, C->getZExtValue(), + SetFlags, WantResult); + } + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(RHS); + return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + ExtendType, 0, SetFlags, WantResult); + } + + // Check if the mul can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) { + if (isMulPowOf2(RHS)) { + const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); + const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(MulLHS)) + if (C->getValue().isPowerOf2()) + std::swap(MulLHS, MulRHS); + + assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt."); + uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2(); + unsigned RHSReg = getRegForValue(MulLHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(MulLHS); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; + } + } + + // Check if the shift can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) { + if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) { + if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { + AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; + switch (SI->getOpcode()) { + default: break; + case Instruction::Shl: ShiftType = AArch64_AM::LSL; break; + case Instruction::LShr: ShiftType = AArch64_AM::LSR; break; + case Instruction::AShr: ShiftType = AArch64_AM::ASR; break; + } + uint64_t ShiftVal = C->getZExtValue(); + if (ShiftType != AArch64_AM::InvalidShiftExtend) { + unsigned RHSReg = getRegForValue(SI->getOperand(0)); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftType, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; + } + } + } + } + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(RHS); + + if (NeedExtend) + RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt); + + return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + SetFlags, WantResult); +} + +unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, bool SetFlags, + bool WantResult) { + assert(LHSReg && RHSReg && "Invalid register number."); + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return 0; + + static const unsigned OpcTable[2][2][2] = { + { { AArch64::SUBWrr, AArch64::SUBXrr }, + { AArch64::ADDWrr, AArch64::ADDXrr } }, + { { AArch64::SUBSWrr, AArch64::SUBSXrr }, + { AArch64::ADDSWrr, AArch64::ADDSXrr } } + }; + bool Is64Bit = RetVT == MVT::i64; + unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; + const TargetRegisterClass *RC = + Is64Bit ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + unsigned ResultReg; + if (WantResult) + ResultReg = createResultReg(RC); + else + ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + + const MCInstrDesc &II = TII.get(Opc); + LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); + RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(RHSReg, getKillRegState(RHSIsKill)); + return ResultReg; +} + +unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, uint64_t Imm, + bool SetFlags, bool WantResult) { + assert(LHSReg && "Invalid register number."); + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return 0; + + unsigned ShiftImm; + if (isUInt<12>(Imm)) + ShiftImm = 0; + else if ((Imm & 0xfff000) == Imm) { + ShiftImm = 12; + Imm >>= 12; + } else + return 0; + + static const unsigned OpcTable[2][2][2] = { + { { AArch64::SUBWri, AArch64::SUBXri }, + { AArch64::ADDWri, AArch64::ADDXri } }, + { { AArch64::SUBSWri, AArch64::SUBSXri }, + { AArch64::ADDSWri, AArch64::ADDSXri } } + }; + bool Is64Bit = RetVT == MVT::i64; + unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; + const TargetRegisterClass *RC; + if (SetFlags) + RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + else + RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass; + unsigned ResultReg; + if (WantResult) + ResultReg = createResultReg(RC); + else + ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + + const MCInstrDesc &II = TII.get(Opc); + LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addImm(Imm) + .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm)); + return ResultReg; +} + +unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, + AArch64_AM::ShiftExtendType ShiftType, + uint64_t ShiftImm, bool SetFlags, + bool WantResult) { + assert(LHSReg && RHSReg && "Invalid register number."); + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return 0; + + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + + static const unsigned OpcTable[2][2][2] = { + { { AArch64::SUBWrs, AArch64::SUBXrs }, + { AArch64::ADDWrs, AArch64::ADDXrs } }, + { { AArch64::SUBSWrs, AArch64::SUBSXrs }, + { AArch64::ADDSWrs, AArch64::ADDSXrs } } + }; + bool Is64Bit = RetVT == MVT::i64; + unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + unsigned ResultReg; + if (WantResult) + ResultReg = createResultReg(RC); + else + ResultReg = Is64Bit ? 
AArch64::XZR : AArch64::WZR; + + const MCInstrDesc &II = TII.get(Opc); + LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); + RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(RHSReg, getKillRegState(RHSIsKill)) + .addImm(getShifterImm(ShiftType, ShiftImm)); + return ResultReg; +} + +unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, + AArch64_AM::ShiftExtendType ExtType, + uint64_t ShiftImm, bool SetFlags, + bool WantResult) { + assert(LHSReg && RHSReg && "Invalid register number."); + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return 0; + + if (ShiftImm >= 4) + return 0; + + static const unsigned OpcTable[2][2][2] = { + { { AArch64::SUBWrx, AArch64::SUBXrx }, + { AArch64::ADDWrx, AArch64::ADDXrx } }, + { { AArch64::SUBSWrx, AArch64::SUBSXrx }, + { AArch64::ADDSWrx, AArch64::ADDSXrx } } + }; + bool Is64Bit = RetVT == MVT::i64; + unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; + const TargetRegisterClass *RC = nullptr; + if (SetFlags) + RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + else + RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass; + unsigned ResultReg; + if (WantResult) + ResultReg = createResultReg(RC); + else + ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + + const MCInstrDesc &II = TII.get(Opc); + LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); + RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(RHSReg, getKillRegState(RHSIsKill)) + .addImm(getArithExtendImm(ExtType, ShiftImm)); + return ResultReg; +} + +bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) { + Type *Ty = LHS->getType(); + EVT EVT = TLI.getValueType(DL, Ty, true); + if (!EVT.isSimple()) + return false; + MVT VT = EVT.getSimpleVT(); + + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + return emitICmp(VT, LHS, RHS, IsZExt); + case MVT::f32: + case MVT::f64: + return emitFCmp(VT, LHS, RHS); + } +} + +bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, + bool IsZExt) { + return emitSub(RetVT, LHS, RHS, /*SetFlags=*/true, /*WantResult=*/false, + IsZExt) != 0; +} + +bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, + uint64_t Imm) { + return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm, + /*SetFlags=*/true, /*WantResult=*/false) != 0; +} + +bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) { + if (RetVT != MVT::f32 && RetVT != MVT::f64) + return false; + + // Check to see if the 2nd operand is a constant that we can encode directly + // in the compare. + bool UseImm = false; + if (const auto *CFP = dyn_cast<ConstantFP>(RHS)) + if (CFP->isZero() && !CFP->isNegative()) + UseImm = true; + + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + if (UseImm) { + unsigned Opc = (RetVT == MVT::f64) ? 
AArch64::FCMPDri : AArch64::FCMPSri; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + return true; + } + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return false; + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDrr : AArch64::FCMPSrr; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(RHSReg, getKillRegState(RHSIsKill)); + return true; +} + +unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, + bool SetFlags, bool WantResult, bool IsZExt) { + return emitAddSub(/*UseAdd=*/true, RetVT, LHS, RHS, SetFlags, WantResult, + IsZExt); +} + +/// \brief This method is a wrapper to simplify add emission. +/// +/// First try to emit an add with an immediate operand using emitAddSub_ri. If +/// that fails, then try to materialize the immediate into a register and use +/// emitAddSub_rr instead. +unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, + int64_t Imm) { + unsigned ResultReg; + if (Imm < 0) + ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm); + else + ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm); + + if (ResultReg) + return ResultReg; + + unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm); + if (!CReg) + return 0; + + ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true); + return ResultReg; +} + +unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS, + bool SetFlags, bool WantResult, bool IsZExt) { + return emitAddSub(/*UseAdd=*/false, RetVT, LHS, RHS, SetFlags, WantResult, + IsZExt); +} + +unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, bool WantResult) { + return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, /*SetFlags=*/true, WantResult); +} + +unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, + AArch64_AM::ShiftExtendType ShiftType, + uint64_t ShiftImm, bool WantResult) { + return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftType, ShiftImm, /*SetFlags=*/true, + WantResult); +} + +unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, + const Value *LHS, const Value *RHS) { + // Canonicalize immediates to the RHS first. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS)) + std::swap(LHS, RHS); + + // Canonicalize mul by power-of-2 to the RHS. + if (LHS->hasOneUse() && isValueAvailable(LHS)) + if (isMulPowOf2(LHS)) + std::swap(LHS, RHS); + + // Canonicalize shift immediate to the RHS. + if (LHS->hasOneUse() && isValueAvailable(LHS)) + if (const auto *SI = dyn_cast<ShlOperator>(LHS)) + if (isa<ConstantInt>(SI->getOperand(1))) + std::swap(LHS, RHS); + + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return 0; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned ResultReg = 0; + if (const auto *C = dyn_cast<ConstantInt>(RHS)) { + uint64_t Imm = C->getZExtValue(); + ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm); + } + if (ResultReg) + return ResultReg; + + // Check if the mul can be folded into the instruction. 
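+ // A multiply by a power of two is just a left shift, so it can be folded
+ // into the shifted-register form of the AND/ORR/EOR below.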
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) { + if (isMulPowOf2(RHS)) { + const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); + const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(MulLHS)) + if (C->getValue().isPowerOf2()) + std::swap(MulLHS, MulRHS); + + assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt."); + uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2(); + + unsigned RHSReg = getRegForValue(MulLHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(MulLHS); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; + } + } + + // Check if the shift can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) { + if (const auto *SI = dyn_cast<ShlOperator>(RHS)) + if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { + uint64_t ShiftVal = C->getZExtValue(); + unsigned RHSReg = getRegForValue(SI->getOperand(0)); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; + } + } + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(RHS); + + MVT VT = std::max(MVT::i32, RetVT.SimpleTy); + ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { + uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + } + return ResultReg; +} + +unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, + unsigned LHSReg, bool LHSIsKill, + uint64_t Imm) { + assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && + "ISD nodes are not consecutive!"); + static const unsigned OpcTable[3][2] = { + { AArch64::ANDWri, AArch64::ANDXri }, + { AArch64::ORRWri, AArch64::ORRXri }, + { AArch64::EORWri, AArch64::EORXri } + }; + const TargetRegisterClass *RC; + unsigned Opc; + unsigned RegSize; + switch (RetVT.SimpleTy) { + default: + return 0; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: { + unsigned Idx = ISDOpc - ISD::AND; + Opc = OpcTable[Idx][0]; + RC = &AArch64::GPR32spRegClass; + RegSize = 32; + break; + } + case MVT::i64: + Opc = OpcTable[ISDOpc - ISD::AND][1]; + RC = &AArch64::GPR64spRegClass; + RegSize = 64; + break; + } + + if (!AArch64_AM::isLogicalImmediate(Imm, RegSize)) + return 0; + + unsigned ResultReg = + fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill, + AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); + if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) { + uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + } + return ResultReg; +} + +unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, + unsigned LHSReg, bool LHSIsKill, + unsigned RHSReg, bool RHSIsKill, + uint64_t ShiftImm) { + assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && + "ISD nodes are not consecutive!"); + static const unsigned OpcTable[3][2] = { + { AArch64::ANDWrs, AArch64::ANDXrs }, + { AArch64::ORRWrs, AArch64::ORRXrs }, + { AArch64::EORWrs, AArch64::EORXrs } + }; + + // Don't deal with undefined shifts. 
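+ // Shifting by the bit width or more is undefined in the IR, so refuse to
+ // encode such an amount rather than guess at a result.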
+ if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + + const TargetRegisterClass *RC; + unsigned Opc; + switch (RetVT.SimpleTy) { + default: + return 0; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + Opc = OpcTable[ISDOpc - ISD::AND][0]; + RC = &AArch64::GPR32RegClass; + break; + case MVT::i64: + Opc = OpcTable[ISDOpc - ISD::AND][1]; + RC = &AArch64::GPR64RegClass; + break; + } + unsigned ResultReg = + fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm)); + if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { + uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + } + return ResultReg; +} + +unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, + uint64_t Imm) { + return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm); +} + +unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, + bool WantZExt, MachineMemOperand *MMO) { + if (!TLI.allowsMisalignedMemoryAccesses(VT)) + return 0; + + // Simplify this down to something we can handle. + if (!simplifyAddress(Addr, VT)) + return 0; + + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + llvm_unreachable("Unexpected value type."); + + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + bool UseScaled = true; + if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) { + UseScaled = false; + ScaleFactor = 1; + } + + static const unsigned GPOpcTable[2][8][4] = { + // Sign-extend. + { { AArch64::LDURSBWi, AArch64::LDURSHWi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDURSBXi, AArch64::LDURSHXi, AArch64::LDURSWi, + AArch64::LDURXi }, + { AArch64::LDRSBWui, AArch64::LDRSHWui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRSBXui, AArch64::LDRSHXui, AArch64::LDRSWui, + AArch64::LDRXui }, + { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX, + AArch64::LDRXroX }, + { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW, + AArch64::LDRXroW }, + { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW, + AArch64::LDRXroW } + }, + // Zero-extend. + { { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, + AArch64::LDRXroW }, + { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, + AArch64::LDRXroW } + } + }; + + static const unsigned FPOpcTable[4][2] = { + { AArch64::LDURSi, AArch64::LDURDi }, + { AArch64::LDRSui, AArch64::LDRDui }, + { AArch64::LDRSroX, AArch64::LDRDroX }, + { AArch64::LDRSroW, AArch64::LDRDroW } + }; + + unsigned Opc; + const TargetRegisterClass *RC; + bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && + Addr.getOffsetReg(); + unsigned Idx = UseRegOffset ? 2 : UseScaled ? 
1 : 0; + if (Addr.getExtendType() == AArch64_AM::UXTW || + Addr.getExtendType() == AArch64_AM::SXTW) + Idx++; + + bool IsRet64Bit = RetVT == MVT::i64; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Unexpected value type."); + case MVT::i1: // Intentional fall-through. + case MVT::i8: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i16: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i32: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; + break; + case MVT::i64: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3]; + RC = &AArch64::GPR64RegClass; + break; + case MVT::f32: + Opc = FPOpcTable[Idx][0]; + RC = &AArch64::FPR32RegClass; + break; + case MVT::f64: + Opc = FPOpcTable[Idx][1]; + RC = &AArch64::FPR64RegClass; + break; + } + + // Create the base instruction, then add the operands. + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO); + + // Loading an i1 requires special handling. + if (VT == MVT::i1) { + unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1); + assert(ANDReg && "Unexpected AND instruction emission failure."); + ResultReg = ANDReg; + } + + // For zero-extending loads to 64bit we emit a 32bit load and then convert + // the 32bit reg to a 64bit reg. + if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) { + unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(ResultReg, getKillRegState(true)) + .addImm(AArch64::sub_32); + ResultReg = Reg64; + } + return ResultReg; +} + +bool AArch64FastISel::selectAddSub(const Instruction *I) { + MVT VT; + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) + return false; + + if (VT.isVector()) + return selectOperator(I, I->getOpcode()); + + unsigned ResultReg; + switch (I->getOpcode()) { + default: + llvm_unreachable("Unexpected instruction."); + case Instruction::Add: + ResultReg = emitAdd(VT, I->getOperand(0), I->getOperand(1)); + break; + case Instruction::Sub: + ResultReg = emitSub(VT, I->getOperand(0), I->getOperand(1)); + break; + } + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectLogicalOp(const Instruction *I) { + MVT VT; + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) + return false; + + if (VT.isVector()) + return selectOperator(I, I->getOpcode()); + + unsigned ResultReg; + switch (I->getOpcode()) { + default: + llvm_unreachable("Unexpected instruction."); + case Instruction::And: + ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1)); + break; + case Instruction::Or: + ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1)); + break; + case Instruction::Xor: + ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1)); + break; + } + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectLoad(const Instruction *I) { + MVT VT; + // Verify we have a legal type before going 
any further. Currently, we handle + // simple types that will directly fit in a register (i32/f32/i64/f64) or + // those that can be sign or zero-extended to a basic operation (i1/i8/i16). + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true) || + cast<LoadInst>(I)->isAtomic()) + return false; + + // See if we can handle this address. + Address Addr; + if (!computeAddress(I->getOperand(0), Addr, I->getType())) + return false; + + // Fold the following sign-/zero-extend into the load instruction. + bool WantZExt = true; + MVT RetVT = VT; + const Value *IntExtVal = nullptr; + if (I->hasOneUse()) { + if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) { + if (isTypeSupported(ZE->getType(), RetVT)) + IntExtVal = ZE; + else + RetVT = VT; + } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) { + if (isTypeSupported(SE->getType(), RetVT)) + IntExtVal = SE; + else + RetVT = VT; + WantZExt = false; + } + } + + unsigned ResultReg = + emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I)); + if (!ResultReg) + return false; + + // There are a few different cases we have to handle, because the load or the + // sign-/zero-extend might not be selected by FastISel if we fall-back to + // SelectionDAG. There is also an ordering issue when both instructions are in + // different basic blocks. + // 1.) The load instruction is selected by FastISel, but the integer extend + // not. This usually happens when the integer extend is in a different + // basic block and SelectionDAG took over for that basic block. + // 2.) The load instruction is selected before the integer extend. This only + // happens when the integer extend is in a different basic block. + // 3.) The load instruction is selected by SelectionDAG and the integer extend + // by FastISel. This happens if there are instructions between the load + // and the integer extend that couldn't be selected by FastISel. + if (IntExtVal) { + // The integer extend hasn't been emitted yet. FastISel or SelectionDAG + // could select it. Emit a copy to subreg if necessary. FastISel will remove + // it when it selects the integer extend. + unsigned Reg = lookUpRegForValue(IntExtVal); + auto *MI = MRI.getUniqueVRegDef(Reg); + if (!MI) { + if (RetVT == MVT::i64 && VT <= MVT::i32) { + if (WantZExt) { + // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG). + std::prev(FuncInfo.InsertPt)->eraseFromParent(); + ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg(); + } else + ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg, + /*IsKill=*/true, + AArch64::sub_32); + } + updateValueMap(I, ResultReg); + return true; + } + + // The integer extend has already been emitted - delete all the instructions + // that have been emitted by the integer extend lowering code and use the + // result from the load instruction directly. + while (MI) { + Reg = 0; + for (auto &Opnd : MI->uses()) { + if (Opnd.isReg()) { + Reg = Opnd.getReg(); + break; + } + } + MI->eraseFromParent(); + MI = nullptr; + if (Reg) + MI = MRI.getUniqueVRegDef(Reg); + } + updateValueMap(IntExtVal, ResultReg); + return true; + } + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, + MachineMemOperand *MMO) { + if (!TLI.allowsMisalignedMemoryAccesses(VT)) + return false; + + // Simplify this down to something we can handle. 
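+ // Rewrite the address into a form the store encodings accept (a base
+ // register plus a legal immediate or an offset register), materializing
+ // anything else into registers first.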
+ if (!simplifyAddress(Addr, VT)) + return false; + + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + llvm_unreachable("Unexpected value type."); + + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + bool UseScaled = true; + if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) { + UseScaled = false; + ScaleFactor = 1; + } + + static const unsigned OpcTable[4][6] = { + { AArch64::STURBBi, AArch64::STURHHi, AArch64::STURWi, AArch64::STURXi, + AArch64::STURSi, AArch64::STURDi }, + { AArch64::STRBBui, AArch64::STRHHui, AArch64::STRWui, AArch64::STRXui, + AArch64::STRSui, AArch64::STRDui }, + { AArch64::STRBBroX, AArch64::STRHHroX, AArch64::STRWroX, AArch64::STRXroX, + AArch64::STRSroX, AArch64::STRDroX }, + { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW, + AArch64::STRSroW, AArch64::STRDroW } + }; + + unsigned Opc; + bool VTIsi1 = false; + bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && + Addr.getOffsetReg(); + unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; + if (Addr.getExtendType() == AArch64_AM::UXTW || + Addr.getExtendType() == AArch64_AM::SXTW) + Idx++; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i1: VTIsi1 = true; + case MVT::i8: Opc = OpcTable[Idx][0]; break; + case MVT::i16: Opc = OpcTable[Idx][1]; break; + case MVT::i32: Opc = OpcTable[Idx][2]; break; + case MVT::i64: Opc = OpcTable[Idx][3]; break; + case MVT::f32: Opc = OpcTable[Idx][4]; break; + case MVT::f64: Opc = OpcTable[Idx][5]; break; + } + + // Storing an i1 requires special handling. + if (VTIsi1 && SrcReg != AArch64::WZR) { + unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1); + assert(ANDReg && "Unexpected AND instruction emission failure."); + SrcReg = ANDReg; + } + // Create the base instruction, then add the operands. + const MCInstrDesc &II = TII.get(Opc); + SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(SrcReg); + addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, ScaleFactor, MMO); + + return true; +} + +bool AArch64FastISel::selectStore(const Instruction *I) { + MVT VT; + const Value *Op0 = I->getOperand(0); + // Verify we have a legal type before going any further. Currently, we handle + // simple types that will directly fit in a register (i32/f32/i64/f64) or + // those that can be sign or zero-extended to a basic operation (i1/i8/i16). + if (!isTypeSupported(Op0->getType(), VT, /*IsVectorAllowed=*/true) || + cast<StoreInst>(I)->isAtomic()) + return false; + + // Get the value to be stored into a register. Use the zero register directly + // when possible to avoid an unnecessary copy and a wasted register. + unsigned SrcReg = 0; + if (const auto *CI = dyn_cast<ConstantInt>(Op0)) { + if (CI->isZero()) + SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; + } else if (const auto *CF = dyn_cast<ConstantFP>(Op0)) { + if (CF->isZero() && !CF->isNegative()) { + VT = MVT::getIntegerVT(VT.getSizeInBits()); + SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; + } + } + + if (!SrcReg) + SrcReg = getRegForValue(Op0); + + if (!SrcReg) + return false; + + // See if we can handle this address. 
+ Address Addr; + if (!computeAddress(I->getOperand(1), Addr, I->getOperand(0)->getType())) + return false; + + if (!emitStore(VT, SrcReg, Addr, createMachineMemOperandFor(I))) + return false; + return true; +} + +static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { + switch (Pred) { + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return AArch64CC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return AArch64CC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return AArch64CC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return AArch64CC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return AArch64CC::HI; + case CmpInst::FCMP_OLT: + return AArch64CC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return AArch64CC::LS; + case CmpInst::FCMP_ORD: + return AArch64CC::VC; + case CmpInst::FCMP_UNO: + return AArch64CC::VS; + case CmpInst::FCMP_UGE: + return AArch64CC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return AArch64CC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return AArch64CC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return AArch64CC::NE; + case CmpInst::ICMP_UGE: + return AArch64CC::HS; + case CmpInst::ICMP_ULT: + return AArch64CC::LO; + } +} + +/// \brief Try to emit a combined compare-and-branch instruction. +bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { + assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction"); + const CmpInst *CI = cast<CmpInst>(BI->getCondition()); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + MVT VT; + if (!isTypeSupported(LHS->getType(), VT)) + return false; + + unsigned BW = VT.getSizeInBits(); + if (BW > 64) + return false; + + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Try to take advantage of fallthrough opportunities. 
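+ // If the true block is the layout successor, branch on the inverted
+ // condition to the false block instead and let the true block fall through.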
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + int TestBit = -1; + bool IsCmpNE; + switch (Predicate) { + default: + return false; + case CmpInst::ICMP_EQ: + case CmpInst::ICMP_NE: + if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue()) + std::swap(LHS, RHS); + + if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue()) + return false; + + if (const auto *AI = dyn_cast<BinaryOperator>(LHS)) + if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) { + const Value *AndLHS = AI->getOperand(0); + const Value *AndRHS = AI->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(AndLHS)) + if (C->getValue().isPowerOf2()) + std::swap(AndLHS, AndRHS); + + if (const auto *C = dyn_cast<ConstantInt>(AndRHS)) + if (C->getValue().isPowerOf2()) { + TestBit = C->getValue().logBase2(); + LHS = AndLHS; + } + } + + if (VT == MVT::i1) + TestBit = 0; + + IsCmpNE = Predicate == CmpInst::ICMP_NE; + break; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SGE: + if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue()) + return false; + + TestBit = BW - 1; + IsCmpNE = Predicate == CmpInst::ICMP_SLT; + break; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SLE: + if (!isa<ConstantInt>(RHS)) + return false; + + if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true)) + return false; + + TestBit = BW - 1; + IsCmpNE = Predicate == CmpInst::ICMP_SLE; + break; + } // end switch + + static const unsigned OpcTable[2][2][2] = { + { {AArch64::CBZW, AArch64::CBZX }, + {AArch64::CBNZW, AArch64::CBNZX} }, + { {AArch64::TBZW, AArch64::TBZX }, + {AArch64::TBNZW, AArch64::TBNZX} } + }; + + bool IsBitTest = TestBit != -1; + bool Is64Bit = BW == 64; + if (TestBit < 32 && TestBit >= 0) + Is64Bit = false; + + unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit]; + const MCInstrDesc &II = TII.get(Opc); + + unsigned SrcReg = getRegForValue(LHS); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(LHS); + + if (BW == 64 && !Is64Bit) + SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, + AArch64::sub_32); + + if ((BW < 32) && !IsBitTest) + SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true); + + // Emit the combined compare and branch instruction. + SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(SrcReg, getKillRegState(SrcIsKill)); + if (IsBitTest) + MIB.addImm(TestBit); + MIB.addMBB(TBB); + + finishCondBranch(BI->getParent(), TBB, FBB); + return true; +} + +bool AArch64FastISel::selectBranch(const Instruction *I) { + const BranchInst *BI = cast<BranchInst>(I); + if (BI->isUnconditional()) { + MachineBasicBlock *MSucc = FuncInfo.MBBMap[BI->getSuccessor(0)]; + fastEmitBranch(MSucc, BI->getDebugLoc()); + return true; + } + + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + if (CI->hasOneUse() && isValueAvailable(CI)) { + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: + break; + case CmpInst::FCMP_FALSE: + fastEmitBranch(FBB, DbgLoc); + return true; + case CmpInst::FCMP_TRUE: + fastEmitBranch(TBB, DbgLoc); + return true; + } + + // Try to emit a combined compare-and-branch first. 
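+ // CBZ/CBNZ and TBZ/TBNZ compare against zero (or test a single bit) and
+ // branch in one instruction, avoiding a separate SUBS/ANDS plus B.cc.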
+ if (emitCompareAndBranch(BI)) + return true; + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + // Emit the cmp. + if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch + // instruction. + AArch64CC::CondCode CC = getCompareCC(Predicate); + AArch64CC::CondCode ExtraCC = AArch64CC::AL; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_UEQ: + ExtraCC = AArch64CC::EQ; + CC = AArch64CC::VS; + break; + case CmpInst::FCMP_ONE: + ExtraCC = AArch64CC::MI; + CC = AArch64CC::GT; + break; + } + assert((CC != AArch64CC::AL) && "Unexpected condition code."); + + // Emit the extra branch for FCMP_UEQ and FCMP_ONE. + if (ExtraCC != AArch64CC::AL) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(ExtraCC) + .addMBB(TBB); + } + + // Emit the branch. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } + } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { + uint64_t Imm = CI->getZExtValue(); + MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) + .addMBB(Target); + + // Obtain the branch probability and add the target to the successor list. + if (FuncInfo.BPI) { + auto BranchProbability = FuncInfo.BPI->getEdgeProbability( + BI->getParent(), Target->getBasicBlock()); + FuncInfo.MBB->addSuccessor(Target, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(Target); + return true; + } else { + AArch64CC::CondCode CC = AArch64CC::NE; + if (foldXALUIntrinsic(CC, I, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned CondReg = getRegForValue(BI->getCondition()); + if (!CondReg) + return false; + + // Emit the branch. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } + } + + unsigned CondReg = getRegForValue(BI->getCondition()); + if (CondReg == 0) + return false; + bool CondRegIsKill = hasTrivialKill(BI->getCondition()); + + // i1 conditions come as i32 values, test the lowest bit with tb(n)z. + unsigned Opcode = AArch64::TBNZW; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Opcode = AArch64::TBZW; + } + + const MCInstrDesc &II = TII.get(Opcode); + unsigned ConstrainedCondReg + = constrainOperandRegClass(II, CondReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) + .addImm(0) + .addMBB(TBB); + + finishCondBranch(BI->getParent(), TBB, FBB); + return true; +} + +bool AArch64FastISel::selectIndirectBr(const Instruction *I) { + const IndirectBrInst *BI = cast<IndirectBrInst>(I); + unsigned AddrReg = getRegForValue(BI->getOperand(0)); + if (AddrReg == 0) + return false; + + // Emit the indirect branch. + const MCInstrDesc &II = TII.get(AArch64::BR); + AddrReg = constrainOperandRegClass(II, AddrReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg); + + // Make sure the CFG is up-to-date. 
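+ // An indirect branch may transfer control to any of its listed successors,
+ // so each one must appear in this block's successor list.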
+ for (auto *Succ : BI->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]); + + return true; +} + +bool AArch64FastISel::selectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + // Vectors of i1 are weird: bail out. + if (CI->getType()->isVectorTy()) + return false; + + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_FALSE: + ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(AArch64::WZR, getKillRegState(true)); + break; + case CmpInst::FCMP_TRUE: + ResultReg = fastEmit_i(MVT::i32, MVT::i32, ISD::Constant, 1); + break; + } + + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; + } + + // Emit the cmp. + if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + ResultReg = createResultReg(&AArch64::GPR32RegClass); + + // FCMP_UEQ and FCMP_ONE cannot be checked with a single instruction. These + // condition codes are inverted, because they are used by CSINC. + static unsigned CondCodeTable[2][2] = { + { AArch64CC::NE, AArch64CC::VC }, + { AArch64CC::PL, AArch64CC::LE } + }; + unsigned *CondCodes = nullptr; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_UEQ: + CondCodes = &CondCodeTable[0][0]; + break; + case CmpInst::FCMP_ONE: + CondCodes = &CondCodeTable[1][0]; + break; + } + + if (CondCodes) { + unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), + TmpReg1) + .addReg(AArch64::WZR, getKillRegState(true)) + .addReg(AArch64::WZR, getKillRegState(true)) + .addImm(CondCodes[0]); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), + ResultReg) + .addReg(TmpReg1, getKillRegState(true)) + .addReg(AArch64::WZR, getKillRegState(true)) + .addImm(CondCodes[1]); + + updateValueMap(I, ResultReg); + return true; + } + + // Now set a register based on the comparison. + AArch64CC::CondCode CC = getCompareCC(Predicate); + assert((CC != AArch64CC::AL) && "Unexpected condition code."); + AArch64CC::CondCode invertedCC = getInvertedCondCode(CC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), + ResultReg) + .addReg(AArch64::WZR, getKillRegState(true)) + .addReg(AArch64::WZR, getKillRegState(true)) + .addImm(invertedCC); + + updateValueMap(I, ResultReg); + return true; +} + +/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false' +/// value. 
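+///
+/// Such a select reduces to a single ORR, BIC, or AND of the condition,
+/// avoiding a conditional-select sequence.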
+bool AArch64FastISel::optimizeSelect(const SelectInst *SI) { + if (!SI->getType()->isIntegerTy(1)) + return false; + + const Value *Src1Val, *Src2Val; + unsigned Opc = 0; + bool NeedExtraOp = false; + if (auto *CI = dyn_cast<ConstantInt>(SI->getTrueValue())) { + if (CI->isOne()) { + Src1Val = SI->getCondition(); + Src2Val = SI->getFalseValue(); + Opc = AArch64::ORRWrr; + } else { + assert(CI->isZero()); + Src1Val = SI->getFalseValue(); + Src2Val = SI->getCondition(); + Opc = AArch64::BICWrr; + } + } else if (auto *CI = dyn_cast<ConstantInt>(SI->getFalseValue())) { + if (CI->isOne()) { + Src1Val = SI->getCondition(); + Src2Val = SI->getTrueValue(); + Opc = AArch64::ORRWrr; + NeedExtraOp = true; + } else { + assert(CI->isZero()); + Src1Val = SI->getCondition(); + Src2Val = SI->getTrueValue(); + Opc = AArch64::ANDWrr; + } + } + + if (!Opc) + return false; + + unsigned Src1Reg = getRegForValue(Src1Val); + if (!Src1Reg) + return false; + bool Src1IsKill = hasTrivialKill(Src1Val); + + unsigned Src2Reg = getRegForValue(Src2Val); + if (!Src2Reg) + return false; + bool Src2IsKill = hasTrivialKill(Src2Val); + + if (NeedExtraOp) { + Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1); + Src1IsKill = true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg, + Src1IsKill, Src2Reg, Src2IsKill); + updateValueMap(SI, ResultReg); + return true; +} + +bool AArch64FastISel::selectSelect(const Instruction *I) { + assert(isa<SelectInst>(I) && "Expected a select instruction."); + MVT VT; + if (!isTypeSupported(I->getType(), VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + Opc = AArch64::CSELWr; + RC = &AArch64::GPR32RegClass; + break; + case MVT::i64: + Opc = AArch64::CSELXr; + RC = &AArch64::GPR64RegClass; + break; + case MVT::f32: + Opc = AArch64::FCSELSrrr; + RC = &AArch64::FPR32RegClass; + break; + case MVT::f64: + Opc = AArch64::FCSELDrrr; + RC = &AArch64::FPR64RegClass; + break; + } + + const SelectInst *SI = cast<SelectInst>(I); + const Value *Cond = SI->getCondition(); + AArch64CC::CondCode CC = AArch64CC::NE; + AArch64CC::CondCode ExtraCC = AArch64CC::AL; + + if (optimizeSelect(SI)) + return true; + + // Try to pickup the flags, so we don't have to emit another compare. + if (foldXALUIntrinsic(CC, I, Cond)) { + // Fake request the condition to force emission of the XALU intrinsic. + unsigned CondReg = getRegForValue(Cond); + if (!CondReg) + return false; + } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() && + isValueAvailable(Cond)) { + const auto *Cmp = cast<CmpInst>(Cond); + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp); + const Value *FoldSelect = nullptr; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_FALSE: + FoldSelect = SI->getFalseValue(); + break; + case CmpInst::FCMP_TRUE: + FoldSelect = SI->getTrueValue(); + break; + } + + if (FoldSelect) { + unsigned SrcReg = getRegForValue(FoldSelect); + if (!SrcReg) + return false; + unsigned UseReg = lookUpRegForValue(SI); + if (UseReg) + MRI.clearKillFlags(UseReg); + + updateValueMap(I, SrcReg); + return true; + } + + // Emit the cmp. + if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned())) + return false; + + // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction. 
+ CC = getCompareCC(Predicate); + switch (Predicate) { + default: + break; + case CmpInst::FCMP_UEQ: + ExtraCC = AArch64CC::EQ; + CC = AArch64CC::VS; + break; + case CmpInst::FCMP_ONE: + ExtraCC = AArch64CC::MI; + CC = AArch64CC::GT; + break; + } + assert((CC != AArch64CC::AL) && "Unexpected condition code."); + } else { + unsigned CondReg = getRegForValue(Cond); + if (!CondReg) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + const MCInstrDesc &II = TII.get(AArch64::ANDSWri); + CondReg = constrainOperandRegClass(II, CondReg, 1); + + // Emit a TST instruction (ANDS wzr, reg, #imm). + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, + AArch64::WZR) + .addReg(CondReg, getKillRegState(CondIsKill)) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + } + + unsigned Src1Reg = getRegForValue(SI->getTrueValue()); + bool Src1IsKill = hasTrivialKill(SI->getTrueValue()); + + unsigned Src2Reg = getRegForValue(SI->getFalseValue()); + bool Src2IsKill = hasTrivialKill(SI->getFalseValue()); + + if (!Src1Reg || !Src2Reg) + return false; + + if (ExtraCC != AArch64CC::AL) { + Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, + Src2IsKill, ExtraCC); + Src2IsKill = true; + } + unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, + Src2IsKill, CC); + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectFPExt(const Instruction *I) { + Value *V = I->getOperand(0); + if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) + return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) + return false; + + unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr), + ResultReg).addReg(Op); + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectFPTrunc(const Instruction *I) { + Value *V = I->getOperand(0); + if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) + return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) + return false; + + unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr), + ResultReg).addReg(Op); + updateValueMap(I, ResultReg); + return true; +} + +// FPToUI and FPToSI +bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) { + MVT DestVT; + if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) + return false; + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (SrcReg == 0) + return false; + + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); + if (SrcVT == MVT::f128) + return false; + + unsigned Opc; + if (SrcVT == MVT::f64) { + if (Signed) + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr; + else + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr; + } else { + if (Signed) + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr; + else + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr; + } + unsigned ResultReg = createResultReg( + DestVT == MVT::i32 ? 
&AArch64::GPR32RegClass : &AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg); + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { + MVT DestVT; + if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) + return false; + assert ((DestVT == MVT::f32 || DestVT == MVT::f64) && + "Unexpected value type."); + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(I->getOperand(0)); + + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); + + // Handle sign-extension. + if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { + SrcReg = + emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); + if (!SrcReg) + return false; + SrcIsKill = true; + } + + unsigned Opc; + if (SrcVT == MVT::i64) { + if (Signed) + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri; + else + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri; + } else { + if (Signed) + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri; + else + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; + } + + unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg, + SrcIsKill); + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::fastLowerArguments() { + if (!FuncInfo.CanLowerReturn) + return false; + + const Function *F = FuncInfo.Fn; + if (F->isVarArg()) + return false; + + CallingConv::ID CC = F->getCallingConv(); + if (CC != CallingConv::C) + return false; + + // Only handle simple cases of up to 8 GPR and FPR each. + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. 
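+ // (Attribute index 0 refers to the return value.)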
+ ++Idx; + if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || + F->getAttributes().hasAttribute(Idx, Attribute::InReg) || + F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::Nest)) + return false; + + Type *ArgTy = Arg.getType(); + if (ArgTy->isStructTy() || ArgTy->isArrayTy()) + return false; + + EVT ArgVT = TLI.getValueType(DL, ArgTy); + if (!ArgVT.isSimple()) + return false; + + MVT VT = ArgVT.getSimpleVT().SimpleTy; + if (VT.isFloatingPoint() && !Subtarget->hasFPARMv8()) + return false; + + if (VT.isVector() && + (!Subtarget->hasNEON() || !Subtarget->isLittleEndian())) + return false; + + if (VT >= MVT::i1 && VT <= MVT::i64) + ++GPRCnt; + else if ((VT >= MVT::f16 && VT <= MVT::f64) || VT.is64BitVector() || + VT.is128BitVector()) + ++FPRCnt; + else + return false; + + if (GPRCnt > 8 || FPRCnt > 8) + return false; + } + + static const MCPhysReg Registers[6][8] = { + { AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4, + AArch64::W5, AArch64::W6, AArch64::W7 }, + { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, + AArch64::X5, AArch64::X6, AArch64::X7 }, + { AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4, + AArch64::H5, AArch64::H6, AArch64::H7 }, + { AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4, + AArch64::S5, AArch64::S6, AArch64::S7 }, + { AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4, + AArch64::D5, AArch64::D6, AArch64::D7 }, + { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7 } + }; + + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); + unsigned SrcReg; + const TargetRegisterClass *RC; + if (VT >= MVT::i1 && VT <= MVT::i32) { + SrcReg = Registers[0][GPRIdx++]; + RC = &AArch64::GPR32RegClass; + VT = MVT::i32; + } else if (VT == MVT::i64) { + SrcReg = Registers[1][GPRIdx++]; + RC = &AArch64::GPR64RegClass; + } else if (VT == MVT::f16) { + SrcReg = Registers[2][FPRIdx++]; + RC = &AArch64::FPR16RegClass; + } else if (VT == MVT::f32) { + SrcReg = Registers[3][FPRIdx++]; + RC = &AArch64::FPR32RegClass; + } else if ((VT == MVT::f64) || VT.is64BitVector()) { + SrcReg = Registers[4][FPRIdx++]; + RC = &AArch64::FPR64RegClass; + } else if (VT.is128BitVector()) { + SrcReg = Registers[5][FPRIdx++]; + RC = &AArch64::FPR128RegClass; + } else + llvm_unreachable("Unexpected value type."); + + unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. + // Without this, EmitLiveInCopies may eliminate the livein if its only + // use is a bitcast (which isn't turned into an instruction). + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + updateValueMap(&Arg, ResultReg); + } + return true; +} + +bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, + SmallVectorImpl<MVT> &OutVTs, + unsigned &NumBytes) { + CallingConv::ID CC = CLI.CallConv; + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC)); + + // Get a count of how many bytes are to be pushed on the stack. 
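+ // getNextStackOffset() is the total stack space the assigned arguments use,
+ // which is what CALLSEQ_START below has to reserve.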
+ NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(NumBytes); + + // Process the args. + for (CCValAssign &VA : ArgLocs) { + const Value *ArgVal = CLI.OutVals[VA.getValNo()]; + MVT ArgVT = OutVTs[VA.getValNo()]; + + unsigned ArgReg = getRegForValue(ArgVal); + if (!ArgReg) + return false; + + // Handle arg promotion: SExt, ZExt, AExt. + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::SExt: { + MVT DestVT = VA.getLocVT(); + MVT SrcVT = ArgVT; + ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false); + if (!ArgReg) + return false; + break; + } + case CCValAssign::AExt: + // Intentional fall-through. + case CCValAssign::ZExt: { + MVT DestVT = VA.getLocVT(); + MVT SrcVT = ArgVT; + ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true); + if (!ArgReg) + return false; + break; + } + default: + llvm_unreachable("Unknown arg promotion!"); + } + + // Now copy/store arg to correct locations. + if (VA.isRegLoc() && !VA.needsCustom()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); + CLI.OutRegs.push_back(VA.getLocReg()); + } else if (VA.needsCustom()) { + // FIXME: Handle custom args. + return false; + } else { + assert(VA.isMemLoc() && "Assuming store on stack."); + + // Don't emit stores for undef values. + if (isa<UndefValue>(ArgVal)) + continue; + + // Need to store on the stack. + unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8; + + unsigned BEAlign = 0; + if (ArgSize < 8 && !Subtarget->isLittleEndian()) + BEAlign = 8 - ArgSize; + + Address Addr; + Addr.setKind(Address::RegBase); + Addr.setReg(AArch64::SP); + Addr.setOffset(VA.getLocMemOffset() + BEAlign); + + unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); + + if (!emitStore(ArgVT, ArgReg, Addr, MMO)) + return false; + } + } + return true; +} + +bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, + unsigned NumBytes) { + CallingConv::ID CC = CLI.CallConv; + + // Issue CALLSEQ_END + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(0); + + // Now the return value. + if (RetVT != MVT::isVoid) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC)); + + // Only handle a single return value. + if (RVLocs.size() != 1) + return false; + + // Copy all of the result registers out of their specified physreg. 
+ MVT CopyVT = RVLocs[0].getValVT(); + + // TODO: Handle big-endian results + if (CopyVT.isVector() && !Subtarget->isLittleEndian()) + return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(RVLocs[0].getLocReg()); + CLI.InRegs.push_back(RVLocs[0].getLocReg()); + + CLI.ResultReg = ResultReg; + CLI.NumResultRegs = 1; + } + + return true; +} + +bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { + CallingConv::ID CC = CLI.CallConv; + bool IsTailCall = CLI.IsTailCall; + bool IsVarArg = CLI.IsVarArg; + const Value *Callee = CLI.Callee; + MCSymbol *Symbol = CLI.Symbol; + + if (!Callee && !Symbol) + return false; + + // Allow SelectionDAG isel to handle tail calls. + if (IsTailCall) + return false; + + CodeModel::Model CM = TM.getCodeModel(); + // Only support the small and large code model. + if (CM != CodeModel::Small && CM != CodeModel::Large) + return false; + + // FIXME: Add large code model support for ELF. + if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) + return false; + + // Let SDISel handle vararg functions. + if (IsVarArg) + return false; + + // FIXME: Only handle *simple* calls for now. + MVT RetVT; + if (CLI.RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(CLI.RetTy, RetVT)) + return false; + + for (auto Flag : CLI.OutFlags) + if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal()) + return false; + + // Set up the argument vectors. + SmallVector<MVT, 16> OutVTs; + OutVTs.reserve(CLI.OutVals.size()); + + for (auto *Val : CLI.OutVals) { + MVT VT; + if (!isTypeLegal(Val->getType(), VT) && + !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) + return false; + + // We don't handle vector parameters yet. + if (VT.isVector() || VT.getSizeInBits() > 64) + return false; + + OutVTs.push_back(VT); + } + + Address Addr; + if (Callee && !computeCallAddress(Callee, Addr)) + return false; + + // Handle the arguments now that we've gotten them. + unsigned NumBytes; + if (!processCallArgs(CLI, OutVTs, NumBytes)) + return false; + + // Issue the call. + MachineInstrBuilder MIB; + if (CM == CodeModel::Small) { + const MCInstrDesc &II = TII.get(Addr.getReg() ? 
AArch64::BLR : AArch64::BL); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II); + if (Symbol) + MIB.addSym(Symbol, 0); + else if (Addr.getGlobalValue()) + MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0); + else if (Addr.getReg()) { + unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0); + MIB.addReg(Reg); + } else + return false; + } else { + unsigned CallReg = 0; + if (Symbol) { + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg) + .addSym(Symbol, AArch64II::MO_GOT | AArch64II::MO_PAGE); + + CallReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::LDRXui), CallReg) + .addReg(ADRPReg) + .addSym(Symbol, + AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else if (Addr.getGlobalValue()) + CallReg = materializeGV(Addr.getGlobalValue()); + else if (Addr.getReg()) + CallReg = Addr.getReg(); + + if (!CallReg) + return false; + + const MCInstrDesc &II = TII.get(AArch64::BLR); + CallReg = constrainOperandRegClass(II, CallReg, 0); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg); + } + + // Add implicit physical register uses to the call. + for (auto Reg : CLI.OutRegs) + MIB.addReg(Reg, RegState::Implicit); + + // Add a register mask with the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); + + CLI.Call = MIB; + + // Finish off the call including any return values. + return finishCall(CLI, RetVT, NumBytes); +} + +bool AArch64FastISel::isMemCpySmall(uint64_t Len, unsigned Alignment) { + if (Alignment) + return Len / Alignment <= 4; + else + return Len < 32; +} + +bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src, + uint64_t Len, unsigned Alignment) { + // Make sure we don't bloat code by inlining very large memcpy's. + if (!isMemCpySmall(Len, Alignment)) + return false; + + int64_t UnscaledOffset = 0; + Address OrigDest = Dest; + Address OrigSrc = Src; + + while (Len) { + MVT VT; + if (!Alignment || Alignment >= 8) { + if (Len >= 8) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + VT = MVT::i8; + } + } else { + // Bound based on alignment. + if (Len >= 4 && Alignment == 4) + VT = MVT::i32; + else if (Len >= 2 && Alignment == 2) + VT = MVT::i16; + else { + VT = MVT::i8; + } + } + + unsigned ResultReg = emitLoad(VT, VT, Src); + if (!ResultReg) + return false; + + if (!emitStore(VT, ResultReg, Dest)) + return false; + + int64_t Size = VT.getSizeInBits() / 8; + Len -= Size; + UnscaledOffset += Size; + + // We need to recompute the unscaled offset for each iteration. + Dest.setOffset(OrigDest.getOffset() + UnscaledOffset); + Src.setOffset(OrigSrc.getOffset() + UnscaledOffset); + } + + return true; +} + +/// \brief Check if it is possible to fold the condition from the XALU intrinsic +/// into the user. The condition code will only be updated on success. 
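+///
+/// Folding lets the user read the overflow flags set by the
+/// *.with.overflow intrinsic directly instead of re-materializing an i1.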
+bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, + const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + + // Canonicalize immediate to the RHS. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + // Simplify multiplies. + Intrinsic::ID IID = II->getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::smul_with_overflow: + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 2) + IID = Intrinsic::sadd_with_overflow; + break; + case Intrinsic::umul_with_overflow: + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 2) + IID = Intrinsic::uadd_with_overflow; + break; + } + + AArch64CC::CondCode TmpCC; + switch (IID) { + default: + return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + TmpCC = AArch64CC::VS; + break; + case Intrinsic::uadd_with_overflow: + TmpCC = AArch64CC::HS; + break; + case Intrinsic::usub_with_overflow: + TmpCC = AArch64CC::LO; + break; + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + TmpCC = AArch64CC::NE; + break; + } + + // Check if both instructions are in the same basic block. + if (!isValueAvailable(II)) + return false; + + // Make sure nothing is in the way + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + +bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { + // FIXME: Handle more intrinsics. + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::frameaddress: { + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const AArch64RegisterInfo *RegInfo = + static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo()); + unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr); + // Recursively load frame address + // ldr x0, [fp] + // ldr x0, [x0] + // ldr x0, [x0] + // ... 
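+    // Illustrative note (editor's addition, not from the original comment):
+    // the constant depth operand selects how many chained loads are emitted,
+    // e.g. @llvm.frameaddress(i32 2) produces the FP copy above followed by
+    // two LDRXui loads.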
+ unsigned DestReg; + unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass, + SrcReg, /*IsKill=*/true, 0); + assert(DestReg && "Unexpected LDR instruction emission failure."); + SrcReg = DestReg; + } + + updateValueMap(II, SrcReg); + return true; + } + case Intrinsic::memcpy: + case Intrinsic::memmove: { + const auto *MTI = cast<MemTransferInst>(II); + // Don't handle volatile. + if (MTI->isVolatile()) + return false; + + // Disable inlining for memmove before calls to ComputeAddress. Otherwise, + // we would emit dead code because we don't currently handle memmoves. + bool IsMemCpy = (II->getIntrinsicID() == Intrinsic::memcpy); + if (isa<ConstantInt>(MTI->getLength()) && IsMemCpy) { + // Small memcpy's are common enough that we want to do them without a call + // if possible. + uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue(); + unsigned Alignment = MTI->getAlignment(); + if (isMemCpySmall(Len, Alignment)) { + Address Dest, Src; + if (!computeAddress(MTI->getRawDest(), Dest) || + !computeAddress(MTI->getRawSource(), Src)) + return false; + if (tryEmitSmallMemCpy(Dest, Src, Len, Alignment)) + return true; + } + } + + if (!MTI->getLength()->getType()->isIntegerTy(64)) + return false; + + if (MTI->getSourceAddressSpace() > 255 || MTI->getDestAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove"; + return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2); + } + case Intrinsic::memset: { + const MemSetInst *MSI = cast<MemSetInst>(II); + // Don't handle volatile. + if (MSI->isVolatile()) + return false; + + if (!MSI->getLength()->getType()->isIntegerTy(64)) + return false; + + if (MSI->getDestAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); + } + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::pow: { + MVT RetVT; + if (!isTypeLegal(II->getType(), RetVT)) + return false; + + if (RetVT != MVT::f32 && RetVT != MVT::f64) + return false; + + static const RTLIB::Libcall LibCallTable[3][2] = { + { RTLIB::SIN_F32, RTLIB::SIN_F64 }, + { RTLIB::COS_F32, RTLIB::COS_F64 }, + { RTLIB::POW_F32, RTLIB::POW_F64 } + }; + RTLIB::Libcall LC; + bool Is64Bit = RetVT == MVT::f64; + switch (II->getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::sin: + LC = LibCallTable[0][Is64Bit]; + break; + case Intrinsic::cos: + LC = LibCallTable[1][Is64Bit]; + break; + case Intrinsic::pow: + LC = LibCallTable[2][Is64Bit]; + break; + } + + ArgListTy Args; + Args.reserve(II->getNumArgOperands()); + + // Populate the argument list. 
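+    // Illustrative example (editor's note, not from the original comment):
+    // @llvm.pow.f64(%x, %y) ends up as a libcall to pow(%x, %y) built from
+    // the argument list gathered below.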
+ for (auto &Arg : II->arg_operands()) { + ArgListEntry Entry; + Entry.Val = Arg; + Entry.Ty = Arg->getType(); + Args.push_back(Entry); + } + + CallLoweringInfo CLI; + MCContext &Ctx = MF->getContext(); + CLI.setCallee(DL, Ctx, TLI.getLibcallCallingConv(LC), II->getType(), + TLI.getLibcallName(LC), std::move(Args)); + if (!lowerCallTo(CLI)) + return false; + updateValueMap(II, CLI.ResultReg); + return true; + } + case Intrinsic::fabs: { + MVT VT; + if (!isTypeLegal(II->getType(), VT)) + return false; + + unsigned Opc; + switch (VT.SimpleTy) { + default: + return false; + case MVT::f32: + Opc = AArch64::FABSSr; + break; + case MVT::f64: + Opc = AArch64::FABSDr; + break; + } + unsigned SrcReg = getRegForValue(II->getOperand(0)); + if (!SrcReg) + return false; + bool SrcRegIsKill = hasTrivialKill(II->getOperand(0)); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg, getKillRegState(SrcRegIsKill)); + updateValueMap(II, ResultReg); + return true; + } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) + .addImm(1); + return true; + } + case Intrinsic::sqrt: { + Type *RetTy = II->getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Op0Reg = getRegForValue(II->getOperand(0)); + if (!Op0Reg) + return false; + bool Op0IsKill = hasTrivialKill(II->getOperand(0)); + + unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill); + if (!ResultReg) + return false; + + updateValueMap(II, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics. + const Function *Callee = II->getCalledFunction(); + auto *Ty = cast<StructType>(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + // Canonicalize immediate to the RHS. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + // Simplify multiplies. 
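+    // Illustrative example (editor's note, not from the original comment):
+    // @llvm.smul.with.overflow.i32(%x, 2) is rewritten below as
+    // @llvm.sadd.with.overflow.i32(%x, %x), avoiding the more expensive
+    // multiply-based overflow check.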
+ Intrinsic::ID IID = II->getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::smul_with_overflow: + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 2) { + IID = Intrinsic::sadd_with_overflow; + RHS = LHS; + } + break; + case Intrinsic::umul_with_overflow: + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 2) { + IID = Intrinsic::uadd_with_overflow; + RHS = LHS; + } + break; + } + + unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0; + AArch64CC::CondCode CC = AArch64CC::Invalid; + switch (IID) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true); + CC = AArch64CC::VS; + break; + case Intrinsic::uadd_with_overflow: + ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true); + CC = AArch64CC::HS; + break; + case Intrinsic::ssub_with_overflow: + ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true); + CC = AArch64CC::VS; + break; + case Intrinsic::usub_with_overflow: + ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true); + CC = AArch64CC::LO; + break; + case Intrinsic::smul_with_overflow: { + CC = AArch64CC::NE; + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return false; + bool RHSIsKill = hasTrivialKill(RHS); + + if (VT == MVT::i32) { + MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg, + /*IsKill=*/false, 32); + MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true, + AArch64::sub_32); + ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true, + AArch64::sub_32); + emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false, + AArch64_AM::ASR, 31, /*WantResult=*/false); + } else { + assert(VT == MVT::i64 && "Unexpected value type."); + // LHSReg and RHSReg cannot be killed by this Mul, since they are + // reused in the next instruction. + MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg, + /*IsKill=*/false); + unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false, + AArch64_AM::ASR, 63, /*WantResult=*/false); + } + break; + } + case Intrinsic::umul_with_overflow: { + CC = AArch64CC::NE; + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return false; + bool RHSIsKill = hasTrivialKill(RHS); + + if (VT == MVT::i32) { + MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg, + /*IsKill=*/false, AArch64_AM::LSR, 32, + /*WantResult=*/false); + MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true, + AArch64::sub_32); + } else { + assert(VT == MVT::i64 && "Unexpected value type."); + // LHSReg and RHSReg cannot be killed by this Mul, since they are + // reused in the next instruction. 
+ MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg, + /*IsKill=*/false); + unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg, + /*IsKill=*/false, /*WantResult=*/false); + } + break; + } + } + + if (MulReg) { + ResultReg1 = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg1).addReg(MulReg); + } + + ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass, + AArch64::WZR, /*IsKill=*/true, AArch64::WZR, + /*IsKill=*/true, getInvertedCondCode(CC)); + (void)ResultReg2; + assert((ResultReg1 + 1) == ResultReg2 && + "Nonconsecutive result registers."); + updateValueMap(II, ResultReg1, 2); + return true; + } + } + return false; +} + +bool AArch64FastISel::selectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + + if (!FuncInfo.CanLowerReturn) + return false; + + if (F.isVarArg()) + return false; + + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + + if (Ret->getNumOperands() > 0) { + CallingConv::ID CC = F.getCallingConv(); + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); + CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + CCInfo.AnalyzeReturn(Outs, RetCC); + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + const Value *RV = Ret->getOperand(0); + + // Don't bother handling odd stuff for now. + if ((VA.getLocInfo() != CCValAssign::Full) && + (VA.getLocInfo() != CCValAssign::BCvt)) + return false; + + // Only handle register returns for now. + if (!VA.isRegLoc()) + return false; + + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + unsigned SrcReg = Reg + VA.getValNo(); + unsigned DestReg = VA.getLocReg(); + // Avoid a cross-class copy. This is very unlikely. + if (!MRI.getRegClass(SrcReg)->contains(DestReg)) + return false; + + EVT RVEVT = TLI.getValueType(DL, RV->getType()); + if (!RVEVT.isSimple()) + return false; + + // Vectors (of > 1 lane) in big endian need tricky handling. + if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 && + !Subtarget->isLittleEndian()) + return false; + + MVT RVVT = RVEVT.getSimpleVT(); + if (RVVT == MVT::f128) + return false; + + MVT DestVT = VA.getValVT(); + // Special handling for extended integers. + if (RVVT != DestVT) { + if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + bool IsZExt = Outs[0].Flags.isZExt(); + SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt); + if (SrcReg == 0) + return false; + } + + // Make the copy. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg); + + // Add register to return instruction. 
+ RetRegs.push_back(VA.getLocReg()); + } + + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::RET_ReallyLR)); + for (unsigned RetReg : RetRegs) + MIB.addReg(RetReg, RegState::Implicit); + return true; +} + +bool AArch64FastISel::selectTrunc(const Instruction *I) { + Type *DestTy = I->getType(); + Value *Op = I->getOperand(0); + Type *SrcTy = Op->getType(); + + EVT SrcEVT = TLI.getValueType(DL, SrcTy, true); + EVT DestEVT = TLI.getValueType(DL, DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && + SrcVT != MVT::i8) + return false; + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 && + DestVT != MVT::i1) + return false; + + unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(Op); + + // If we're truncating from i64 to a smaller non-legal type then generate an + // AND. Otherwise, we know the high bits are undefined and a truncate only + // generate a COPY. We cannot mark the source register also as result + // register, because this can incorrectly transfer the kill flag onto the + // source register. + unsigned ResultReg; + if (SrcVT == MVT::i64) { + uint64_t Mask = 0; + switch (DestVT.SimpleTy) { + default: + // Trunc i64 to i32 is handled by the target-independent fast-isel. + return false; + case MVT::i1: + Mask = 0x1; + break; + case MVT::i8: + Mask = 0xff; + break; + case MVT::i16: + Mask = 0xffff; + break; + } + // Issue an extract_subreg to get the lower 32-bits. + unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, + AArch64::sub_32); + // Create the AND instruction which performs the actual truncation. + ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask); + assert(ResultReg && "Unexpected AND instruction emission failure."); + } else { + ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(SrcReg, getKillRegState(SrcIsKill)); + } + + updateValueMap(I, ResultReg); + return true; +} + +unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) { + assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || + DestVT == MVT::i64) && + "Unexpected value type."); + // Handle i8 and i16 as i32. + if (DestVT == MVT::i8 || DestVT == MVT::i16) + DestVT = MVT::i32; + + if (IsZExt) { + unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1); + assert(ResultReg && "Unexpected AND instruction emission failure."); + if (DestVT == MVT::i64) { + // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the + // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. + unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(ResultReg) + .addImm(AArch64::sub_32); + ResultReg = Reg64; + } + return ResultReg; + } else { + if (DestVT == MVT::i64) { + // FIXME: We're SExt i1 to i64. 
+ return 0; + } + return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg, + /*TODO:IsKill=*/false, 0, 0); + } +} + +unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + unsigned Opc, ZReg; + switch (RetVT.SimpleTy) { + default: return 0; + case MVT::i8: + case MVT::i16: + case MVT::i32: + RetVT = MVT::i32; + Opc = AArch64::MADDWrrr; ZReg = AArch64::WZR; break; + case MVT::i64: + Opc = AArch64::MADDXrrr; ZReg = AArch64::XZR; break; + } + + const TargetRegisterClass *RC = + (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill, + /*IsKill=*/ZReg, true); +} + +unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + if (RetVT != MVT::i64) + return 0; + + return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass, + Op0, Op0IsKill, Op1, Op1IsKill, + AArch64::XZR, /*IsKill=*/true); +} + +unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + if (RetVT != MVT::i64) + return 0; + + return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass, + Op0, Op0IsKill, Op1, Op1IsKill, + AArch64::XZR, /*IsKill=*/true); +} + +unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill) { + unsigned Opc = 0; + bool NeedTrunc = false; + uint64_t Mask = 0; + switch (RetVT.SimpleTy) { + default: return 0; + case MVT::i8: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xff; break; + case MVT::i16: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xffff; break; + case MVT::i32: Opc = AArch64::LSLVWr; break; + case MVT::i64: Opc = AArch64::LSLVXr; break; + } + + const TargetRegisterClass *RC = + (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + if (NeedTrunc) { + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); + Op1IsKill = true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, + Op1IsKill); + if (NeedTrunc) + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + return ResultReg; +} + +unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, + bool Op0IsKill, uint64_t Shift, + bool IsZExt) { + assert(RetVT.SimpleTy >= SrcVT.SimpleTy && + "Unexpected source/return type pair."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); + assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || + RetVT == MVT::i64) && "Unexpected return value type."); + + bool Is64Bit = (RetVT == MVT::i64); + unsigned RegSize = Is64Bit ? 64 : 32; + unsigned DstBits = RetVT.getSizeInBits(); + unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. + if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } + + // Don't deal with undefined shifts. + if (Shift >= DstBits) + return 0; + + // For immediate shifts we can fold the zero-/sign-extension into the shift. 
+ // {S|U}BFM Wd, Wn, #r, #s + // Wd<32+s-r,32-r> = Wn<s:0> when r > s + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = shl i16 %1, 4 + // Wd<32+7-28,32-28> = Wn<7:0> <- clamp s to 7 + // 0b1111_1111_1111_1111__1111_1010_1010_0000 sext + // 0b0000_0000_0000_0000__0000_0101_0101_0000 sext | zext + // 0b0000_0000_0000_0000__0000_1010_1010_0000 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = shl i16 %1, 8 + // Wd<32+7-24,32-24> = Wn<7:0> + // 0b1111_1111_1111_1111__1010_1010_0000_0000 sext + // 0b0000_0000_0000_0000__0101_0101_0000_0000 sext | zext + // 0b0000_0000_0000_0000__1010_1010_0000_0000 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = shl i16 %1, 12 + // Wd<32+3-20,32-20> = Wn<3:0> + // 0b1111_1111_1111_1111__1010_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0101_0000_0000_0000 sext | zext + // 0b0000_0000_0000_0000__1010_0000_0000_0000 zext + + unsigned ImmR = RegSize - Shift; + // Limit the width to the length of the source type. + unsigned ImmS = std::min<unsigned>(SrcBits - 1, DstBits - 1 - Shift); + static const unsigned OpcTable[2][2] = { + {AArch64::SBFMWri, AArch64::SBFMXri}, + {AArch64::UBFMWri, AArch64::UBFMXri} + }; + unsigned Opc = OpcTable[IsZExt][Is64Bit]; + if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { + unsigned TmpReg = MRI.createVirtualRegister(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), TmpReg) + .addImm(0) + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(AArch64::sub_32); + Op0 = TmpReg; + Op0IsKill = true; + } + return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); +} + +unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill) { + unsigned Opc = 0; + bool NeedTrunc = false; + uint64_t Mask = 0; + switch (RetVT.SimpleTy) { + default: return 0; + case MVT::i8: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xff; break; + case MVT::i16: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xffff; break; + case MVT::i32: Opc = AArch64::LSRVWr; break; + case MVT::i64: Opc = AArch64::LSRVXr; break; + } + + const TargetRegisterClass *RC = + (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + if (NeedTrunc) { + Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask); + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); + Op0IsKill = Op1IsKill = true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, + Op1IsKill); + if (NeedTrunc) + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + return ResultReg; +} + +unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, + bool Op0IsKill, uint64_t Shift, + bool IsZExt) { + assert(RetVT.SimpleTy >= SrcVT.SimpleTy && + "Unexpected source/return type pair."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); + assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || + RetVT == MVT::i64) && "Unexpected return value type."); + + bool Is64Bit = (RetVT == MVT::i64); + unsigned RegSize = Is64Bit ? 64 : 32; + unsigned DstBits = RetVT.getSizeInBits(); + unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. 
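+  // Illustrative note (editor's addition, not from the original comment):
+  // for a zero shift amount no UBFM is needed at all; we either copy the
+  // source register or just zero-/sign-extend it to the return type.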
+ if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } + + // Don't deal with undefined shifts. + if (Shift >= DstBits) + return 0; + + // For immediate shifts we can fold the zero-/sign-extension into the shift. + // {S|U}BFM Wd, Wn, #r, #s + // Wd<s-r:0> = Wn<s:r> when r <= s + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = lshr i16 %1, 4 + // Wd<7-4:0> = Wn<7:4> + // 0b0000_0000_0000_0000__0000_1111_1111_1010 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext + // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = lshr i16 %1, 8 + // Wd<7-7,0> = Wn<7:7> + // 0b0000_0000_0000_0000__0000_0000_1111_1111 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = lshr i16 %1, 12 + // Wd<7-7,0> = Wn<7:7> <- clamp r to 7 + // 0b0000_0000_0000_0000__0000_0000_0000_1111 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext + + if (Shift >= SrcBits && IsZExt) + return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT); + + // It is not possible to fold a sign-extend into the LShr instruction. In this + // case emit a sign-extend. + if (!IsZExt) { + Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt); + if (!Op0) + return 0; + Op0IsKill = true; + SrcVT = RetVT; + SrcBits = SrcVT.getSizeInBits(); + IsZExt = true; + } + + unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift); + unsigned ImmS = SrcBits - 1; + static const unsigned OpcTable[2][2] = { + {AArch64::SBFMWri, AArch64::SBFMXri}, + {AArch64::UBFMWri, AArch64::UBFMXri} + }; + unsigned Opc = OpcTable[IsZExt][Is64Bit]; + if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { + unsigned TmpReg = MRI.createVirtualRegister(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), TmpReg) + .addImm(0) + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(AArch64::sub_32); + Op0 = TmpReg; + Op0IsKill = true; + } + return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); +} + +unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill) { + unsigned Opc = 0; + bool NeedTrunc = false; + uint64_t Mask = 0; + switch (RetVT.SimpleTy) { + default: return 0; + case MVT::i8: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xff; break; + case MVT::i16: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xffff; break; + case MVT::i32: Opc = AArch64::ASRVWr; break; + case MVT::i64: Opc = AArch64::ASRVXr; break; + } + + const TargetRegisterClass *RC = + (RetVT == MVT::i64) ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + if (NeedTrunc) { + Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false); + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); + Op0IsKill = Op1IsKill = true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, + Op1IsKill); + if (NeedTrunc) + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + return ResultReg; +} + +unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, + bool Op0IsKill, uint64_t Shift, + bool IsZExt) { + assert(RetVT.SimpleTy >= SrcVT.SimpleTy && + "Unexpected source/return type pair."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); + assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || + RetVT == MVT::i64) && "Unexpected return value type."); + + bool Is64Bit = (RetVT == MVT::i64); + unsigned RegSize = Is64Bit ? 64 : 32; + unsigned DstBits = RetVT.getSizeInBits(); + unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. + if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } + + // Don't deal with undefined shifts. + if (Shift >= DstBits) + return 0; + + // For immediate shifts we can fold the zero-/sign-extension into the shift. + // {S|U}BFM Wd, Wn, #r, #s + // Wd<s-r:0> = Wn<s:r> when r <= s + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = ashr i16 %1, 4 + // Wd<7-4:0> = Wn<7:4> + // 0b1111_1111_1111_1111__1111_1111_1111_1010 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext + // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = ashr i16 %1, 8 + // Wd<7-7,0> = Wn<7:7> + // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = ashr i16 %1, 12 + // Wd<7-7,0> = Wn<7:7> <- clamp r to 7 + // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext + + if (Shift >= SrcBits && IsZExt) + return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT); + + unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift); + unsigned ImmS = SrcBits - 1; + static const unsigned OpcTable[2][2] = { + {AArch64::SBFMWri, AArch64::SBFMXri}, + {AArch64::UBFMWri, AArch64::UBFMXri} + }; + unsigned Opc = OpcTable[IsZExt][Is64Bit]; + if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { + unsigned TmpReg = MRI.createVirtualRegister(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), TmpReg) + .addImm(0) + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(AArch64::sub_32); + Op0 = TmpReg; + Op0IsKill = true; + } + return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); +} + +unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + bool IsZExt) { + assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); + + // FastISel does not have plumbing to 
deal with extensions where the SrcVT or + // DestVT are odd things, so test to make sure that they are both types we can + // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise + // bail out to SelectionDAG. + if (((DestVT != MVT::i8) && (DestVT != MVT::i16) && + (DestVT != MVT::i32) && (DestVT != MVT::i64)) || + ((SrcVT != MVT::i1) && (SrcVT != MVT::i8) && + (SrcVT != MVT::i16) && (SrcVT != MVT::i32))) + return 0; + + unsigned Opc; + unsigned Imm = 0; + + switch (SrcVT.SimpleTy) { + default: + return 0; + case MVT::i1: + return emiti1Ext(SrcReg, DestVT, IsZExt); + case MVT::i8: + if (DestVT == MVT::i64) + Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + else + Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Imm = 7; + break; + case MVT::i16: + if (DestVT == MVT::i64) + Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + else + Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Imm = 15; + break; + case MVT::i32: + assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?"); + Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + Imm = 31; + break; + } + + // Handle i8 and i16 as i32. + if (DestVT == MVT::i8 || DestVT == MVT::i16) + DestVT = MVT::i32; + else if (DestVT == MVT::i64) { + unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Src64) + .addImm(0) + .addReg(SrcReg) + .addImm(AArch64::sub_32); + SrcReg = Src64; + } + + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm); +} + +static bool isZExtLoad(const MachineInstr *LI) { + switch (LI->getOpcode()) { + default: + return false; + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRBBroX: + case AArch64::LDRHHroX: + case AArch64::LDRWroX: + case AArch64::LDRBBroW: + case AArch64::LDRHHroW: + case AArch64::LDRWroW: + return true; + } +} + +static bool isSExtLoad(const MachineInstr *LI) { + switch (LI->getOpcode()) { + default: + return false; + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDRSBWui: + case AArch64::LDRSHWui: + case AArch64::LDRSBXui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::LDRSBWroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + return true; + } +} + +bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, + MVT SrcVT) { + const auto *LI = dyn_cast<LoadInst>(I->getOperand(0)); + if (!LI || !LI->hasOneUse()) + return false; + + // Check if the load instruction has already been selected. + unsigned Reg = lookUpRegForValue(LI); + if (!Reg) + return false; + + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (!MI) + return false; + + // Check if the correct load instruction has been emitted - SelectionDAG might + // have emitted a zero-extending load, but we need a sign-extending load. 
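+  // Illustrative example (editor's note, not from the original comment): for
+  //   %b = load i8, i8* %p
+  //   %s = sext i8 %b to i32
+  // the load must have been selected as a sign-extending LDRSB form; if a
+  // zero-extending LDRB was emitted instead, bail out and extend explicitly.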
+ bool IsZExt = isa<ZExtInst>(I); + const auto *LoadMI = MI; + if (LoadMI->getOpcode() == TargetOpcode::COPY && + LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) { + unsigned LoadReg = MI->getOperand(1).getReg(); + LoadMI = MRI.getUniqueVRegDef(LoadReg); + assert(LoadMI && "Expected valid instruction"); + } + if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI))) + return false; + + // Nothing to be done. + if (RetVT != MVT::i64 || SrcVT > MVT::i32) { + updateValueMap(I, Reg); + return true; + } + + if (IsZExt) { + unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(Reg, getKillRegState(true)) + .addImm(AArch64::sub_32); + Reg = Reg64; + } else { + assert((MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getSubReg() == AArch64::sub_32) && + "Expected copy instruction"); + Reg = MI->getOperand(1).getReg(); + MI->eraseFromParent(); + } + updateValueMap(I, Reg); + return true; +} + +bool AArch64FastISel::selectIntExt(const Instruction *I) { + assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) && + "Unexpected integer extend instruction."); + MVT RetVT; + MVT SrcVT; + if (!isTypeSupported(I->getType(), RetVT)) + return false; + + if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT)) + return false; + + // Try to optimize already sign-/zero-extended values from load instructions. + if (optimizeIntExtLoad(I, RetVT, SrcVT)) + return true; + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(I->getOperand(0)); + + // Try to optimize already sign-/zero-extended values from function arguments. + bool IsZExt = isa<ZExtInst>(I); + if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) { + if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) { + if (RetVT == MVT::i64 && SrcVT != MVT::i64) { + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), ResultReg) + .addImm(0) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(AArch64::sub_32); + SrcReg = ResultReg; + } + // Conservatively clear all kill flags from all uses, because we are + // replacing a sign-/zero-extend instruction at IR level with a nop at MI + // level. The result of the instruction at IR level might have been + // trivially dead, which is now not longer true. + unsigned UseReg = lookUpRegForValue(I); + if (UseReg) + MRI.clearKillFlags(UseReg); + + updateValueMap(I, SrcReg); + return true; + } + } + + unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt); + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) { + EVT DestEVT = TLI.getValueType(DL, I->getType(), true); + if (!DestEVT.isSimple()) + return false; + + MVT DestVT = DestEVT.getSimpleVT(); + if (DestVT != MVT::i64 && DestVT != MVT::i32) + return false; + + unsigned DivOpc; + bool Is64bit = (DestVT == MVT::i64); + switch (ISDOpcode) { + default: + return false; + case ISD::SREM: + DivOpc = Is64bit ? AArch64::SDIVXr : AArch64::SDIVWr; + break; + case ISD::UREM: + DivOpc = Is64bit ? AArch64::UDIVXr : AArch64::UDIVWr; + break; + } + unsigned MSubOpc = Is64bit ? 
AArch64::MSUBXrrr : AArch64::MSUBWrrr; + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(I->getOperand(0)); + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + bool Src1IsKill = hasTrivialKill(I->getOperand(1)); + + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false, + Src1Reg, /*IsKill=*/false); + assert(QuotReg && "Unexpected DIV instruction emission failure."); + // The remainder is computed as numerator - (quotient * denominator) using the + // MSUB instruction. + unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true, + Src1Reg, Src1IsKill, Src0Reg, + Src0IsKill); + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectMul(const Instruction *I) { + MVT VT; + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) + return false; + + if (VT.isVector()) + return selectBinaryOp(I, ISD::MUL); + + const Value *Src0 = I->getOperand(0); + const Value *Src1 = I->getOperand(1); + if (const auto *C = dyn_cast<ConstantInt>(Src0)) + if (C->getValue().isPowerOf2()) + std::swap(Src0, Src1); + + // Try to simplify to a shift instruction. + if (const auto *C = dyn_cast<ConstantInt>(Src1)) + if (C->getValue().isPowerOf2()) { + uint64_t ShiftVal = C->getValue().logBase2(); + MVT SrcVT = VT; + bool IsZExt = true; + if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) { + if (!isIntExtFree(ZExt)) { + MVT VT; + if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) { + SrcVT = VT; + IsZExt = true; + Src0 = ZExt->getOperand(0); + } + } + } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) { + if (!isIntExtFree(SExt)) { + MVT VT; + if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) { + SrcVT = VT; + IsZExt = false; + Src0 = SExt->getOperand(0); + } + } + } + + unsigned Src0Reg = getRegForValue(Src0); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(Src0); + + unsigned ResultReg = + emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt); + + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; + } + } + + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(I->getOperand(0)); + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + bool Src1IsKill = hasTrivialKill(I->getOperand(1)); + + unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill); + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectShift(const Instruction *I) { + MVT RetVT; + if (!isTypeSupported(I->getType(), RetVT, /*IsVectorAllowed=*/true)) + return false; + + if (RetVT.isVector()) + return selectOperator(I, I->getOpcode()); + + if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) { + unsigned ResultReg = 0; + uint64_t ShiftVal = C->getZExtValue(); + MVT SrcVT = RetVT; + bool IsZExt = I->getOpcode() != Instruction::AShr; + const Value *Op0 = I->getOperand(0); + if (const auto *ZExt = dyn_cast<ZExtInst>(Op0)) { + if (!isIntExtFree(ZExt)) { + MVT TmpVT; + if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) { + SrcVT = TmpVT; + IsZExt = true; + Op0 = ZExt->getOperand(0); + } + } + } else if (const auto *SExt = dyn_cast<SExtInst>(Op0)) { + if (!isIntExtFree(SExt)) { + MVT 
TmpVT; + if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) { + SrcVT = TmpVT; + IsZExt = false; + Op0 = SExt->getOperand(0); + } + } + } + + unsigned Op0Reg = getRegForValue(Op0); + if (!Op0Reg) + return false; + bool Op0IsKill = hasTrivialKill(Op0); + + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected instruction."); + case Instruction::Shl: + ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + break; + case Instruction::AShr: + ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + break; + case Instruction::LShr: + ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + break; + } + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; + } + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (!Op0Reg) + return false; + bool Op0IsKill = hasTrivialKill(I->getOperand(0)); + + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (!Op1Reg) + return false; + bool Op1IsKill = hasTrivialKill(I->getOperand(1)); + + unsigned ResultReg = 0; + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected instruction."); + case Instruction::Shl: + ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + break; + case Instruction::AShr: + ResultReg = emitASR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + break; + case Instruction::LShr: + ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + break; + } + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectBitCast(const Instruction *I) { + MVT RetVT, SrcVT; + + if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT)) + return false; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + unsigned Opc; + if (RetVT == MVT::f32 && SrcVT == MVT::i32) + Opc = AArch64::FMOVWSr; + else if (RetVT == MVT::f64 && SrcVT == MVT::i64) + Opc = AArch64::FMOVXDr; + else if (RetVT == MVT::i32 && SrcVT == MVT::f32) + Opc = AArch64::FMOVSWr; + else if (RetVT == MVT::i64 && SrcVT == MVT::f64) + Opc = AArch64::FMOVDXr; + else + return false; + + const TargetRegisterClass *RC = nullptr; + switch (RetVT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: RC = &AArch64::GPR32RegClass; break; + case MVT::i64: RC = &AArch64::GPR64RegClass; break; + case MVT::f32: RC = &AArch64::FPR32RegClass; break; + case MVT::f64: RC = &AArch64::FPR64RegClass; break; + } + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (!Op0Reg) + return false; + bool Op0IsKill = hasTrivialKill(I->getOperand(0)); + unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill); + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectFRem(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + RTLIB::Libcall LC; + switch (RetVT.SimpleTy) { + default: + return false; + case MVT::f32: + LC = RTLIB::REM_F32; + break; + case MVT::f64: + LC = RTLIB::REM_F64; + break; + } + + ArgListTy Args; + Args.reserve(I->getNumOperands()); + + // Populate the argument list. 
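+  // Illustrative example (editor's note, not from the original comment):
+  // 'frem double %x, %y' becomes a call to the fmod libcall with the
+  // arguments gathered below.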
+ for (auto &Arg : I->operands()) { + ArgListEntry Entry; + Entry.Val = Arg; + Entry.Ty = Arg->getType(); + Args.push_back(Entry); + } + + CallLoweringInfo CLI; + MCContext &Ctx = MF->getContext(); + CLI.setCallee(DL, Ctx, TLI.getLibcallCallingConv(LC), I->getType(), + TLI.getLibcallName(LC), std::move(Args)); + if (!lowerCallTo(CLI)) + return false; + updateValueMap(I, CLI.ResultReg); + return true; +} + +bool AArch64FastISel::selectSDiv(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + if (!isa<ConstantInt>(I->getOperand(1))) + return selectBinaryOp(I, ISD::SDIV); + + const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue(); + if ((VT != MVT::i32 && VT != MVT::i64) || !C || + !(C.isPowerOf2() || (-C).isPowerOf2())) + return selectBinaryOp(I, ISD::SDIV); + + unsigned Lg2 = C.countTrailingZeros(); + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(I->getOperand(0)); + + if (cast<BinaryOperator>(I)->isExact()) { + unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2); + if (!ResultReg) + return false; + updateValueMap(I, ResultReg); + return true; + } + + int64_t Pow2MinusOne = (1ULL << Lg2) - 1; + unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne); + if (!AddReg) + return false; + + // (Src0 < 0) ? Pow2 - 1 : 0; + if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0)) + return false; + + unsigned SelectOpc; + const TargetRegisterClass *RC; + if (VT == MVT::i64) { + SelectOpc = AArch64::CSELXr; + RC = &AArch64::GPR64RegClass; + } else { + SelectOpc = AArch64::CSELWr; + RC = &AArch64::GPR32RegClass; + } + unsigned SelectReg = + fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg, + Src0IsKill, AArch64CC::LT); + if (!SelectReg) + return false; + + // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also + // negate the result. + unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; + unsigned ResultReg; + if (C.isNegative()) + ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true, + SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2); + else + ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2); + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +/// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We +/// have to duplicate it for AArch64, because otherwise we would fail during the +/// sign-extend emission. +std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) { + unsigned IdxN = getRegForValue(Idx); + if (IdxN == 0) + // Unhandled operand. Halt "fast" selection and bail. + return std::pair<unsigned, bool>(0, false); + + bool IdxNIsKill = hasTrivialKill(Idx); + + // If the index is smaller or larger than intptr_t, truncate or extend it. + MVT PtrVT = TLI.getPointerTy(DL); + EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); + if (IdxVT.bitsLT(PtrVT)) { + IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false); + IdxNIsKill = true; + } else if (IdxVT.bitsGT(PtrVT)) + llvm_unreachable("AArch64 FastISel doesn't support types larger than i64"); + return std::pair<unsigned, bool>(IdxN, IdxNIsKill); +} + +/// This is mostly a copy of the existing FastISel GEP code, but we have to +/// duplicate it for AArch64, because otherwise we would bail out even for +/// simple cases. 
This is because the standard fastEmit functions don't cover +/// MUL at all and ADD is lowered very inefficientily. +bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { + unsigned N = getRegForValue(I->getOperand(0)); + if (!N) + return false; + bool NIsKill = hasTrivialKill(I->getOperand(0)); + + // Keep a running tab of the total offset to coalesce multiple N = N + Offset + // into a single N = N + TotalOffset. + uint64_t TotalOffs = 0; + Type *Ty = I->getOperand(0)->getType(); + MVT VT = TLI.getPointerTy(DL); + for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { + const Value *Idx = *OI; + if (auto *StTy = dyn_cast<StructType>(Ty)) { + unsigned Field = cast<ConstantInt>(Idx)->getZExtValue(); + // N = N + Offset + if (Field) + TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); + Ty = StTy->getElementType(Field); + } else { + Ty = cast<SequentialType>(Ty)->getElementType(); + // If this is a constant subscript, handle it quickly. + if (const auto *CI = dyn_cast<ConstantInt>(Idx)) { + if (CI->isZero()) + continue; + // N = N + Offset + TotalOffs += + DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue(); + continue; + } + if (TotalOffs) { + N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + if (!N) + return false; + NIsKill = true; + TotalOffs = 0; + } + + // N = N + Idx * ElementSize; + uint64_t ElementSize = DL.getTypeAllocSize(Ty); + std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx); + unsigned IdxN = Pair.first; + bool IdxNIsKill = Pair.second; + if (!IdxN) + return false; + + if (ElementSize != 1) { + unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize); + if (!C) + return false; + IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true); + if (!IdxN) + return false; + IdxNIsKill = true; + } + N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill); + if (!N) + return false; + } + } + if (TotalOffs) { + N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + if (!N) + return false; + } + updateValueMap(I, N); + return true; +} + +bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: + break; + case Instruction::Add: + case Instruction::Sub: + return selectAddSub(I); + case Instruction::Mul: + return selectMul(I); + case Instruction::SDiv: + return selectSDiv(I); + case Instruction::SRem: + if (!selectBinaryOp(I, ISD::SREM)) + return selectRem(I, ISD::SREM); + return true; + case Instruction::URem: + if (!selectBinaryOp(I, ISD::UREM)) + return selectRem(I, ISD::UREM); + return true; + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return selectShift(I); + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return selectLogicalOp(I); + case Instruction::Br: + return selectBranch(I); + case Instruction::IndirectBr: + return selectIndirectBr(I); + case Instruction::BitCast: + if (!FastISel::selectBitCast(I)) + return selectBitCast(I); + return true; + case Instruction::FPToSI: + if (!selectCast(I, ISD::FP_TO_SINT)) + return selectFPToInt(I, /*Signed=*/true); + return true; + case Instruction::FPToUI: + return selectFPToInt(I, /*Signed=*/false); + case Instruction::ZExt: + case Instruction::SExt: + return selectIntExt(I); + case Instruction::Trunc: + if (!selectCast(I, ISD::TRUNCATE)) + return selectTrunc(I); + return true; + case Instruction::FPExt: + return selectFPExt(I); + case Instruction::FPTrunc: + return selectFPTrunc(I); + case Instruction::SIToFP: + if (!selectCast(I, ISD::SINT_TO_FP)) + return selectIntToFP(I, 
/*Signed=*/true); + return true; + case Instruction::UIToFP: + return selectIntToFP(I, /*Signed=*/false); + case Instruction::Load: + return selectLoad(I); + case Instruction::Store: + return selectStore(I); + case Instruction::FCmp: + case Instruction::ICmp: + return selectCmp(I); + case Instruction::Select: + return selectSelect(I); + case Instruction::Ret: + return selectRet(I); + case Instruction::FRem: + return selectFRem(I); + case Instruction::GetElementPtr: + return selectGetElementPtr(I); + } + + // fall-back to target-independent instruction selection. + return selectOperator(I, I->getOpcode()); + // Silence warnings. + (void)&CC_AArch64_DarwinPCS_VarArg; +} + +namespace llvm { +llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) { + return new AArch64FastISel(FuncInfo, LibInfo); +} +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp new file mode 100644 index 0000000..11ae800 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -0,0 +1,1012 @@ +//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of TargetFrameLowering class. +// +// On AArch64, stack frames are structured as follows: +// +// The stack grows downward. +// +// All of the individual frame areas on the frame below are optional, i.e. it's +// possible to create a function so that the particular area isn't present +// in the frame. +// +// At function entry, the "frame" looks as follows: +// +// | | Higher address +// |-----------------------------------| +// | | +// | arguments passed on the stack | +// | | +// |-----------------------------------| <- sp +// | | Lower address +// +// +// After the prologue has run, the frame has the following general structure. +// Note that this doesn't depict the case where a red-zone is used. Also, +// technically the last frame area (VLAs) doesn't get created until in the +// main function body, after the prologue is run. However, it's depicted here +// for completeness. +// +// | | Higher address +// |-----------------------------------| +// | | +// | arguments passed on the stack | +// | | +// |-----------------------------------| +// | | +// | prev_fp, prev_lr | +// | (a.k.a. 
"frame record") | +// |-----------------------------------| <- fp(=x29) +// | | +// | other callee-saved registers | +// | | +// |-----------------------------------| +// |.empty.space.to.make.part.below....| +// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at +// |.the.standard.16-byte.alignment....| compile time; if present) +// |-----------------------------------| +// | | +// | local variables of fixed size | +// | including spill slots | +// |-----------------------------------| <- bp(not defined by ABI, +// |.variable-sized.local.variables....| LLVM chooses X19) +// |.(VLAs)............................| (size of this area is unknown at +// |...................................| compile time) +// |-----------------------------------| <- sp +// | | Lower address +// +// +// To access the data in a frame, at-compile time, a constant offset must be +// computable from one of the pointers (fp, bp, sp) to access it. The size +// of the areas with a dotted background cannot be computed at compile-time +// if they are present, making it required to have all three of fp, bp and +// sp to be set up to be able to access all contents in the frame areas, +// assuming all of the frame areas are non-empty. +// +// For most functions, some of the frame areas are empty. For those functions, +// it may not be necessary to set up fp or bp: +// * A base pointer is definitely needed when there are both VLAs and local +// variables with more-than-default alignment requirements. +// * A frame pointer is definitely needed when there are local variables with +// more-than-default alignment requirements. +// +// In some cases when a base pointer is not strictly needed, it is generated +// anyway when offsets from the frame pointer to access local variables become +// so large that the offset can't be encoded in the immediate fields of loads +// or stores. +// +// FIXME: also explain the redzone concept. +// FIXME: also explain the concept of reserved call frames. +// +//===----------------------------------------------------------------------===// + +#include "AArch64FrameLowering.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "frame-info" + +static cl::opt<bool> EnableRedZone("aarch64-redzone", + cl::desc("enable use of redzone on AArch64"), + cl::init(false), cl::Hidden); + +STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); + +bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { + if (!EnableRedZone) + return false; + // Don't use the red zone if the function explicitly asks us not to. + // This is typically used for kernel code. 
+ if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone)) + return false; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + unsigned NumBytes = AFI->getLocalStackSize(); + + // Note: currently hasFP() is always true for hasCalls(), but that's an + // implementation detail of the current code, not a strict requirement, + // so stay safe here and check both. + if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) + return false; + return true; +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. +bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + return (MFI->hasCalls() || MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken() || MFI->hasStackMap() || + MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF)); +} + +/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is +/// not required, we reserve argument space for call sites in the function +/// immediately on entry to the current function. This eliminates the need for +/// add/sub sp brackets around call sites. Returns true if the call frame is +/// included as part of the stack frame. +bool +AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +void AArch64FrameLowering::eliminateCallFramePseudoInstr( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const AArch64InstrInfo *TII = + static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + DebugLoc DL = I->getDebugLoc(); + unsigned Opc = I->getOpcode(); + bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); + uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; + + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + if (!TFI->hasReservedCallFrame(MF)) { + unsigned Align = getStackAlignment(); + + int64_t Amount = I->getOperand(0).getImm(); + Amount = RoundUpToAlignment(Amount, Align); + if (!IsDestroy) + Amount = -Amount; + + // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it + // doesn't have to pop anything), then the first operand will be zero too so + // this adjustment is a no-op. + if (CalleePopAmount == 0) { + // FIXME: in-function stack adjustment for calls is limited to 24-bits + // because there's no guaranteed temporary register available. + // + // ADD/SUB (immediate) has only LSL #0 and LSL #12 available. + // 1) For offset <= 12-bit, we use LSL #0 + // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses + // LSL #0, and the other uses LSL #12. + // + // Mostly call frames will be allocated at the start of a function so + // this is OK, but it is a limitation that needs dealing with. + assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); + } + } else if (CalleePopAmount != 0) { + // If the calling convention demands that the callee pops arguments from the + // stack, we want to add it back if we have a reserved call frame. 
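As a minimal sketch of the LSL #0 / LSL #12 split described above (illustrative values only): an adjustment of 0x12345 bytes would be emitted as something like sub sp, sp, #0x12, lsl #12 followed by sub sp, sp, #0x345.

#include <cassert>
#include <cstdint>

// How a 24-bit SP adjustment decomposes into two ADD/SUB immediates, e.g.
//   sub sp, sp, #0x12, lsl #12   // 0x12000
//   sub sp, sp, #0x345           // 0x345
int main() {
  uint32_t Amount = 0x12345;
  uint32_t Hi = Amount >> 12;    // immediate used with LSL #12
  uint32_t Lo = Amount & 0xfff;  // immediate used with LSL #0
  assert(Hi <= 0xfff && Lo <= 0xfff);
  assert((Hi << 12) + Lo == Amount);
  return 0;
}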
+ assert(CalleePopAmount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, + TII); + } + MBB.erase(I); +} + +void AArch64FrameLowering::emitCalleeSavedFrameMoves( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const DataLayout &TD = MF.getDataLayout(); + bool HasFP = hasFP(MF); + + // Calculate amount of bytes used for return address storing. + int stackGrowth = -TD.getPointerSize(0); + + // Calculate offsets. + int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; + unsigned TotalSkipped = 0; + for (const auto &Info : CSI) { + unsigned Reg = Info.getReg(); + int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - + getOffsetOfLocalArea() + saveAreaOffset; + + // Don't output a new CFI directive if we're re-saving the frame pointer or + // link register. This happens when the PrologEpilogInserter has inserted an + // extra "STP" of the frame pointer and link register -- the "emitPrologue" + // method automatically generates the directives when frame pointers are + // used. If we generate CFI directives for the extra "STP"s, the linker will + // lose track of the correct values for the frame pointer and link register. + if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { + TotalSkipped += stackGrowth; + continue; + } + + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, DwarfReg, Offset - TotalSkipped)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } +} + +/// Get FPOffset by analyzing the first instruction. +static int getFPOffsetInPrologue(MachineInstr *MBBI) { + // First instruction must a) allocate the stack and b) have an immediate + // that is a multiple of -2. + assert(((MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) && + MBBI->getOperand(3).getReg() == AArch64::SP && + MBBI->getOperand(4).getImm() < 0 && + (MBBI->getOperand(4).getImm() & 1) == 0)); + + // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space + // required for the callee saved register area we get the frame pointer + // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. 
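A worked instance of the formula just derived (illustrative only), using the stp x22, x21, [sp, #-48]! prologue that appears as an example elsewhere in this file, whose pre-index immediate is -6 in 8-byte units:

#include <cassert>

int main() {
  int StpImm = -6;                   // getOperand(4).getImm() for "stp x22, x21, [sp, #-48]!"
  int FPOffset = -(StpImm + 2) * 8;  // the formula used by getFPOffsetInPrologue()
  assert(FPOffset == 32);            // fp = sp + 32, i.e. 16 bytes below the entry sp
  return 0;
}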
+ int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; + assert(FPOffset >= 0 && "Bad Framepointer Offset"); + return FPOffset; +} + +static bool isCSSave(MachineInstr *MBBI) { + return MBBI->getOpcode() == AArch64::STPXi || + MBBI->getOpcode() == AArch64::STPDi || + MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre; +} + +void AArch64FrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.begin(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); + bool HasFP = hasFP(MF); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + + int NumBytes = (int)MFI->getStackSize(); + if (!AFI->hasStackFrame()) { + assert(!HasFP && "unexpected function without stack frame but with FP"); + + // All of the stack allocation is for locals. + AFI->setLocalStackSize(NumBytes); + + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); + + // REDZONE: If the stack size is less than 128 bytes, we don't need + // to actually allocate. + if (NumBytes && !canUseRedZone(MF)) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + + // Encode the stack size of the leaf function. + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } else if (NumBytes) { + ++NumRedZoneFunctions; + } + + return; + } + + // Only set up FP if we actually need to. + int FPOffset = 0; + if (HasFP) + FPOffset = getFPOffsetInPrologue(MBBI); + + // Move past the saves of the callee-saved registers. + while (isCSSave(MBBI)) { + ++MBBI; + NumBytes -= 16; + } + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + if (HasFP) { + // Issue sub fp, sp, FPOffset or + // mov fp,sp when FPOffset is zero. + // Note: All stores of callee-saved registers are marked as "FrameSetup". + // This code marks the instruction(s) that set the FP also. + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, + MachineInstr::FrameSetup); + } + + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes); + + // Allocate space for the rest of the frame. + + const unsigned Alignment = MFI->getMaxAlignment(); + const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + unsigned scratchSPReg = AArch64::SP; + if (NumBytes && NeedsRealignment) { + // Use the first callee-saved register as a scratch register. + scratchSPReg = AArch64::X9; + } + + // If we're a leaf function, try using the red zone. 
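A restated sketch of the red-zone test that the call below re-checks (canUseRedZoneSketch() is a hypothetical stand-in for the real query of MachineFrameInfo and the function attributes): the function must make no calls, need no frame pointer, and keep at most 128 bytes of locals; when that holds, the sub sp is skipped and locals sit at negative offsets below sp.

// Hypothetical restatement of the conditions checked by canUseRedZone().
bool canUseRedZoneSketch(bool RedZoneEnabled, bool HasNoRedZoneAttr,
                         bool HasCalls, bool HasFP, unsigned LocalStackSize) {
  if (!RedZoneEnabled || HasNoRedZoneAttr)
    return false;
  return !HasCalls && !HasFP && LocalStackSize <= 128;
}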
+ if (NumBytes && !canUseRedZone(MF)) + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + + if (NumBytes && NeedsRealignment) { + const unsigned NrBitsToZero = countTrailingZeros(Alignment); + assert(NrBitsToZero > 1); + assert(scratchSPReg != AArch64::SP); + + // SUB X9, SP, NumBytes + // -- X9 is temporary register, so shouldn't contain any live data here, + // -- free to use. This is already produced by emitFrameOffset above. + // AND SP, X9, 0b11111...0000 + // The logical immediates have a non-trivial encoding. The following + // formula computes the encoded immediate with all ones but + // NrBitsToZero zero bits as least significant bits. + uint32_t andMaskEncoded = + (1 <<12) // = N + | ((64-NrBitsToZero) << 6) // immr + | ((64-NrBitsToZero-1) << 0) // imms + ; + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + } + + // If we need a base pointer, set it up here. It's whatever the value of the + // stack pointer is at this point. Any variable size objects will be allocated + // after this, so we can still use the base pointer to reference locals. + // + // FIXME: Clarify FrameSetup flags here. + // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is + // needed. + if (RegInfo->hasBasePointer(MF)) { + TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP, + false); + } + + if (needsFrameMoves) { + const DataLayout &TD = MF.getDataLayout(); + const int StackGrowth = -TD.getPointerSize(0); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + // An example of the prologue: + // + // .globl __foo + // .align 2 + // __foo: + // Ltmp0: + // .cfi_startproc + // .cfi_personality 155, ___gxx_personality_v0 + // Leh_func_begin: + // .cfi_lsda 16, Lexception33 + // + // stp xa,bx, [sp, -#offset]! + // ... + // stp x28, x27, [sp, #offset-32] + // stp fp, lr, [sp, #offset-16] + // add fp, sp, #offset - 16 + // sub sp, sp, #1360 + // + // The Stack: + // +-------------------------------------------+ + // 10000 | ........ | ........ | ........ | ........ | + // 10004 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10008 | ........ | ........ | ........ | ........ | + // 1000c | ........ | ........ | ........ | ........ | + // +===========================================+ + // 10010 | X28 Register | + // 10014 | X28 Register | + // +-------------------------------------------+ + // 10018 | X27 Register | + // 1001c | X27 Register | + // +===========================================+ + // 10020 | Frame Pointer | + // 10024 | Frame Pointer | + // +-------------------------------------------+ + // 10028 | Link Register | + // 1002c | Link Register | + // +===========================================+ + // 10030 | ........ | ........ | ........ | ........ | + // 10034 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10038 | ........ | ........ | ........ | ........ | + // 1003c | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // + // [sp] = 10030 :: >>initial value<< + // sp = 10020 :: stp fp, lr, [sp, #-16]! + // fp = sp == 10020 :: mov fp, sp + // [sp] == 10020 :: stp x28, x27, [sp, #-16]! 
+ // sp == 10010 :: >>final value<< + // + // The frame pointer (w29) points to address 10020. If we use an offset of + // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 + // for w27, and -32 for w28: + // + // Ltmp1: + // .cfi_def_cfa w29, 16 + // Ltmp2: + // .cfi_offset w30, -8 + // Ltmp3: + // .cfi_offset w29, -16 + // Ltmp4: + // .cfi_offset w27, -24 + // Ltmp5: + // .cfi_offset w28, -32 + + if (HasFP) { + // Define the current CFA rule to use the provided FP. + unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + + // Record the location of the stored LR + unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + + // Record the location of the stored FP + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } else { + // Encode the stack size of the leaf function. + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + + // Now emit the moves for whatever callee saved regs we have. + emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); + } +} + +static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +/// Checks whether the given instruction restores callee save registers +/// and if so returns how many. +static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { + unsigned RtIdx = 0; + switch (MI.getOpcode()) { + case AArch64::LDPXpost: + case AArch64::LDPDpost: + RtIdx = 1; + // FALLTHROUGH + case AArch64::LDPXi: + case AArch64::LDPDi: + if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || + !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || + MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) + return 0; + return 2; + } + return 0; +} + +void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL; + bool IsTailCallReturn = false; + if (MBB.end() != MBBI) { + DL = MBBI->getDebugLoc(); + unsigned RetOpcode = MBBI->getOpcode(); + IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || + RetOpcode == AArch64::TCRETURNri; + } + int NumBytes = MFI->getStackSize(); + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. 
+ if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + + // Initial and residual are named for consistency with the prologue. Note that + // in the epilogue, the residual adjustment is executed first. + uint64_t ArgumentPopSize = 0; + if (IsTailCallReturn) { + MachineOperand &StackAdjust = MBBI->getOperand(1); + + // For a tail-call in a callee-pops-arguments environment, some or all of + // the stack may actually be in use for the call's arguments, this is + // calculated during LowerCall and consumed here... + ArgumentPopSize = StackAdjust.getImm(); + } else { + // ... otherwise the amount to pop is *all* of the argument space, + // conveniently stored in the MachineFunctionInfo by + // LowerFormalArguments. This will, of course, be zero for the C calling + // convention. + ArgumentPopSize = AFI->getArgumentStackToRestore(); + } + + // The stack frame should be like below, + // + // ---------------------- --- + // | | | + // | BytesInStackArgArea| CalleeArgStackSize + // | (NumReusableBytes) | (of tail call) + // | | --- + // | | | + // ---------------------| --- | + // | | | | + // | CalleeSavedReg | | | + // | (NumRestores * 8) | | | + // | | | | + // ---------------------| | NumBytes + // | | StackSize (StackAdjustUp) + // | LocalStackSize | | | + // | (covering callee | | | + // | args) | | | + // | | | | + // ---------------------- --- --- + // + // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize + // = StackSize + ArgumentPopSize + // + // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps + // it as the 2nd argument of AArch64ISD::TC_RETURN. + NumBytes += ArgumentPopSize; + + unsigned NumRestores = 0; + // Move past the restores of the callee-saved registers. + MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + MachineBasicBlock::iterator Begin = MBB.begin(); + while (LastPopI != Begin) { + --LastPopI; + unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); + NumRestores += Restores; + if (Restores == 0) { + ++LastPopI; + break; + } + } + NumBytes -= NumRestores * 8; + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + if (!hasFP(MF)) { + // If this was a redzone leaf function, we don't need to restore the + // stack pointer. + if (!canUseRedZone(MF)) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, + TII); + return; + } + + // Restore the original stack pointer. + // FIXME: Rather than doing the math here, we should instead just use + // non-post-indexed loads for the restores if we aren't actually going to + // be able to save any instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, + -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); +} + +/// getFrameIndexReference - Provide a base+offset reference to an FI slot for +/// debug info. It's the same as what we use for resolving the code-gen +/// references for now. FIXME: This can go wrong when references are +/// SP-relative and simple call frames aren't used. 
+int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + return resolveFrameIndexReference(MF, FI, FrameReg); +} + +int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, + bool PreferFP) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + int FPOffset = MFI->getObjectOffset(FI) + 16; + int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + // Use frame pointer to reference fixed objects. Use it for locals if + // there are VLAs or a dynamically realigned SP (and thus the SP isn't + // reliable as a base). Make sure useFPForScavengingIndex() does the + // right thing for the emergency spill slot. + bool UseFP = false; + if (AFI->hasStackFrame()) { + // Note: Keeping the following as multiple 'if' statements rather than + // merging to a single expression for readability. + // + // Argument access should always use the FP. + if (isFixed) { + UseFP = hasFP(MF); + } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF) && + !RegInfo->needsStackRealignment(MF)) { + // Use SP or FP, whichever gives us the best chance of the offset + // being in range for direct access. If the FPOffset is positive, + // that'll always be best, as the SP will be even further away. + // If the FPOffset is negative, we have to keep in mind that the + // available offset range for negative offsets is smaller than for + // positive ones. If we have variable sized objects, we're stuck with + // using the FP regardless, though, as the SP offset is unknown + // and we don't have a base pointer available. If an offset is + // available via the FP and the SP, use whichever is closest. + if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 || + (FPOffset >= -256 && Offset > -FPOffset)) + UseFP = true; + } + } + + assert((isFixed || !RegInfo->needsStackRealignment(MF) || !UseFP) && + "In the presence of dynamic stack pointer realignment, " + "non-argument objects cannot be accessed through the frame pointer"); + + if (UseFP) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + + // Use the base pointer if we have one. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + else { + FrameReg = AArch64::SP; + // If we're using the red zone for this function, the SP won't actually + // be adjusted, so the offsets will be negative. They're also all + // within range of the signed 9-bit immediate instructions. + if (canUseRedZone(MF)) + Offset -= AFI->getLocalStackSize(); + } + + return Offset; +} + +static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { + if (Reg != AArch64::LR) + return getKillRegState(true); + + // LR maybe referred to later by an @llvm.returnaddress intrinsic. 
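Stepping back to resolveFrameIndexReference() above, a sketch of its FP-versus-SP heuristic for a non-fixed object when a frame pointer exists and neither a base pointer nor realignment is in play (preferFPSketch() and the offsets are hypothetical):

#include <cassert>

// Hypothetical mirror of the FP-vs-SP condition for a local object.
static bool preferFPSketch(int FPOffset, int SPOffset, bool PreferFP,
                           bool HasVarSizedObjects) {
  return PreferFP || HasVarSizedObjects || FPOffset >= 0 ||
         (FPOffset >= -256 && SPOffset > -FPOffset);
}

int main() {
  assert(preferFPSketch(-48, 272, false, false));    // fp is closer: use fp
  assert(!preferFPSketch(-400, 304, false, false));  // fp offset outside the window: use sp
  return 0;
}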
+ bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); + bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); + return getKillRegState(LRKill); +} + +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + unsigned Count = CSI.size(); + DebugLoc DL; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + + for (unsigned i = 0; i < Count; i += 2) { + unsigned idx = Count - i - 2; + unsigned Reg1 = CSI[idx].getReg(); + unsigned Reg2 = CSI[idx + 1].getReg(); + // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI + // list to come in sorted by frame index so that we can issue the store + // pair instructions directly. Assert if we see anything otherwise. + // + // The order of the registers in the list is controlled by + // getCalleeSavedRegs(), so they will always be in-order, as well. + assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + "Out of order callee saved regs!"); + unsigned StrOpc; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); + // Issue sequence of non-sp increment and pi sp spills for cs regs. The + // first spill is a pre-increment that allocates the stack. + // For example: + // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x20, x19, [sp, #16] // addImm(+2) + // stp fp, lr, [sp, #32] // addImm(+4) + // Rationale: This sequence saves uop updates compared to a sequence of + // pre-increment spills like stp xi,xj,[sp,#-16]! + // Note: Similar rational and sequence for restores in epilog. + if (AArch64::GPR64RegClass.contains(Reg1)) { + assert(AArch64::GPR64RegClass.contains(Reg2) && + "Expected GPR64 callee-saved register pair!"); + // For first spill use pre-increment store. + if (i == 0) + StrOpc = AArch64::STPXpre; + else + StrOpc = AArch64::STPXi; + } else if (AArch64::FPR64RegClass.contains(Reg1)) { + assert(AArch64::FPR64RegClass.contains(Reg2) && + "Expected FPR64 callee-saved register pair!"); + // For first spill use pre-increment store. + if (i == 0) + StrOpc = AArch64::STPDpre; + else + StrOpc = AArch64::STPDi; + } else + llvm_unreachable("Unexpected callee saved register!"); + DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " + << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() + << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); + // Compute offset: i = 0 => offset = -Count; + // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. + const int Offset = (i == 0) ? 
-Count : i; + assert((Offset >= -64 && Offset <= 63) && + "Offset out of bounds for STP immediate"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); + if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) + MIB.addReg(AArch64::SP, RegState::Define); + + MBB.addLiveIn(Reg1); + MBB.addLiveIn(Reg2); + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) + .addReg(Reg1, getPrologueDeath(MF, Reg1)) + .addReg(AArch64::SP) + .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .setMIFlag(MachineInstr::FrameSetup); + } + return true; +} + +bool AArch64FrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + unsigned Count = CSI.size(); + DebugLoc DL; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + + if (MI != MBB.end()) + DL = MI->getDebugLoc(); + + for (unsigned i = 0; i < Count; i += 2) { + unsigned Reg1 = CSI[i].getReg(); + unsigned Reg2 = CSI[i + 1].getReg(); + // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI + // list to come in sorted by frame index so that we can issue the store + // pair instructions directly. Assert if we see anything otherwise. + assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && + "Out of order callee saved regs!"); + // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only + // the last load is sp-pi post-increment and de-allocates the stack: + // For example: + // ldp fp, lr, [sp, #32] // addImm(+4) + // ldp x20, x19, [sp, #16] // addImm(+2) + // ldp x22, x21, [sp], #48 // addImm(+6) + // Note: see comment in spillCalleeSavedRegisters() + unsigned LdrOpc; + + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); + if (AArch64::GPR64RegClass.contains(Reg1)) { + assert(AArch64::GPR64RegClass.contains(Reg2) && + "Expected GPR64 callee-saved register pair!"); + if (i == Count - 2) + LdrOpc = AArch64::LDPXpost; + else + LdrOpc = AArch64::LDPXi; + } else if (AArch64::FPR64RegClass.contains(Reg1)) { + assert(AArch64::FPR64RegClass.contains(Reg2) && + "Expected FPR64 callee-saved register pair!"); + if (i == Count - 2) + LdrOpc = AArch64::LDPDpost; + else + LdrOpc = AArch64::LDPDi; + } else + llvm_unreachable("Unexpected callee saved register!"); + DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " + << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() + << ", " << CSI[i + 1].getFrameIdx() << ")\n"); + + // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; + // etc. + const int Offset = (i == Count - 2) ? 
Count : Count - i - 2; + assert((Offset >= -64 && Offset <= 63) && + "Offset out of bounds for LDP immediate"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); + if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) + MIB.addReg(AArch64::SP, RegState::Define); + + MIB.addReg(Reg2, getDefRegState(true)) + .addReg(Reg1, getDefRegState(true)) + .addReg(AArch64::SP) + .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] + // where the factor * 8 is implicit + } + return true; +} + +void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + SmallVector<unsigned, 4> UnspilledCSGPRs; + SmallVector<unsigned, 4> UnspilledCSFPRs; + + // The frame record needs to be created by saving the appropriate registers + if (hasFP(MF)) { + SavedRegs.set(AArch64::FP); + SavedRegs.set(AArch64::LR); + } + + // Spill the BasePtr if it's used. Do this first thing so that the + // getCalleeSavedRegs() below will get the right answer. + if (RegInfo->hasBasePointer(MF)) + SavedRegs.set(RegInfo->getBaseRegister()); + + if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF)) + SavedRegs.set(AArch64::X9); + + // If any callee-saved registers are used, the frame cannot be eliminated. + unsigned NumGPRSpilled = 0; + unsigned NumFPRSpilled = 0; + bool ExtraCSSpill = false; + bool CanEliminateFrame = true; + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + + // Check pairs of consecutive callee-saved registers. + for (unsigned i = 0; CSRegs[i]; i += 2) { + assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); + + const unsigned OddReg = CSRegs[i]; + const unsigned EvenReg = CSRegs[i + 1]; + assert((AArch64::GPR64RegClass.contains(OddReg) && + AArch64::GPR64RegClass.contains(EvenReg)) ^ + (AArch64::FPR64RegClass.contains(OddReg) && + AArch64::FPR64RegClass.contains(EvenReg)) && + "Register class mismatch!"); + + const bool OddRegUsed = SavedRegs.test(OddReg); + const bool EvenRegUsed = SavedRegs.test(EvenReg); + + // Early exit if none of the registers in the register pair is actually + // used. + if (!OddRegUsed && !EvenRegUsed) { + if (AArch64::GPR64RegClass.contains(OddReg)) { + UnspilledCSGPRs.push_back(OddReg); + UnspilledCSGPRs.push_back(EvenReg); + } else { + UnspilledCSFPRs.push_back(OddReg); + UnspilledCSFPRs.push_back(EvenReg); + } + continue; + } + + unsigned Reg = AArch64::NoRegister; + // If only one of the registers of the register pair is used, make sure to + // mark the other one as used as well. + if (OddRegUsed ^ EvenRegUsed) { + // Find out which register is the additional spill. + Reg = OddRegUsed ? 
EvenReg : OddReg; + SavedRegs.set(Reg); + } + + DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); + DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); + + assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || + (RegInfo->getEncodingValue(OddReg) + 1 == + RegInfo->getEncodingValue(EvenReg))) && + "Register pair of non-adjacent registers!"); + if (AArch64::GPR64RegClass.contains(OddReg)) { + NumGPRSpilled += 2; + // If it's not a reserved register, we can use it in lieu of an + // emergency spill slot for the register scavenger. + // FIXME: It would be better to instead keep looking and choose another + // unspilled register that isn't reserved, if there is one. + if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) + ExtraCSSpill = true; + } else + NumFPRSpilled += 2; + + CanEliminateFrame = false; + } + + // FIXME: Set BigStack if any stack slot references may be out of range. + // For now, just conservatively guestimate based on unscaled indexing + // range. We'll end up allocating an unnecessary spill slot a lot, but + // realistically that's not a big deal at this stage of the game. + // The CSR spill slots have not been allocated yet, so estimateStackSize + // won't include them. + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned CFSize = + MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); + bool BigStack = (CFSize >= 256); + if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) + AFI->setHasStackFrame(true); + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. If we already spilled an extra callee-saved register + // above to keep the number of spills even, we don't need to do anything else + // here. + if (BigStack && !ExtraCSSpill) { + + // If we're adding a register to spill here, we have to add two of them + // to keep the number of regs to spill even. + assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); + unsigned Count = 0; + while (!UnspilledCSGPRs.empty() && Count < 2) { + unsigned Reg = UnspilledCSGPRs.back(); + UnspilledCSGPRs.pop_back(); + DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) + << " to get a scratch register.\n"); + SavedRegs.set(Reg); + ExtraCSSpill = true; + ++Count; + } + + // If we didn't find an extra callee-saved register to spill, create + // an emergency spill slot. + if (!ExtraCSSpill) { + const TargetRegisterClass *RC = &AArch64::GPR64RegClass; + int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); + RS->addScavengingFrameIndex(FI); + DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI + << " as the emergency spill slot.\n"); + } + } +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h new file mode 100644 index 0000000..427afdf --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -0,0 +1,72 @@ +//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class AArch64FrameLowering : public TargetFrameLowering { +public: + explicit AArch64FrameLowering() + : TargetFrameLowering(StackGrowsDown, 16, 0, 16, + true /*StackRealignable*/) {} + + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + int resolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, + bool PreferFP = false) const; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + /// \brief Can this function use the red zone for local allocations. + bool canUseRedZone(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; + + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; + + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp new file mode 100644 index 0000000..6c86888 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -0,0 +1,3313 @@ +//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the AArch64 target. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" // To access function attributes. 
+#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-isel" + +//===--------------------------------------------------------------------===// +/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine +/// instructions for SelectionDAG operations. +/// +namespace { + +class AArch64DAGToDAGISel : public SelectionDAGISel { + + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + + bool ForCodeSize; + +public: + explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), + ForCodeSize(false) {} + + const char *getPassName() const override { + return "AArch64 Instruction Selection"; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + ForCodeSize = MF.getFunction()->optForSize(); + Subtarget = &MF.getSubtarget<AArch64Subtarget>(); + return SelectionDAGISel::runOnMachineFunction(MF); + } + + SDNode *Select(SDNode *Node) override; + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + unsigned ConstraintID, + std::vector<SDValue> &OutOps) override; + + SDNode *SelectMLAV64LaneV128(SDNode *N); + SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); + bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); + bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); + bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); + bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { + return SelectShiftedRegister(N, false, Reg, Shift); + } + bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { + return SelectShiftedRegister(N, true, Reg, Shift); + } + bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 16, Base, OffImm); + } + bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 16, Base, OffImm); + } + bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) { + return 
SelectAddrModeUnscaled(N, 1, Base, OffImm); + } + bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 2, Base, OffImm); + } + bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 4, Base, OffImm); + } + bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 8, Base, OffImm); + } + bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 16, Base, OffImm); + } + + template<int Width> + bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &SignExtend, SDValue &DoShift) { + return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift); + } + + template<int Width> + bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &SignExtend, SDValue &DoShift) { + return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); + } + + + /// Form sequences of consecutive 64/128-bit registers for use in NEON + /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have + /// between 1 and 4 elements. If it contains a single element that is returned + /// unchanged; otherwise a REG_SEQUENCE value is returned. + SDValue createDTuple(ArrayRef<SDValue> Vecs); + SDValue createQTuple(ArrayRef<SDValue> Vecs); + + /// Generic helper for the createDTuple/createQTuple + /// functions. Those should almost always be called instead. + SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[], + const unsigned SubRegs[]); + + SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); + + SDNode *SelectIndexedLoad(SDNode *N, bool &Done); + + SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); + SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); + SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + + SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + + SDNode *SelectBitfieldExtractOp(SDNode *N); + SDNode *SelectBitfieldInsertOp(SDNode *N); + SDNode *SelectBitfieldInsertInZeroOp(SDNode *N); + + SDNode *SelectReadRegister(SDNode *N); + SDNode *SelectWriteRegister(SDNode *N); + +// Include the pieces autogenerated from the target description. 
+#include "AArch64GenDAGISel.inc" + +private: + bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, + SDValue &Shift); + bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base, + SDValue &Offset, SDValue &SignExtend, + SDValue &DoShift); + bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, + SDValue &Offset, SDValue &SignExtend, + SDValue &DoShift); + bool isWorthFolding(SDValue V) const; + bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, + SDValue &Offset, SDValue &SignExtend); + + template<unsigned RegWidth> + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { + return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); + } + + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); +}; +} // end anonymous namespace + +/// isIntImmediate - This method tests to see if the node is a constant +/// operand. If so Imm will receive the 32-bit value. +static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { + if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) { + Imm = C->getZExtValue(); + return true; + } + return false; +} + +// isIntImmediate - This method tests to see if a constant operand. +// If so Imm will receive the value. +static bool isIntImmediate(SDValue N, uint64_t &Imm) { + return isIntImmediate(N.getNode(), Imm); +} + +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has a immediate integer right operand. +// If so Imm will receive the 32 bit value. +static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, + uint64_t &Imm) { + return N->getOpcode() == Opc && + isIntImmediate(N->getOperand(1).getNode(), Imm); +} + +bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { + switch(ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + case InlineAsm::Constraint_Q: + // Require the address to be in a register. That is safe for all AArch64 + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); + return false; + } + return true; +} + +/// SelectArithImmed - Select an immediate value that can be represented as +/// a 12-bit value shifted left by either 0 or 12. If so, return true with +/// Val set to the 12-bit value and Shift set to the shifter operand. +bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val, + SDValue &Shift) { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcode it's interested in, however + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. 
+ if (!isa<ConstantSDNode>(N.getNode())) + return false; + + uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); + unsigned ShiftAmt; + + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return false; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + SDLoc dl(N); + Val = CurDAG->getTargetConstant(Immed, dl, MVT::i32); + Shift = CurDAG->getTargetConstant(ShVal, dl, MVT::i32); + return true; +} + +/// SelectNegArithImmed - As above, but negates the value before trying to +/// select it. +bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val, + SDValue &Shift) { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcode it's interested in, however + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. + if (!isa<ConstantSDNode>(N.getNode())) + return false; + + // The immediate operand must be a 24-bit zero-extended immediate. + uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); + + // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" + // have the opposite effect on the C flag, so this pattern mustn't match under + // those circumstances. + if (Immed == 0) + return false; + + if (N.getValueType() == MVT::i32) + Immed = ~((uint32_t)Immed) + 1; + else + Immed = ~Immed + 1ULL; + if (Immed & 0xFFFFFFFFFF000000ULL) + return false; + + Immed &= 0xFFFFFFULL; + return SelectArithImmed(CurDAG->getConstant(Immed, SDLoc(N), MVT::i32), Val, + Shift); +} + +/// getShiftTypeForNode - Translate a shift node to the corresponding +/// ShiftType value. +static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { + switch (N.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case ISD::SHL: + return AArch64_AM::LSL; + case ISD::SRL: + return AArch64_AM::LSR; + case ISD::SRA: + return AArch64_AM::ASR; + case ISD::ROTR: + return AArch64_AM::ROR; + } +} + +/// \brief Determine whether it is worth to fold V into an extended register. +bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { + // it hurts if the value is used at least twice, unless we are optimizing + // for code size. + if (ForCodeSize || V.hasOneUse()) + return true; + return false; +} + +/// SelectShiftedRegister - Select a "shifted register" operand. If the value +/// is not shifted, set the Shift operand to default of "LSL 0". The logical +/// instructions allow the shifted register to be rotated, but the arithmetic +/// instructions do not. The AllowROR parameter specifies whether ROR is +/// supported. 
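An illustrative source-level example of the shifted-register folding this helper enables (exact assembly depends on compiler version); logical instructions take the same path with AllowROR set to true, so a single-use rotate may be folded there as well.

// Typically selects to a single "add x0, x0, x1, lsl #3": the single-use shl
// becomes the shifted-register operand of the add (sketch, not guaranteed).
unsigned long long addShifted(unsigned long long a, unsigned long long b) {
  return a + (b << 3);
}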
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, + SDValue &Reg, SDValue &Shift) { + AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); + if (ShType == AArch64_AM::InvalidShiftExtend) + return false; + if (!AllowROR && ShType == AArch64_AM::ROR) + return false; + + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + unsigned BitSize = N.getValueType().getSizeInBits(); + unsigned Val = RHS->getZExtValue() & (BitSize - 1); + unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); + + Reg = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32); + return isWorthFolding(N); + } + + return false; +} + +/// getExtendTypeForNode - Translate an extend node to the corresponding +/// ExtendType value. +static AArch64_AM::ShiftExtendType +getExtendTypeForNode(SDValue N, bool IsLoadStore = false) { + if (N.getOpcode() == ISD::SIGN_EXTEND || + N.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT SrcVT; + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG) + SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT(); + else + SrcVT = N.getOperand(0).getValueType(); + + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::SXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::SXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::SXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::ZERO_EXTEND || + N.getOpcode() == ISD::ANY_EXTEND) { + EVT SrcVT = N.getOperand(0).getValueType(); + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::UXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::UXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::UXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::AND) { + ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!CSD) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = CSD->getZExtValue(); + + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; + case 0xFFFF: + return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } + } + + return AArch64_AM::InvalidShiftExtend; +} + +// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts. +static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { + if (DL->getOpcode() != AArch64ISD::DUPLANE16 && + DL->getOpcode() != AArch64ISD::DUPLANE32) + return false; + + SDValue SV = DL->getOperand(0); + if (SV.getOpcode() != ISD::INSERT_SUBVECTOR) + return false; + + SDValue EV = SV.getOperand(1); + if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + + ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode()); + ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode()); + LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue(); + LaneOp = EV.getOperand(0); + + return true; +} + +// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a +// high lane extract. 
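Connecting getExtendTypeForNode() above to source code (illustrative functions, exact assembly may vary): masking with 0xFF or widening a signed 16-bit value before an add is what typically becomes the UXTB / SXTH extended-register operand forms.

// Typically becomes "add x0, x0, w1, uxtb" / "add x0, x0, w1, sxth" via the
// AND-mask and sign-extend cases handled above (sketch, not guaranteed).
unsigned long long addByte(unsigned long long a, unsigned char b) {
  return a + (b & 0xffu);  // zero-extend from i8: UXTB
}
long long addShort(long long a, short b) {
  return a + b;            // sign-extend from i16: SXTH
}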
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, + SDValue &LaneOp, int &LaneIdx) { + + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) { + std::swap(Op0, Op1); + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) + return false; + } + StdOp = Op1; + return true; +} + +/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand +/// is a lane in the upper half of a 128-bit vector. Recognize and select this +/// so that we don't emit unnecessary lane extracts. +SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { + SDLoc dl(N); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. + SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. + int LaneIdx = -1; // Will hold the lane index. + + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) { + std::swap(Op0, Op1); + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) + return nullptr; + } + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64); + + SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; + + unsigned MLAOpc = ~0U; + + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized MLA."); + case MVT::v4i16: + MLAOpc = AArch64::MLAv4i16_indexed; + break; + case MVT::v8i16: + MLAOpc = AArch64::MLAv8i16_indexed; + break; + case MVT::v2i32: + MLAOpc = AArch64::MLAv2i32_indexed; + break; + case MVT::v4i32: + MLAOpc = AArch64::MLAv4i32_indexed; + break; + } + + return CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { + SDLoc dl(N); + SDValue SMULLOp0; + SDValue SMULLOp1; + int LaneIdx; + + if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, + LaneIdx)) + return nullptr; + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64); + + SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; + + unsigned SMULLOpc = ~0U; + + if (IntNo == Intrinsic::aarch64_neon_smull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized SMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::SMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::SMULLv2i32_indexed; + break; + } + } else if (IntNo == Intrinsic::aarch64_neon_umull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized SMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::UMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::UMULLv2i32_indexed; + break; + } + } else + llvm_unreachable("Unrecognized intrinsic."); + + return CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops); +} + +/// Instructions that accept extend modifiers like UXTW expect the register +/// being extended to be a GPR32, but the incoming DAG might be acting on a +/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if +/// this is the case. 
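Looking back at SelectMLAV64LaneV128() above, the pattern it recognizes is roughly what the following ACLE intrinsics produce, where a lane of the high half of a 128-bit vector feeds an MLA; with this selection the high-half extract can fold into the lane index (sketch only, requires an AArch64 target, and codegen may differ by compiler version).

#include <arm_neon.h>

// Intended to select to one "mla v0.4h, v1.4h, v2.h[5]" instead of an
// explicit extract of the high half followed by an indexed MLA (sketch).
int16x4_t mla_high_lane(int16x4_t acc, int16x4_t x, int16x8_t y) {
  return vmla_lane_s16(acc, x, vget_high_s16(y), 1);  // lane 4 + 1 == 5 overall
}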
+static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) { + if (N.getValueType() == MVT::i32) + return N; + + SDLoc dl(N); + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); + MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + dl, MVT::i32, N, SubReg); + return SDValue(Node, 0); +} + + +/// SelectArithExtendedRegister - Select a "extended register" operand. This +/// operand folds in an extend followed by an optional left shift. +bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, + SDValue &Shift) { + unsigned ShiftVal = 0; + AArch64_AM::ShiftExtendType Ext; + + if (N.getOpcode() == ISD::SHL) { + ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!CSD) + return false; + ShiftVal = CSD->getZExtValue(); + if (ShiftVal > 4) + return false; + + Ext = getExtendTypeForNode(N.getOperand(0)); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Reg = N.getOperand(0).getOperand(0); + } else { + Ext = getExtendTypeForNode(N); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Reg = N.getOperand(0); + } + + // AArch64 mandates that the RHS of the operation must use the smallest + // register class that could contain the size being extended from. Thus, + // if we're folding a (sext i8), we need the RHS to be a GPR32, even though + // there might not be an actual 32-bit value in the program. We can + // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. + assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX); + Reg = narrowIfNeeded(CurDAG, Reg); + Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N), + MVT::i32); + return isWorthFolding(N); +} + +/// If there's a use of this ADDlow that's not itself a load/store then we'll +/// need to create a real ADD instruction from it anyway and there's no point in +/// folding it into the mem op. Theoretically, it shouldn't matter, but there's +/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding +/// leads to duplicated ADRP instructions. +static bool isWorthFoldingADDlow(SDValue N) { + for (auto Use : N->uses()) { + if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && + Use->getOpcode() != ISD::ATOMIC_LOAD && + Use->getOpcode() != ISD::ATOMIC_STORE) + return false; + + // ldar and stlr have much more restrictive addressing modes (just a + // register). + if (cast<MemSDNode>(Use)->getOrdering() > Monotonic) + return false; + } + + return true; +} + +/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale. +bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; + } + + // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed + // selected here doesn't support labels/immediates, only base+offset. 
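Spelled out for the 64-bit case (Size = 8, Scale = 3), the window accepted just below is byte offsets that are multiples of 8 in [-512, 504]; fits7BitSignedScaled() below is a hypothetical restatement of that check.

#include <cassert>

// Hypothetical restatement of the signed 7-bit, scaled-by-Size window.
static bool fits7BitSignedScaled(long long Off, unsigned Size, unsigned Scale) {
  return (Off % Size) == 0 && Off >= -(0x40LL << Scale) && Off < (0x40LL << Scale);
}

int main() {
  assert(fits7BitSignedScaled(496, 8, 3));   // imm7 == 496 / 8 == 62
  assert(fits7BitSignedScaled(-512, 8, 3));  // imm7 == -64
  assert(!fits7BitSignedScaled(512, 8, 3));  // one past the top of the window
  assert(!fits7BitSignedScaled(20, 8, 3));   // not a multiple of the access size
  return 0;
}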
+ + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) && + RHSC < (0x40 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64); + return true; + } + } + } + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // stp x1, x2, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; +} + +/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale. +bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, + SDValue &Base, SDValue &OffImm) { + SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; + } + + if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) { + GlobalAddressSDNode *GAN = + dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode()); + Base = N.getOperand(0); + OffImm = N.getOperand(1); + if (!GAN) + return true; + + const GlobalValue *GV = GAN->getGlobal(); + unsigned Alignment = GV->getAlignment(); + Type *Ty = GV->getType()->getElementType(); + if (Alignment == 0 && Ty->isSized()) + Alignment = DL.getABITypeAlignment(Ty); + + if (Alignment >= Size) + return true; + } + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64); + return true; + } + } + } + + // Before falling back to our general case, check if the unscaled + // instructions can handle this. If so, that's preferable. + if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) + return false; + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // ldr x0, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; +} + +/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit +/// immediate" address. This should only match when there is an offset that +/// is not valid for a scaled immediate addressing mode. The "Size" argument +/// is the size in bytes of the memory reference, which is needed here to know +/// what is valid for a scaled immediate. 
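+/// For example, an 8-byte load at offset #12 cannot use the scaled form (12 is
+/// not a multiple of 8) but fits the signed 9-bit range, so it is selected as
+/// the unscaled "ldur x0, [x1, #12]".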
+bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + // If the offset is valid as a scaled immediate, don't match here. + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && + RHSC < (0x1000 << Log2_32(Size))) + return false; + if (RHSC >= -256 && RHSC < 256) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + const TargetLowering *TLI = getTargetLowering(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i64); + return true; + } + } + return false; +} + +static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { + SDLoc dl(N); + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); + SDValue ImpDef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::i64), 0); + MachineSDNode *Node = CurDAG->getMachineNode( + TargetOpcode::INSERT_SUBREG, dl, MVT::i64, ImpDef, N, SubReg); + return SDValue(Node, 0); +} + +/// \brief Check if the given SHL node (\p N), can be used to form an +/// extended register for an addressing mode. +bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, + bool WantExtend, SDValue &Offset, + SDValue &SignExtend) { + assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); + ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue()) + return false; + + SDLoc dl(N); + if (WantExtend) { + AArch64_AM::ShiftExtendType Ext = + getExtendTypeForNode(N.getOperand(0), true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, + MVT::i32); + } else { + Offset = N.getOperand(0); + SignExtend = CurDAG->getTargetConstant(0, dl, MVT::i32); + } + + unsigned LegalShiftVal = Log2_32(Size); + unsigned ShiftVal = CSD->getZExtValue(); + + if (ShiftVal != 0 && ShiftVal != LegalShiftVal) + return false; + + if (isWorthFolding(N)) + return true; + + return false; +} + +bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); + SDLoc dl(N); + + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa<MemSDNode>(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. 
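+  // e.g. for an 8-byte access, something like (add X1, (shl (sext W2), 3))
+  // can fold into the register-offset form "ldr x0, [x1, w2, sxtw #3]".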
+ if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, dl, MVT::i32); + return true; + } + + // There was no shift, whatever else we find. + DoShift = CurDAG->getTargetConstant(false, dl, MVT::i32); + + AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend; + // Try to match an unshifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(LHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = RHS; + Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, + MVT::i32); + if (isWorthFolding(LHS)) + return true; + } + + // Try to match an unshifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(RHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = LHS; + Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl, + MVT::i32); + if (isWorthFolding(RHS)) + return true; + } + + return false; +} + +// Check if the given immediate is preferred by ADD. If an immediate can be +// encoded in an ADD, or it can be encoded in an "ADD LSL #12" and can not be +// encoded by one MOVZ, return true. +static bool isPreferredADD(int64_t ImmOff) { + // Constant in [0x0, 0xfff] can be encoded in ADD. + if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) + return true; + // Check if it can be encoded in an "ADD LSL #12". + if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL) + // As a single MOVZ is faster than a "ADD of LSL #12", ignore such constant. + return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && + (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; + return false; +} + +bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); + SDLoc DL(N); + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa<MemSDNode>(*UI)) + return false; + } + + // Watch out if RHS is a wide immediate, it can not be selected into + // [BaseReg+Imm] addressing mode. Also it may not be able to be encoded into + // ADD/SUB. Instead it will use [BaseReg + 0] address mode and generate + // instructions like: + // MOV X0, WideImmediate + // ADD X1, BaseReg, X0 + // LDR X2, [X1, 0] + // For such situation, using [BaseReg, XReg] addressing mode can save one + // ADD/SUB: + // MOV X0, WideImmediate + // LDR X2, [BaseReg, X0] + if (isa<ConstantSDNode>(RHS)) { + int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue(); + unsigned Scale = Log2_32(Size); + // Skip the immediate can be selected by load/store addressing mode. + // Also skip the immediate can be encoded by a single ADD (SUB is also + // checked by using -ImmOff). 
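+  // e.g. 0x456 (plain ADD) and 0x456000 (ADD ..., LSL #12) count as preferred
+  // ADD immediates, while 0x230000 does not: it fits ADD LSL #12 but a single
+  // "movz x0, #0x23, lsl #16" is cheaper, so it falls through to [Reg+Reg].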
+ if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) || + isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) + return false; + + SDValue Ops[] = { RHS }; + SDNode *MOVI = + CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops); + SDValue MOVIV = SDValue(MOVI, 0); + // This ADD of two X register will be selected into [Reg+Reg] mode. + N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV); + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, DL, MVT::i32); + return true; + } + + // Match any non-shifted, non-extend, non-immediate add expression. + Base = LHS; + Offset = RHS; + SignExtend = CurDAG->getTargetConstant(false, DL, MVT::i32); + DoShift = CurDAG->getTargetConstant(false, DL, MVT::i32); + // Reg1 + Reg2 is free: no check needed. + return true; +} + +SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) { + static const unsigned RegClassIDs[] = { + AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; + static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3}; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) { + static const unsigned RegClassIDs[] = { + AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; + static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3}; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs, + const unsigned RegClassIDs[], + const unsigned SubRegs[]) { + // There's no special register-class for a vector-list of 1 element: it's just + // a vector. + if (Regs.size() == 1) + return Regs[0]; + + assert(Regs.size() >= 2 && Regs.size() <= 4); + + SDLoc DL(Regs[0]); + + SmallVector<SDValue, 4> Ops; + + // First operand of REG_SEQUENCE is the desired RegClass. + Ops.push_back( + CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], DL, MVT::i32)); + + // Then we get pairs of source & subregister-position for the components. + for (unsigned i = 0; i < Regs.size(); ++i) { + Ops.push_back(Regs[i]); + Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], DL, MVT::i32)); + } + + SDNode *N = + CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); + return SDValue(N, 0); +} + +SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, + unsigned Opc, bool isExt) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + + unsigned ExtOff = isExt; + + // Form a REG_SEQUENCE to force register allocation. 
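+  // e.g. a three-register TBL consumes its table as a QQQ tuple built from
+  // qsub0..qsub2, so the register allocator keeps the sources consecutive.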
+ unsigned Vec0Off = ExtOff + 1; + SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off, + N->op_begin() + Vec0Off + NumVecs); + SDValue RegSeq = createQTuple(Regs); + + SmallVector<SDValue, 6> Ops; + if (isExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); + return CurDAG->getMachineNode(Opc, dl, VT, Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { + LoadSDNode *LD = cast<LoadSDNode>(N); + if (LD->isUnindexed()) + return nullptr; + EVT VT = LD->getMemoryVT(); + EVT DstVT = N->getValueType(0); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; + + // We're not doing validity checking here. That was done when checking + // if we should mark the load as indexed or not. We're just selecting + // the right instruction. + unsigned Opcode = 0; + + ISD::LoadExtType ExtType = LD->getExtensionType(); + bool InsertTo64 = false; + if (VT == MVT::i64) + Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost; + else if (VT == MVT::i32) { + if (ExtType == ISD::NON_EXTLOAD) + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + else if (ExtType == ISD::SEXTLOAD) + Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost; + else { + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + InsertTo64 = true; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i16) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost; + else + Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost; + } else { + Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i8) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost; + else + Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost; + } else { + Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::f16) { + Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; + } else if (VT == MVT::f32) { + Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost; + } else if (VT == MVT::f64 || VT.is64BitVector()) { + Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost; + } else if (VT.is128BitVector()) { + Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost; + } else + return nullptr; + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset()); + int OffsetVal = (int)OffsetOp->getZExtValue(); + SDLoc dl(N); + SDValue Offset = CurDAG->getTargetConstant(OffsetVal, dl, MVT::i64); + SDValue Ops[] = { Base, Offset, Chain }; + SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT, + MVT::Other, Ops); + // Either way, we're replacing the node, so tell the caller that. 
+ Done = true; + SDValue LoadedVal = SDValue(Res, 1); + if (InsertTo64) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); + LoadedVal = + SDValue(CurDAG->getMachineNode( + AArch64::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), LoadedVal, + SubReg), + 0); + } + + ReplaceUses(SDValue(N, 0), LoadedVal); + ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); + ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); + + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SDValue Ops[] = {N->getOperand(2), // Mem operand; + Chain}; + + const EVT ResTys[] = {MVT::Untyped, MVT::Other}; + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SDValue Ops[] = {N->getOperand(1), // Mem operand + N->getOperand(2), // Incremental + Chain}; + + const EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Untyped, MVT::Other}; + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) + ReplaceUses(SDValue(N, 0), SuperReg); + else + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + // Update the chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)}; + SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); + + return St; +} + +SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + const EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Other}; // Type for the Chain + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SDValue Ops[] = {RegSeq, + N->getOperand(NumVecs + 1), // base register + N->getOperand(NumVecs + 2), // Incremental + N->getOperand(0)}; // Chain + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + return St; +} + +namespace { +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. 
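+/// For example, a v2i32 value in D1 becomes a v4i32 value whose dsub
+/// subregister is D1: it is inserted into an IMPLICIT_DEF of the wider type,
+/// leaving the upper 64 bits undefined.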
+class WidenVector { + SelectionDAG &DAG; + +public: + WidenVector(SelectionDAG &DAG) : DAG(DAG) {} + + SDValue operator()(SDValue V64Reg) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + SDValue Undef = + SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); + return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg); + } +}; +} // namespace + +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. +static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + + return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy, + V128Reg); +} + +SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + const EVT ResTys[] = {MVT::Untyped, MVT::Other}; + + unsigned LaneNo = + cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue(); + + SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64), + N->getOperand(NumVecs + 3), N->getOperand(0)}; + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } + + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + + return Ld; +} + +SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + const EVT ResTys[] = {MVT::i64, // Type of the write back register + RegSeq->getValueType(0), MVT::Other}; + + unsigned LaneNo = + cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue(); + + SDValue Ops[] = {RegSeq, + CurDAG->getTargetConstant(LaneNo, dl, + MVT::i64), // Lane Number + N->getOperand(NumVecs + 2), // Base register + N->getOperand(NumVecs + 3), // Incremental + N->getOperand(0)}; + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of the write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of the vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) { + ReplaceUses(SDValue(N, 0), + Narrow ? 
NarrowVector(SuperReg, *CurDAG) : SuperReg); + } else { + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, + SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } + } + + // Update the Chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); + + return Ld; +} + +SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + unsigned LaneNo = + cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue(); + + SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64), + N->getOperand(NumVecs + 3), N->getOperand(0)}; + SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + +SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + const EVT ResTys[] = {MVT::i64, // Type of the write back register + MVT::Other}; + + unsigned LaneNo = + cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue(); + + SDValue Ops[] = {RegSeq, CurDAG->getTargetConstant(LaneNo, dl, MVT::i64), + N->getOperand(NumVecs + 2), // Base Register + N->getOperand(NumVecs + 3), // Incremental + N->getOperand(0)}; + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + +static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, + unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits, + bool BiggerPattern) { + assert(N->getOpcode() == ISD::AND && + "N must be a AND operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // FIXME: simplify-demanded-bits in DAGCombine will probably have + // changed the AND node to a 32-bit mask operation. We'll have to + // undo that as part of the transform here if we want to catch all + // the opportunities. 
+ // Currently the NumberOfIgnoredLowBits argument helps to recover + // form these situations when matching bigger pattern (bitfield insert). + + // For unsigned extracts, check for a shift right and mask + uint64_t And_imm = 0; + if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) + return false; + + const SDNode *Op0 = N->getOperand(0).getNode(); + + // Because of simplify-demanded-bits in DAGCombine, the mask may have been + // simplified. Try to undo that + And_imm |= (1 << NumberOfIgnoredLowBits) - 1; + + // The immediate is a mask of the low bits iff imm & (imm+1) == 0 + if (And_imm & (And_imm + 1)) + return false; + + bool ClampMSB = false; + uint64_t Srl_imm = 0; + // Handle the SRL + ANY_EXTEND case. + if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { + // Extend the incoming operand of the SRL to 64-bit. + Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); + // Make sure to clamp the MSB so that we preserve the semantics of the + // original operations. + ClampMSB = true; + } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, + Srl_imm)) { + // If the shift result was truncated, we can still combine them. + Opd0 = Op0->getOperand(0).getOperand(0); + + // Use the type of SRL node. + VT = Opd0->getValueType(0); + } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { + Opd0 = Op0->getOperand(0); + } else if (BiggerPattern) { + // Let's pretend a 0 shift right has been performed. + // The resulting code will be at least as good as the original one + // plus it may expose more opportunities for bitfield insert pattern. + // FIXME: Currently we limit this to the bigger pattern, because + // some optimizations expect AND and not UBFM. + Opd0 = N->getOperand(0); + } else + return false; + + // Bail out on large immediates. This happens when no proper + // combining/constant folding was performed. + if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) { + DEBUG((dbgs() << N + << ": Found large shift immediate, this should not happen\n")); + return false; + } + + LSB = Srl_imm; + MSB = Srl_imm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(And_imm) + : countTrailingOnes<uint64_t>(And_imm)) - + 1; + if (ClampMSB) + // Since we're moving the extend before the right shift operation, we need + // to clamp the MSB to make sure we don't shift in undefined bits instead of + // the zeros which would get shifted in with the original right shift + // operation. + MSB = MSB > 31 ? 31 : MSB; + + Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri; + return true; +} + +static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &LSB, + unsigned &MSB) { + // We are looking for the following pattern which basically extracts several + // continuous bits from the source value and places it from the LSB of the + // destination value, all other bits of the destination value or set to zero: + // + // Value2 = AND Value, MaskImm + // SRL Value2, ShiftImm + // + // with MaskImm >> ShiftImm to search for the bit width. 
+ // + // This gets selected into a single UBFM: + // + // UBFM Value, ShiftImm, BitWide + Srl_imm -1 + // + + if (N->getOpcode() != ISD::SRL) + return false; + + uint64_t And_mask = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) + return false; + + Opd0 = N->getOperand(0).getOperand(0); + + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + // Check whether we really have several bits extract here. + unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm)); + if (BitWide && isMask_64(And_mask >> Srl_imm)) { + if (N->getValueType(0) == MVT::i32) + Opc = AArch64::UBFMWri; + else + Opc = AArch64::UBFMXri; + + LSB = Srl_imm; + MSB = BitWide + Srl_imm - 1; + return true; + } + + return false; +} + +static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &Immr, unsigned &Imms, + bool BiggerPattern) { + assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && + "N must be a SHR/SRA operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // Check for AND + SRL doing several bits extract. + if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms)) + return true; + + // we're looking for a shift of a shift + uint64_t Shl_imm = 0; + uint64_t Trunc_bits = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + Opd0 = N->getOperand(0).getOperand(0); + } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && + N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { + // We are looking for a shift of truncate. Truncate from i64 to i32 could + // be considered as setting high 32 bits as zero. Our strategy here is to + // always generate 64bit UBFM. This consistency will help the CSE pass + // later find more redundancy. + Opd0 = N->getOperand(0).getOperand(0); + Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); + VT = Opd0->getValueType(0); + assert(VT == MVT::i64 && "the promoted type should be i64"); + } else if (BiggerPattern) { + // Let's pretend a 0 shift left has been performed. + // FIXME: Currently we limit this to the bigger pattern case, + // because some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + // Missing combines/constant folding may have left us with strange + // constants. + if (Shl_imm >= VT.getSizeInBits()) { + DEBUG((dbgs() << N + << ": Found large shift immediate, this should not happen\n")); + return false; + } + + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && + "bad amount in shift node!"); + int immr = Srl_imm - Shl_imm; + Immr = immr < 0 ? immr + VT.getSizeInBits() : immr; + Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1; + // SRA requires a signed extraction + if (VT == MVT::i32) + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; + else + Opc = N->getOpcode() == ISD::SRA ? 
AArch64::SBFMXri : AArch64::UBFMXri; + return true; +} + +static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &Immr, unsigned &Imms, + unsigned NumberOfIgnoredLowBits = 0, + bool BiggerPattern = false) { + if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) + return false; + + switch (N->getOpcode()) { + default: + if (!N->isMachineOpcode()) + return false; + break; + case ISD::AND: + return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, Immr, Imms, + NumberOfIgnoredLowBits, BiggerPattern); + case ISD::SRL: + case ISD::SRA: + return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern); + } + + unsigned NOpc = N->getMachineOpcode(); + switch (NOpc) { + default: + return false; + case AArch64::SBFMWri: + case AArch64::UBFMWri: + case AArch64::SBFMXri: + case AArch64::UBFMXri: + Opc = NOpc; + Opd0 = N->getOperand(0); + Immr = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + Imms = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + return true; + } + // Unreachable + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { + unsigned Opc, Immr, Imms; + SDValue Opd0; + if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms)) + return nullptr; + + EVT VT = N->getValueType(0); + SDLoc dl(N); + + // If the bit extract operation is 64bit but the original type is 32bit, we + // need to add one EXTRACT_SUBREG. + if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) { + SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, MVT::i64), + CurDAG->getTargetConstant(Imms, dl, MVT::i64)}; + + SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64); + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); + MachineSDNode *Node = + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32, + SDValue(BFM, 0), SubReg); + return Node; + } + + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), + CurDAG->getTargetConstant(Imms, dl, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +/// Does DstMask form a complementary pair with the mask provided by +/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, +/// this asks whether DstMask zeroes precisely those bits that will be set by +/// the other half. +static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, + unsigned NumberOfIgnoredHighBits, EVT VT) { + assert((VT == MVT::i32 || VT == MVT::i64) && + "i32 or i64 mask type expected!"); + unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; + + APInt SignificantDstMask = APInt(BitWidth, DstMask); + APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); + + return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && + (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); +} + +// Look for bits that will be useful for later uses. +// A bit is consider useless as soon as it is dropped and never used +// before it as been dropped. +// E.g., looking for useful bit of x +// 1. y = x & 0x7 +// 2. z = y >> 2 +// After #1, x useful bits are 0x7, then the useful bits of x, live through +// y. +// After #2, the useful bits of x are 0x4. +// However, if x is used on an unpredicatable instruction, then all its bits +// are useful. +// E.g. +// 1. y = x & 0x7 +// 2. z = y >> 2 +// 3. 
str x, [@x] +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0); + +static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue(); + Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth()); + UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm); + getUsefulBits(Op, UsefulBits, Depth + 1); +} + +static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, + uint64_t Imm, uint64_t MSB, + unsigned Depth) { + // inherit the bitwidth value + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + // The interesting part will be in the lower part of the result + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was starting at Imm in the argument + OpUsefulBits = OpUsefulBits.shl(Imm); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + // The interesting part will be shifted in the result + OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was at zero in the argument + OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); + } + + UsefulBits &= OpUsefulBits; +} + +static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue(); + uint64_t MSB = + cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue(); + + getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); +} + +static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t ShiftTypeAndValue = + cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue(); + APInt Mask(UsefulBits); + Mask.clearAllBits(); + Mask.flipAllBits(); + + if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) { + // Shift Left + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.shl(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.lshr(ShiftAmt); + } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) { + // Shift Right + // We do not handle AArch64_AM::ASR, because the sign will change the + // number of useful bits + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.lshr(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.shl(ShiftAmt); + } else + return; + + UsefulBits &= Mask; +} + +static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue(); + uint64_t MSB = + cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue(); + + if (Op.getOperand(1) == Orig) + return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); + + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + UsefulBits &= ~OpUsefulBits; + getUsefulBits(Op, UsefulBits, Depth + 1); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm)); + getUsefulBits(Op, UsefulBits, Depth + 1); + } +} + +static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, + SDValue 
Orig, unsigned Depth) { + + // Users of this node should have already been instruction selected + // FIXME: Can we turn that into an assert? + if (!UserNode->isMachineOpcode()) + return; + + switch (UserNode->getMachineOpcode()) { + default: + return; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + // We increment Depth only when we call the getUsefulBits + return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::UBFMWri: + case AArch64::UBFMXri: + return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth); + + case AArch64::ORRWrs: + case AArch64::ORRXrs: + if (UserNode->getOperand(1) != Orig) + return; + return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::BFMWri: + case AArch64::BFMXri: + return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); + } +} + +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { + if (Depth >= 6) + return; + // Initialize UsefulBits + if (!Depth) { + unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits(); + // At the beginning, assume every produced bits is useful + UsefulBits = APInt(Bitwidth, 0); + UsefulBits.flipAllBits(); + } + APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); + + for (SDNode *Node : Op.getNode()->uses()) { + // A use cannot produce useful bits + APInt UsefulBitsForUse = APInt(UsefulBits); + getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); + UsersUsefulBits |= UsefulBitsForUse; + } + // UsefulBits contains the produced bits that are meaningful for the + // current definition, thus a user cannot make a bit meaningful at + // this point + UsefulBits &= UsersUsefulBits; +} + +/// Create a machine node performing a notional SHL of Op by ShlAmount. If +/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is +/// 0, return Op unchanged. +static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { + if (ShlAmount == 0) + return Op; + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned BitWidth = VT.getSizeInBits(); + unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri; + + SDNode *ShiftNode; + if (ShlAmount > 0) { + // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, dl, VT, Op, + CurDAG->getTargetConstant(BitWidth - ShlAmount, dl, VT), + CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, dl, VT)); + } else { + // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 + assert(ShlAmount < 0 && "expected right shift"); + int ShrAmount = -ShlAmount; + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, dl, VT, Op, CurDAG->getTargetConstant(ShrAmount, dl, VT), + CurDAG->getTargetConstant(BitWidth - 1, dl, VT)); + } + + return SDValue(ShiftNode, 0); +} + +/// Does this tree qualify as an attempt to move a bitfield into position, +/// essentially "(and (shl VAL, N), Mask)". 
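+/// For example, (and (shl x, 3), 0x1f8) places the low 6 bits of x at bit
+/// position 3, which corresponds to a UBFIZ on its own or folds into a BFI
+/// when matched as part of a bigger pattern.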
+static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + bool BiggerPattern, + SDValue &Src, int &ShiftAmount, + int &MaskWidth) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + (void)BitWidth; + assert(BitWidth == 32 || BitWidth == 64); + + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(Op, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value + uint64_t NonZeroBits = (~KnownZero).getZExtValue(); + + // Discard a constant AND mask if present. It's safe because the node will + // already have been factored into the computeKnownBits calculation above. + uint64_t AndImm; + if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { + assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); + Op = Op.getOperand(0); + } + + // Don't match if the SHL has more than one use, since then we'll end up + // generating SHL+UBFIZ instead of just keeping SHL+AND. + if (!BiggerPattern && !Op.hasOneUse()) + return false; + + uint64_t ShlImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) + return false; + Op = Op.getOperand(0); + + if (!isShiftedMask_64(NonZeroBits)) + return false; + + ShiftAmount = countTrailingZeros(NonZeroBits); + MaskWidth = countTrailingOnes(NonZeroBits >> ShiftAmount); + + // BFI encompasses sufficiently many nodes that it's worth inserting an extra + // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL + // amount. BiggerPattern is true when this pattern is being matched for BFI, + // BiggerPattern is false when this pattern is being matched for UBFIZ, in + // which case it is not profitable to insert an extra shift. + if (ShlImm - ShiftAmount != 0 && !BiggerPattern) + return false; + Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); + + return true; +} + +// Given a OR operation, check if we have the following pattern +// ubfm c, b, imm, imm2 (or something that does the same jobs, see +// isBitfieldExtractOp) +// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and +// countTrailingZeros(mask2) == imm2 - imm + 1 +// f = d | c +// if yes, given reference arguments will be update so that one can replace +// the OR instruction with: +// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 +static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, + SDValue &Src, unsigned &ImmR, + unsigned &ImmS, const APInt &UsefulBits, + SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); + + // Set Opc + EVT VT = N->getValueType(0); + if (VT == MVT::i32) + Opc = AArch64::BFMWri; + else if (VT == MVT::i64) + Opc = AArch64::BFMXri; + else + return false; + + // Because of simplify-demanded-bits in DAGCombine, involved masks may not + // have the expected shape. Try to undo that. + + unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); + unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); + + // OR is commutative, check all combinations of operand order and values of + // BiggerPattern, i.e. + // Opd0, Opd1, BiggerPattern=false + // Opd1, Opd0, BiggerPattern=false + // Opd0, Opd1, BiggerPattern=true + // Opd1, Opd0, BiggerPattern=true + // Several of these combinations may match, so check with BiggerPattern=false + // first since that will produce better results by matching more instructions + // and/or inserting fewer extra instructions. 
+ for (int I = 0; I < 4; ++I) { + + bool BiggerPattern = I / 2; + SDNode *OrOpd0 = N->getOperand(I % 2).getNode(); + SDValue OrOpd1Val = N->getOperand((I + 1) % 2); + SDNode *OrOpd1 = OrOpd1Val.getNode(); + + unsigned BFXOpc; + int DstLSB, Width; + if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, + NumberOfIgnoredLowBits, BiggerPattern)) { + // Check that the returned opcode is compatible with the pattern, + // i.e., same type and zero extended (U and not S) + if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || + (BFXOpc != AArch64::UBFMWri && VT == MVT::i32)) + continue; + + // Compute the width of the bitfield insertion + DstLSB = 0; + Width = ImmS - ImmR + 1; + // FIXME: This constraint is to catch bitfield insertion we may + // want to widen the pattern if we want to grab general bitfied + // move case + if (Width <= 0) + continue; + + // If the mask on the insertee is correct, we have a BFXIL operation. We + // can share the ImmR and ImmS values from the already-computed UBFM. + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), + BiggerPattern, + Src, DstLSB, Width)) { + ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmS = Width - 1; + } else + continue; + + // Check the second part of the pattern + EVT VT = OrOpd1->getValueType(0); + assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); + + // Compute the Known Zero for the candidate of the first operand. + // This allows to catch more general case than just looking for + // AND with imm. Indeed, simplify-demanded-bits may have removed + // the AND instruction because it proves it was useless. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); + + // Check if there is enough room for the second operand to appear + // in the first one + APInt BitsToBeInserted = + APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); + + if ((BitsToBeInserted & ~KnownZero) != 0) + continue; + + // Set the first operand + uint64_t Imm; + if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && + isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) + // In that case, we can eliminate the AND + Dst = OrOpd1->getOperand(0); + else + // Maybe the AND has been removed by simplify-demanded-bits + // or is useful because it discards more bits + Dst = OrOpd1Val; + + // both parts match + return true; + } + + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { + if (N->getOpcode() != ISD::OR) + return nullptr; + + unsigned Opc; + unsigned LSB, MSB; + SDValue Opd0, Opd1; + EVT VT = N->getValueType(0); + APInt NUsefulBits; + getUsefulBits(SDValue(N, 0), NUsefulBits); + + // If all bits are not useful, just return UNDEF. + if (!NUsefulBits) + return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); + + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, + CurDAG)) + return nullptr; + + SDLoc dl(N); + SDValue Ops[] = { Opd0, + Opd1, + CurDAG->getTargetConstant(LSB, dl, VT), + CurDAG->getTargetConstant(MSB, dl, VT) }; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the +/// equivalent of a left shift by a constant amount followed by an and masking +/// out a contiguous set of bits. 
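+/// For example, (and (shl w0, 4), 0xff0) becomes "ubfiz w0, w0, #4, #8", i.e.
+/// UBFMWri with ImmR = (32 - 4) % 32 = 28 and ImmS = 8 - 1 = 7.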
+SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { + if (N->getOpcode() != ISD::AND) + return nullptr; + + EVT VT = N->getValueType(0); + unsigned Opc; + if (VT == MVT::i32) + Opc = AArch64::UBFMWri; + else if (VT == MVT::i64) + Opc = AArch64::UBFMXri; + else + return nullptr; + + SDValue Op0; + int DstLSB, Width; + if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false, + Op0, DstLSB, Width)) + return nullptr; + + // ImmR is the rotate right amount. + unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + // ImmS is the most significant bit of the source to be moved. + unsigned ImmS = Width - 1; + + SDLoc DL(N); + SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +bool +AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, + unsigned RegWidth) { + APFloat FVal(0.0); + if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N)) + FVal = CN->getValueAPF(); + else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) { + // Some otherwise illegal constants are allowed in this case. + if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow || + !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1))) + return false; + + ConstantPoolSDNode *CN = + dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)); + FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF(); + } else + return false; + + // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits + // is between 1 and 32 for a destination w-register, or 1 and 64 for an + // x-register. + // + // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we + // want THIS_NODE to be 2^fbits. This is much easier to deal with using + // integers. + bool IsExact; + + // fbits is between 1 and 64 in the worst-case, which means the fmul + // could have 2^64 as an actual operand. Need 65 bits of precision. + APSInt IntVal(65, true); + FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact); + + // N.b. isPowerOf2 also checks for > 0. + if (!IsExact || !IntVal.isPowerOf2()) return false; + unsigned FBits = IntVal.logBase2(); + + // Checks above should have guaranteed that we haven't lost information in + // finding FBits, but it must still be in range. + if (FBits == 0 || FBits > RegWidth) return false; + + FixedPos = CurDAG->getTargetConstant(FBits, SDLoc(N), MVT::i32); + return true; +} + +// Inspects a register string of the form o0:op1:CRn:CRm:op2 gets the fields +// of the string and obtains the integer values from them and combines these +// into a single value to be used in the MRS/MSR instruction. +static int getIntOperandFromRegisterString(StringRef RegString) { + SmallVector<StringRef, 5> Fields; + RegString.split(Fields, ':'); + + if (Fields.size() == 1) + return -1; + + assert(Fields.size() == 5 + && "Invalid number of fields in read register string"); + + SmallVector<int, 5> Ops; + bool AllIntFields = true; + + for (StringRef Field : Fields) { + unsigned IntField; + AllIntFields &= !Field.getAsInteger(10, IntField); + Ops.push_back(IntField); + } + + assert(AllIntFields && + "Unexpected non-integer value in special register string."); + + // Need to combine the integer fields of the string into a single value + // based on the bit encoding of MRS/MSR instruction. 
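+  // e.g. fields o0=3, op1=3, CRn=13, CRm=0, op2=2 (TPIDR_EL0) pack to
+  // (3<<14) | (3<<11) | (13<<7) | (0<<3) | 2 = 0xde82.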
+ return (Ops[0] << 14) | (Ops[1] << 11) | (Ops[2] << 7) | + (Ops[3] << 3) | (Ops[4]); +} + +// Lower the read_register intrinsic to an MRS instruction node if the special +// register string argument is either of the form detailed in the ALCE (the +// form described in getIntOperandsFromRegsterString) or is a named register +// known by the MRS SysReg mapper. +SDNode *AArch64DAGToDAGISel::SelectReadRegister(SDNode *N) { + const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); + const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); + SDLoc DL(N); + + int Reg = getIntOperandFromRegisterString(RegString->getString()); + if (Reg != -1) + return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0), + MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(0)); + + // Use the sysreg mapper to map the remaining possible strings to the + // value for the register to be used for the instruction operand. + AArch64SysReg::MRSMapper mapper; + bool IsValidSpecialReg; + Reg = mapper.fromString(RegString->getString(), + Subtarget->getFeatureBits(), + IsValidSpecialReg); + if (IsValidSpecialReg) + return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0), + MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(0)); + + return nullptr; +} + +// Lower the write_register intrinsic to an MSR instruction node if the special +// register string argument is either of the form detailed in the ALCE (the +// form described in getIntOperandsFromRegsterString) or is a named register +// known by the MSR SysReg mapper. +SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { + const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); + const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); + SDLoc DL(N); + + int Reg = getIntOperandFromRegisterString(RegString->getString()); + if (Reg != -1) + return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(2), N->getOperand(0)); + + // Check if the register was one of those allowed as the pstatefield value in + // the MSR (immediate) instruction. To accept the values allowed in the + // pstatefield for the MSR (immediate) instruction, we also require that an + // immediate value has been provided as an argument, we know that this is + // the case as it has been ensured by semantic checking. + AArch64PState::PStateMapper PMapper; + bool IsValidSpecialReg; + Reg = PMapper.fromString(RegString->getString(), + Subtarget->getFeatureBits(), + IsValidSpecialReg); + if (IsValidSpecialReg) { + assert (isa<ConstantSDNode>(N->getOperand(2)) + && "Expected a constant integer expression."); + uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); + unsigned State; + if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) { + assert(Immed < 2 && "Bad imm"); + State = AArch64::MSRpstateImm1; + } else { + assert(Immed < 16 && "Bad imm"); + State = AArch64::MSRpstateImm4; + } + return CurDAG->getMachineNode(State, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + CurDAG->getTargetConstant(Immed, DL, MVT::i16), + N->getOperand(0)); + } + + // Use the sysreg mapper to attempt to map the remaining possible strings + // to the value for the register to be used for the MSR (register) + // instruction operand. 
+ AArch64SysReg::MSRMapper Mapper; + Reg = Mapper.fromString(RegString->getString(), + Subtarget->getFeatureBits(), + IsValidSpecialReg); + + if (IsValidSpecialReg) + return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(2), N->getOperand(0)); + + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { + // Dump information about the Node being selected + DEBUG(errs() << "Selecting: "); + DEBUG(Node->dump(CurDAG)); + DEBUG(errs() << "\n"); + + // If we have a custom node, we already have selected! + if (Node->isMachineOpcode()) { + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); + Node->setNodeId(-1); + return nullptr; + } + + // Few custom selection stuff. + SDNode *ResNode = nullptr; + EVT VT = Node->getValueType(0); + + switch (Node->getOpcode()) { + default: + break; + + case ISD::READ_REGISTER: + if (SDNode *Res = SelectReadRegister(Node)) + return Res; + break; + + case ISD::WRITE_REGISTER: + if (SDNode *Res = SelectWriteRegister(Node)) + return Res; + break; + + case ISD::ADD: + if (SDNode *I = SelectMLAV64LaneV128(Node)) + return I; + break; + + case ISD::LOAD: { + // Try to select as an indexed load. Fall through to normal processing + // if we can't. + bool Done = false; + SDNode *I = SelectIndexedLoad(Node, Done); + if (Done) + return I; + break; + } + + case ISD::SRL: + case ISD::AND: + case ISD::SRA: + if (SDNode *I = SelectBitfieldExtractOp(Node)) + return I; + if (SDNode *I = SelectBitfieldInsertInZeroOp(Node)) + return I; + break; + + case ISD::OR: + if (SDNode *I = SelectBitfieldInsertOp(Node)) + return I; + break; + + case ISD::EXTRACT_VECTOR_ELT: { + // Extracting lane zero is a special case where we can just use a plain + // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for + // the rest of the compiler, especially the register allocator and copyi + // propagation, to reason about, so is preferred when it's possible to + // use it. + ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1)); + // Bail and use the default Select() for non-zero lanes. + if (LaneNode->getZExtValue() != 0) + break; + // If the element type is not the same as the result type, likewise + // bail and use the default Select(), as there's more to do than just + // a cross-class COPY. This catches extracts of i8 and i16 elements + // since they will need an explicit zext. + if (VT != Node->getOperand(0).getValueType().getVectorElementType()) + break; + unsigned SubReg; + switch (Node->getOperand(0) + .getValueType() + .getVectorElementType() + .getSizeInBits()) { + default: + llvm_unreachable("Unexpected vector element type!"); + case 64: + SubReg = AArch64::dsub; + break; + case 32: + SubReg = AArch64::ssub; + break; + case 16: + SubReg = AArch64::hsub; + break; + case 8: + llvm_unreachable("unexpected zext-requiring extract element!"); + } + SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, + Node->getOperand(0)); + DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); + DEBUG(Extract->dumpr(CurDAG)); + DEBUG(dbgs() << "\n"); + return Extract.getNode(); + } + case ISD::Constant: { + // Materialize zero constants as copies from WZR/XZR. This allows + // the coalescer to propagate these into other instructions. 
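+    // For instance, storing an i64 zero can then typically be emitted as
+    //   str xzr, [xN]
+    // rather than materializing 0 into a scratch register first.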
+ ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node); + if (ConstNode->isNullValue()) { + if (VT == MVT::i32) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::WZR, MVT::i32).getNode(); + else if (VT == MVT::i64) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::XZR, MVT::i64).getNode(); + } + break; + } + + case ISD::FrameIndex: { + // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm. + int FI = cast<FrameIndexSDNode>(Node)->getIndex(); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + const TargetLowering *TLI = getTargetLowering(); + SDValue TFI = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + SDLoc DL(Node); + SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32), + CurDAG->getTargetConstant(Shifter, DL, MVT::i32) }; + return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX; + SDValue MemAddr = Node->getOperand(2); + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + + SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64, + MVT::Other, MemAddr, Chain); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand(); + cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1); + return Ld; + } + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX; + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + SDValue ValLo = Node->getOperand(2); + SDValue ValHi = Node->getOperand(3); + SDValue MemAddr = Node->getOperand(4); + + // Place arguments in the right order. + SDValue Ops[] = {ValLo, ValHi, MemAddr, Chain}; + + SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); + // Transfer memoperands. 
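+      // Keeping the MachineMemOperand on the new node preserves the size,
+      // alignment and volatility of the original access, which later passes
+      // (the scheduler, alias-analysis based optimizations) can rely on.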
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand(); + cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); + + return St; + } + case Intrinsic::aarch64_neon_ld1x2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Twov2s, 
AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 3, 
AArch64::LD3Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 2, AArch64::LD2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectLoadLane(Node, 2, AArch64::LD2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 2, AArch64::LD2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 2, AArch64::LD2i64); + break; + case Intrinsic::aarch64_neon_ld3lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 3, AArch64::LD3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectLoadLane(Node, 3, AArch64::LD3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 3, AArch64::LD3i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 3, AArch64::LD3i64); + break; + case Intrinsic::aarch64_neon_ld4lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 4, AArch64::LD4i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectLoadLane(Node, 4, AArch64::LD4i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 4, AArch64::LD4i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 4, AArch64::LD4i64); + break; + } + } break; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_tbl2: + return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two + : AArch64::TBLv16i8Two, + false); + case Intrinsic::aarch64_neon_tbl3: + return SelectTable(Node, 3, VT == MVT::v8i8 ? 
AArch64::TBLv8i8Three + : AArch64::TBLv16i8Three, + false); + case Intrinsic::aarch64_neon_tbl4: + return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four + : AArch64::TBLv16i8Four, + false); + case Intrinsic::aarch64_neon_tbx2: + return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two + : AArch64::TBXv16i8Two, + true); + case Intrinsic::aarch64_neon_tbx3: + return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three + : AArch64::TBXv16i8Three, + true); + case Intrinsic::aarch64_neon_tbx4: + return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four + : AArch64::TBXv16i8Four, + true); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) + return N; + break; + } + break; + } + case ISD::INTRINSIC_VOID: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + if (Node->getNumOperands() >= 3) + VT = Node->getOperand(2)->getValueType(0); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_st1x2: { + if (VT == MVT::v8i8) + return SelectStore(Node, 2, AArch64::ST1Twov8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 2, AArch64::ST1Twov16b); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectStore(Node, 2, AArch64::ST1Twov4h); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectStore(Node, 2, AArch64::ST1Twov8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 2, AArch64::ST1Twov2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 2, AArch64::ST1Twov4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 2, AArch64::ST1Twov2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 2, AArch64::ST1Twov1d); + break; + } + case Intrinsic::aarch64_neon_st1x3: { + if (VT == MVT::v8i8) + return SelectStore(Node, 3, AArch64::ST1Threev8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 3, AArch64::ST1Threev16b); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectStore(Node, 3, AArch64::ST1Threev4h); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectStore(Node, 3, AArch64::ST1Threev8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 3, AArch64::ST1Threev2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 3, AArch64::ST1Threev4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 3, AArch64::ST1Threev2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 3, AArch64::ST1Threev1d); + break; + } + case Intrinsic::aarch64_neon_st1x4: { + if (VT == MVT::v8i8) + return SelectStore(Node, 4, AArch64::ST1Fourv8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 4, AArch64::ST1Fourv16b); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectStore(Node, 4, AArch64::ST1Fourv4h); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectStore(Node, 4, AArch64::ST1Fourv8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 4, AArch64::ST1Fourv2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 4, AArch64::ST1Fourv4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 4, AArch64::ST1Fourv2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 4, AArch64::ST1Fourv1d); + break; + } + case Intrinsic::aarch64_neon_st2: { + if (VT == MVT::v8i8) + return 
SelectStore(Node, 2, AArch64::ST2Twov8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 2, AArch64::ST2Twov16b); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectStore(Node, 2, AArch64::ST2Twov4h); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectStore(Node, 2, AArch64::ST2Twov8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 2, AArch64::ST2Twov2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 2, AArch64::ST2Twov4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 2, AArch64::ST2Twov2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 2, AArch64::ST1Twov1d); + break; + } + case Intrinsic::aarch64_neon_st3: { + if (VT == MVT::v8i8) + return SelectStore(Node, 3, AArch64::ST3Threev8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 3, AArch64::ST3Threev16b); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectStore(Node, 3, AArch64::ST3Threev4h); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectStore(Node, 3, AArch64::ST3Threev8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 3, AArch64::ST3Threev2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 3, AArch64::ST3Threev4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 3, AArch64::ST3Threev2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 3, AArch64::ST1Threev1d); + break; + } + case Intrinsic::aarch64_neon_st4: { + if (VT == MVT::v8i8) + return SelectStore(Node, 4, AArch64::ST4Fourv8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 4, AArch64::ST4Fourv16b); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectStore(Node, 4, AArch64::ST4Fourv4h); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectStore(Node, 4, AArch64::ST4Fourv8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 4, AArch64::ST4Fourv2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 4, AArch64::ST4Fourv4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 4, AArch64::ST4Fourv2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 4, AArch64::ST1Fourv1d); + break; + } + case Intrinsic::aarch64_neon_st2lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 2, AArch64::ST2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectStoreLane(Node, 2, AArch64::ST2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 2, AArch64::ST2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 2, AArch64::ST2i64); + break; + } + case Intrinsic::aarch64_neon_st3lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 3, AArch64::ST3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectStoreLane(Node, 3, AArch64::ST3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 3, AArch64::ST3i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 3, AArch64::ST3i64); + break; + } + case 
Intrinsic::aarch64_neon_st4lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 4, AArch64::ST4i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectStoreLane(Node, 4, AArch64::ST4i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 4, AArch64::ST4i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 4, AArch64::ST4i64); + break; + } + } + break; + } + case AArch64ISD::LD2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, 
AArch64::qsub0); + break; + } + case AArch64ISD::LD1x2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) 
+ return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD2DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD3DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD4DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); + else if (VT == MVT::v8i16 || VT 
== MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + break; + } + case AArch64ISD::LD2LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + break; + } + case AArch64ISD::LD3LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST); + break; + } + case AArch64ISD::LD4LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST); + break; + } + case AArch64ISD::ST2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + break; + } + case AArch64ISD::ST3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); + else if (VT == MVT::v8i16 || VT == 
MVT::v8f16) + return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + break; + } + case AArch64ISD::ST4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + break; + } + case AArch64ISD::ST1x2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST); + break; + } + case AArch64ISD::ST1x3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); + else if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST); + break; + } + case AArch64ISD::ST1x4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); + else 
if (VT == MVT::v4i16 || VT == MVT::v4f16) + return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); + else if (VT == MVT::v8i16 || VT == MVT::v8f16) + return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST); + break; + } + case AArch64ISD::ST2LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST); + break; + } + case AArch64ISD::ST3LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST); + break; + } + case AArch64ISD::ST4LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) + return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); + break; + } + } + + // Select the default instruction + ResNode = SelectCode(Node); + + DEBUG(errs() << "=> "); + if (ResNode == nullptr || ResNode == Node) + DEBUG(Node->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DEBUG(errs() << "\n"); + + return ResNode; +} + +/// createAArch64ISelDag - This pass converts a legalized DAG into a +/// AArch64-specific DAG, ready for instruction scheduling. 
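+///
+/// The target machine is expected to schedule this pass from its instruction
+/// selector hook, roughly:
+///   addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()));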
+FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new AArch64DAGToDAGISel(TM, OptLevel); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp new file mode 100644 index 0000000..9f5beff --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -0,0 +1,10064 @@ +//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64TargetLowering class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64ISelLowering.h" +#include "AArch64CallingConvention.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64PerfectShuffle.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "AArch64TargetObjectFile.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-lower" + +STATISTIC(NumTailCalls, "Number of tail calls"); +STATISTIC(NumShiftInserts, "Number of vector shift inserts"); + +// Place holder until extr generation is tested fully. +static cl::opt<bool> +EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, + cl::desc("Allow AArch64 (or (shift)(shift))->extract"), + cl::init(true)); + +static cl::opt<bool> +EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, + cl::desc("Allow AArch64 SLI/SRI formation"), + cl::init(false)); + +// FIXME: The necessary dtprel relocations don't seem to be supported +// well in the GNU bfd and gold linkers at the moment. Therefore, by +// default, for now, fall back to GeneralDynamic code generation. +cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( + "aarch64-elf-ldtls-generation", cl::Hidden, + cl::desc("Allow AArch64 Local Dynamic TLS code generation"), + cl::init(false)); + +/// Value type used for condition codes. +static const MVT MVT_CC = MVT::i32; + +AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, + const AArch64Subtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + + // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so + // we have to make something up. Arbitrarily, choose ZeroOrOne. + setBooleanContents(ZeroOrOneBooleanContent); + // When comparing vectors the result sets the different elements in the + // vector to all-one or all-zero. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // Set up the register classes. 
+ addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); + addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); + + if (Subtarget->hasFPARMv8()) { + addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); + } + + if (Subtarget->hasNEON()) { + addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); + addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); + // Someone set us up the NEON. + addDRTypeForNEON(MVT::v2f32); + addDRTypeForNEON(MVT::v8i8); + addDRTypeForNEON(MVT::v4i16); + addDRTypeForNEON(MVT::v2i32); + addDRTypeForNEON(MVT::v1i64); + addDRTypeForNEON(MVT::v1f64); + addDRTypeForNEON(MVT::v4f16); + + addQRTypeForNEON(MVT::v4f32); + addQRTypeForNEON(MVT::v2f64); + addQRTypeForNEON(MVT::v16i8); + addQRTypeForNEON(MVT::v8i16); + addQRTypeForNEON(MVT::v4i32); + addQRTypeForNEON(MVT::v2i64); + addQRTypeForNEON(MVT::v8f16); + } + + // Compute derived properties from the register classes + computeRegisterProperties(Subtarget->getRegisterInfo()); + + // Provide all sorts of operation actions + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::i32, Custom); + setOperationAction(ISD::SETCC, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::f32, Custom); + setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Custom); + setOperationAction(ISD::BR_CC, MVT::i64, Custom); + setOperationAction(ISD::BR_CC, MVT::f32, Custom); + setOperationAction(ISD::BR_CC, MVT::f64, Custom); + setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::JumpTable, MVT::i64, Custom); + + setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f80, Expand); + + // Custom lowering hooks are needed for XOR + // to fold it into CSINC/CSINV. + setOperationAction(ISD::XOR, MVT::i32, Custom); + setOperationAction(ISD::XOR, MVT::i64, Custom); + + // Virtually no operation on f128 is legal, but LLVM can't expand them when + // there's a valid register class, so we need custom operations in most cases. 
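+  // The Custom entries below mostly become libcalls; an f128 FADD, for
+  // example, is expected to end up as a call to the __addtf3 soft-float
+  // routine rather than being expanded inline.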
+ setOperationAction(ISD::FABS, MVT::f128, Expand); + setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FMA, MVT::f128, Expand); + setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FNEG, MVT::f128, Expand); + setOperationAction(ISD::FPOW, MVT::f128, Expand); + setOperationAction(ISD::FREM, MVT::f128, Expand); + setOperationAction(ISD::FRINT, MVT::f128, Expand); + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FSINCOS, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FTRUNC, MVT::f128, Expand); + setOperationAction(ISD::SETCC, MVT::f128, Custom); + setOperationAction(ISD::BR_CC, MVT::f128, Custom); + setOperationAction(ISD::SELECT, MVT::f128, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + + // Lowering for many of the conversions is actually specified by the non-f128 + // type. The LowerXXX function will be trivial when f128 isn't involved. + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + + // Variable arguments. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VACOPY, MVT::Other, Custom); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + + // Variable-sized objects. + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + + // Constant pool entries + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); + + // BlockAddress + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); + + // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. + setOperationAction(ISD::ADDC, MVT::i32, Custom); + setOperationAction(ISD::ADDE, MVT::i32, Custom); + setOperationAction(ISD::SUBC, MVT::i32, Custom); + setOperationAction(ISD::SUBE, MVT::i32, Custom); + setOperationAction(ISD::ADDC, MVT::i64, Custom); + setOperationAction(ISD::ADDE, MVT::i64, Custom); + setOperationAction(ISD::SUBC, MVT::i64, Custom); + setOperationAction(ISD::SUBE, MVT::i64, Custom); + + // AArch64 lacks both left-rotate and popcount instructions. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } + + // AArch64 doesn't have {U|S}MUL_LOHI. 
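+  // Expanding these lets the legalizer rebuild a full 64x64->128 bit multiply
+  // from the separate MUL and {U|S}MULH instructions, roughly:
+  //   mul   x0, x1, x2    // low 64 bits
+  //   umulh x3, x1, x2    // high 64 bits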
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + + + // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero + // counterparts, which AArch64 supports directly. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i64, Custom); + + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + } + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + // Custom lower Add/Sub/Mul with overflow. + setOperationAction(ISD::SADDO, MVT::i32, Custom); + setOperationAction(ISD::SADDO, MVT::i64, Custom); + setOperationAction(ISD::UADDO, MVT::i32, Custom); + setOperationAction(ISD::UADDO, MVT::i64, Custom); + setOperationAction(ISD::SSUBO, MVT::i32, Custom); + setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::USUBO, MVT::i32, Custom); + setOperationAction(ISD::USUBO, MVT::i64, Custom); + setOperationAction(ISD::SMULO, MVT::i32, Custom); + setOperationAction(ISD::SMULO, MVT::i64, Custom); + setOperationAction(ISD::UMULO, MVT::i32, Custom); + setOperationAction(ISD::UMULO, MVT::i64, Custom); + + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // f16 is a storage-only type, always promote it to f32. 
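+  // In practice a scalar f16 operation is emitted as a convert, operate,
+  // convert-back sequence; for an f16 add, roughly:
+  //   fcvt s0, h0
+  //   fcvt s1, h1
+  //   fadd s0, s0, s1
+  //   fcvt h0, s0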
+ setOperationAction(ISD::SETCC, MVT::f16, Promote); + setOperationAction(ISD::BR_CC, MVT::f16, Promote); + setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); + setOperationAction(ISD::SELECT, MVT::f16, Promote); + setOperationAction(ISD::FADD, MVT::f16, Promote); + setOperationAction(ISD::FSUB, MVT::f16, Promote); + setOperationAction(ISD::FMUL, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::FMA, MVT::f16, Promote); + setOperationAction(ISD::FNEG, MVT::f16, Promote); + setOperationAction(ISD::FABS, MVT::f16, Promote); + setOperationAction(ISD::FCEIL, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + setOperationAction(ISD::FPOWI, MVT::f16, Promote); + setOperationAction(ISD::FRINT, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FSQRT, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + setOperationAction(ISD::FROUND, MVT::f16, Promote); + setOperationAction(ISD::FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINNAN, MVT::f16, Promote); + setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); + + // v4f16 is also a storage-only type, so promote it to v4f32 when that is + // known to be safe. + setOperationAction(ISD::FADD, MVT::v4f16, Promote); + setOperationAction(ISD::FSUB, MVT::v4f16, Promote); + setOperationAction(ISD::FMUL, MVT::v4f16, Promote); + setOperationAction(ISD::FDIV, MVT::v4f16, Promote); + setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); + setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); + AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); + + // Expand all other v4f16 operations. 
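+  // Expand generally means these are unrolled to scalar f16 operations, which
+  // are in turn promoted to f32 as above, so the result is correct but not
+  // particularly fast.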
+ // FIXME: We could generate better code by promoting some operations to + // a pair of v4f32s + setOperationAction(ISD::FABS, MVT::v4f16, Expand); + setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); + setOperationAction(ISD::FCOS, MVT::v4f16, Expand); + setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); + setOperationAction(ISD::FMA, MVT::v4f16, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); + setOperationAction(ISD::FNEG, MVT::v4f16, Expand); + setOperationAction(ISD::FPOW, MVT::v4f16, Expand); + setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); + setOperationAction(ISD::FREM, MVT::v4f16, Expand); + setOperationAction(ISD::FROUND, MVT::v4f16, Expand); + setOperationAction(ISD::FRINT, MVT::v4f16, Expand); + setOperationAction(ISD::FSIN, MVT::v4f16, Expand); + setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); + setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); + setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); + setOperationAction(ISD::SETCC, MVT::v4f16, Expand); + setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); + setOperationAction(ISD::SELECT, MVT::v4f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); + setOperationAction(ISD::FEXP, MVT::v4f16, Expand); + setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); + setOperationAction(ISD::FLOG, MVT::v4f16, Expand); + setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); + setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); + + + // v8f16 is also a storage-only type, so expand it. + setOperationAction(ISD::FABS, MVT::v8f16, Expand); + setOperationAction(ISD::FADD, MVT::v8f16, Expand); + setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); + setOperationAction(ISD::FCOS, MVT::v8f16, Expand); + setOperationAction(ISD::FDIV, MVT::v8f16, Expand); + setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); + setOperationAction(ISD::FMA, MVT::v8f16, Expand); + setOperationAction(ISD::FMUL, MVT::v8f16, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); + setOperationAction(ISD::FNEG, MVT::v8f16, Expand); + setOperationAction(ISD::FPOW, MVT::v8f16, Expand); + setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); + setOperationAction(ISD::FREM, MVT::v8f16, Expand); + setOperationAction(ISD::FROUND, MVT::v8f16, Expand); + setOperationAction(ISD::FRINT, MVT::v8f16, Expand); + setOperationAction(ISD::FSIN, MVT::v8f16, Expand); + setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); + setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); + setOperationAction(ISD::FSUB, MVT::v8f16, Expand); + setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); + setOperationAction(ISD::SETCC, MVT::v8f16, Expand); + setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); + setOperationAction(ISD::SELECT, MVT::v8f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); + setOperationAction(ISD::FEXP, MVT::v8f16, Expand); + setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); + setOperationAction(ISD::FLOG, MVT::v8f16, Expand); + setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); + setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); + + // AArch64 has implementations of a lot of rounding-like FP operations. 
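+  // As a rough, non-authoritative guide, these legal operations are expected
+  // to map onto the FRINT family of instructions, e.g. FFLOOR -> frintm,
+  // FCEIL -> frintp, FTRUNC -> frintz, FROUND -> frinta, FRINT -> frintx,
+  // FNEARBYINT -> frinti.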
+ for (MVT Ty : {MVT::f32, MVT::f64}) { + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + setOperationAction(ISD::FMINNUM, Ty, Legal); + setOperationAction(ISD::FMAXNUM, Ty, Legal); + setOperationAction(ISD::FMINNAN, Ty, Legal); + setOperationAction(ISD::FMAXNAN, Ty, Legal); + } + + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. + // This requires the Performance Monitors extension. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + + if (Subtarget->isTargetMachO()) { + // For iOS, we don't want to the normal expansion of a libcall to + // sincos. We want to issue a libcall to __sincos_stret to avoid memory + // traffic. + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } else { + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + } + + // Make floating-point constants legal for the large code model, so they don't + // become loads from the constant pool. + if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + } + + // AArch64 does not have floating-point extending loads, i1 sign-extending + // load, floating-point truncating stores, or v2i32->v2i16 truncating store. + for (MVT VT : MVT::fp_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + } + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); + + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f128, MVT::f80, Expand); + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); + + setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::BITCAST, MVT::f16, Custom); + + // Indexed loads and stores are supported. + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::i8, Legal); + setIndexedLoadAction(im, MVT::i16, Legal); + setIndexedLoadAction(im, MVT::i32, Legal); + setIndexedLoadAction(im, MVT::i64, Legal); + setIndexedLoadAction(im, MVT::f64, Legal); + setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedLoadAction(im, MVT::f16, Legal); + setIndexedStoreAction(im, MVT::i8, Legal); + setIndexedStoreAction(im, MVT::i16, Legal); + setIndexedStoreAction(im, MVT::i32, Legal); + setIndexedStoreAction(im, MVT::i64, Legal); + setIndexedStoreAction(im, MVT::f64, Legal); + setIndexedStoreAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::f16, Legal); + } + + // Trap. + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // We combine OR nodes for bitfield operations. + setTargetDAGCombine(ISD::OR); + + // Vector add and sub nodes may conceal a high-half opportunity. 
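+  // Illustrative example of such an opportunity (a sketch, not a guarantee):
+  //   (add (zext (extract_high v16i8 a)), (zext (extract_high v16i8 b)))
+  // is a candidate for a single UADDL2 instead of moving the high halves down
+  // into a D register first.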
+ // Also, try to fold ADD into CSINC/CSINV.. + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); + + setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); + + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::CONCAT_VECTORS); + setTargetDAGCombine(ISD::STORE); + if (Subtarget->supportsAddressTopByteIgnored()) + setTargetDAGCombine(ISD::LOAD); + + setTargetDAGCombine(ISD::MUL); + + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::VSELECT); + + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; + + setStackPointerRegisterToSaveRestore(AArch64::SP); + + setSchedulingPreference(Sched::Hybrid); + + // Enable TBZ/TBNZ + MaskAndBranchFoldingIsLegal = true; + EnableExtLdPromotion = true; + + setMinFunctionAlignment(2); + + setHasExtractBitsInsn(true); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + if (Subtarget->hasNEON()) { + // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to + // silliness like this: + setOperationAction(ISD::FABS, MVT::v1f64, Expand); + setOperationAction(ISD::FADD, MVT::v1f64, Expand); + setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); + setOperationAction(ISD::FCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FDIV, MVT::v1f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); + setOperationAction(ISD::FMA, MVT::v1f64, Expand); + setOperationAction(ISD::FMUL, MVT::v1f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); + setOperationAction(ISD::FNEG, MVT::v1f64, Expand); + setOperationAction(ISD::FPOW, MVT::v1f64, Expand); + setOperationAction(ISD::FREM, MVT::v1f64, Expand); + setOperationAction(ISD::FROUND, MVT::v1f64, Expand); + setOperationAction(ISD::FRINT, MVT::v1f64, Expand); + setOperationAction(ISD::FSIN, MVT::v1f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); + setOperationAction(ISD::FSUB, MVT::v1f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); + setOperationAction(ISD::SETCC, MVT::v1f64, Expand); + setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); + + setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); + + setOperationAction(ISD::MUL, MVT::v1i64, Expand); + + // AArch64 doesn't have a direct vector ->f32 conversion instructions for + // elements smaller than i32, so promote the input to i32 first. 
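+    // e.g. (v4f32 (uint_to_fp (v4i16 x))) is expected to become
+    //      (v4f32 (uint_to_fp (v4i32 (zero_extend x)))),
+    // i.e. a widen followed by ucvtf (sketch; exact isel output may differ).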
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+    // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
+    // -> v8f16 conversions.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
+    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+    // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
+    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+    // AArch64 doesn't have MUL.2d:
+    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    // Custom handling for some quad-vector types to detect MULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
+    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+    // Likewise, narrowing and extending vector loads/stores aren't handled
+    // directly.
+    for (MVT VT : MVT::vector_valuetypes()) {
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+      setOperationAction(ISD::BSWAP, VT, Expand);
+
+      for (MVT InnerVT : MVT::vector_valuetypes()) {
+        setTruncStoreAction(VT, InnerVT, Expand);
+        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+      }
+    }
+
+    // AArch64 has implementations of a lot of rounding-like FP operations.
+    for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
+      setOperationAction(ISD::FFLOOR, Ty, Legal);
+      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+      setOperationAction(ISD::FCEIL, Ty, Legal);
+      setOperationAction(ISD::FRINT, Ty, Legal);
+      setOperationAction(ISD::FTRUNC, Ty, Legal);
+      setOperationAction(ISD::FROUND, Ty, Legal);
+    }
+  }
+
+  // Prefer likely predicted branches to selects on out-of-order cores.
+ if (Subtarget->isCortexA57()) + PredictableSelectIsExpensive = true; +} + +void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { + if (VT == MVT::v2f32 || VT == MVT::v4f16) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + } + + // Mark vector float intrinsics as expand. + if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { + setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + + // But we do support custom-lowering for FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); + } + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); + setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); + + setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); + setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + for (MVT InnerVT : MVT::all_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); + + // CNT supports only B element sizes. + if (VT != MVT::v8i8 && VT != MVT::v16i8) + setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + + // [SU][MIN|MAX] are available for all NEON types apart from i64. 
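+  // For instance (sketch), a v4i32 smax should map directly onto SMAX.4S,
+  // while v2i64/v1i64 have no such instruction and fall back to the default
+  // expansion (compare + select), hence the guard below.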
+  if (!VT.isFloatingPoint() &&
+      VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
+    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+      setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+
+  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!)
+  if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
+    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
+                            ISD::FMINNUM, ISD::FMAXNUM})
+      setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+
+  if (Subtarget->isLittleEndian()) {
+    for (unsigned im = (unsigned)ISD::PRE_INC;
+         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
+      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+    }
+  }
+}
+
+void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR64RegClass);
+  addTypeForNEON(VT, MVT::v2i32);
+}
+
+void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR128RegClass);
+  addTypeForNEON(VT, MVT::v4i32);
+}
+
+EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+                                              EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// computeKnownBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+    const SelectionDAG &DAG, unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    break;
+  case AArch64ISD::CSEL: {
+    APInt KnownZero2, KnownOne2;
+    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+    KnownZero &= KnownZero2;
+    KnownOne &= KnownOne2;
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::aarch64_ldaxr:
+    case Intrinsic::aarch64_ldxr: {
+      unsigned BitWidth = KnownOne.getBitWidth();
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_umaxv:
+    case Intrinsic::aarch64_neon_uminv: {
+      // Figure out the datatype of the vector operand. The UMINV instruction
+      // will zero extend the result, so we can mark as known zero all the
+      // bits larger than the element datatype. 32-bit or larger doesn't need
+      // this as those are legal types and will be handled by isel directly.
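+      // For example (sketch), a umaxv/uminv over v8i8 or v16i8 that produces
+      // an i32 result can only populate bits [7:0], so bits [31:8] are known
+      // to be zero.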
+ MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); + unsigned BitWidth = KnownZero.getBitWidth(); + if (VT == MVT::v8i8 || VT == MVT::v16i8) { + assert(BitWidth >= 8 && "Unexpected width!"); + APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); + KnownZero |= Mask; + } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { + assert(BitWidth >= 16 && "Unexpected width!"); + APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); + KnownZero |= Mask; + } + break; + } break; + } + } + } +} + +MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, + EVT) const { + return MVT::i64; +} + +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + // FIXME: This is mostly true for Cyclone, but not necessarily others. + if (Fast) { + // FIXME: Define an attribute for slow unaligned accesses instead of + // relying on the CPU type as a proxy. + // On Cyclone, unaligned 128-bit stores are slow. + *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. + VT == MVT::v2i64; + } + return true; +} + +FastISel * +AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return AArch64::createFastISel(funcInfo, libInfo); +} + +const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((AArch64ISD::NodeType)Opcode) { + case AArch64ISD::FIRST_NUMBER: break; + case AArch64ISD::CALL: return "AArch64ISD::CALL"; + case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; + case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; + case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; + case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; + case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; + case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; + case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; + case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; + case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; + case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; + case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; + case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; + case AArch64ISD::ADC: return "AArch64ISD::ADC"; + case AArch64ISD::SBC: return "AArch64ISD::SBC"; + case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; + case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; + case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; + case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; + case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; + case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; + case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; + case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; + case AArch64ISD::DUP: return "AArch64ISD::DUP"; + case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; + case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; + case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; + case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; + case AArch64ISD::MOVI: 
return "AArch64ISD::MOVI"; + case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; + case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; + case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; + case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; + case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; + case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; + case AArch64ISD::BICi: return "AArch64ISD::BICi"; + case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; + case AArch64ISD::BSL: return "AArch64ISD::BSL"; + case AArch64ISD::NEG: return "AArch64ISD::NEG"; + case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; + case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; + case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; + case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; + case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; + case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; + case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; + case AArch64ISD::REV16: return "AArch64ISD::REV16"; + case AArch64ISD::REV32: return "AArch64ISD::REV32"; + case AArch64ISD::REV64: return "AArch64ISD::REV64"; + case AArch64ISD::EXT: return "AArch64ISD::EXT"; + case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; + case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; + case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; + case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; + case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; + case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; + case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; + case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; + case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; + case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; + case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; + case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; + case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; + case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; + case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; + case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; + case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; + case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; + case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; + case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; + case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; + case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; + case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; + case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; + case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; + case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; + case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; + case AArch64ISD::NOT: return "AArch64ISD::NOT"; + case AArch64ISD::BIT: return "AArch64ISD::BIT"; + case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; + case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; + case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; + case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; + case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; + case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; + case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; + case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; + case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; + case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; + case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; + case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; + case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; + case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; + case AArch64ISD::WrapperLarge: return 
"AArch64ISD::WrapperLarge"; + case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; + case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; + case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; + case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; + case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; + case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; + case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; + case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; + case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; + case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; + case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; + case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; + case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; + case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; + case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; + case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; + case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; + case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; + case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; + case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; + case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; + case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; + case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; + case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; + case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; + } + return nullptr; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *MBB) const { + // We materialise the F128CSEL pseudo-instruction as some control flow and a + // phi node: + + // OrigBB: + // [... previous instrs leading to comparison ...] + // b.ne TrueBB + // b EndBB + // TrueBB: + // ; Fallthrough + // EndBB: + // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] + + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction::iterator It = ++MBB->getIterator(); + + unsigned DestReg = MI->getOperand(0).getReg(); + unsigned IfTrueReg = MI->getOperand(1).getReg(); + unsigned IfFalseReg = MI->getOperand(2).getReg(); + unsigned CondCode = MI->getOperand(3).getImm(); + bool NZCVKilled = MI->getOperand(4).isKill(); + + MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, TrueBB); + MF->insert(It, EndBB); + + // Transfer rest of current basic-block to EndBB + EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + EndBB->transferSuccessorsAndUpdatePHIs(MBB); + + BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); + BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + MBB->addSuccessor(TrueBB); + MBB->addSuccessor(EndBB); + + // TrueBB falls through to the end. 
+ TrueBB->addSuccessor(EndBB); + + if (!NZCVKilled) { + TrueBB->addLiveIn(AArch64::NZCV); + EndBB->addLiveIn(AArch64::NZCV); + } + + BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) + .addReg(IfTrueReg) + .addMBB(TrueBB) + .addReg(IfFalseReg) + .addMBB(MBB); + + MI->eraseFromParent(); + return EndBB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + switch (MI->getOpcode()) { + default: +#ifndef NDEBUG + MI->dump(); +#endif + llvm_unreachable("Unexpected instruction for custom inserter!"); + + case AArch64::F128CSEL: + return EmitF128CSEL(MI, BB); + + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return emitPatchPoint(MI, BB); + } +} + +//===----------------------------------------------------------------------===// +// AArch64 Lowering private implementation. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Lowering Code +//===----------------------------------------------------------------------===// + +/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 +/// CC +static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { + switch (CC) { + default: + llvm_unreachable("Unknown condition code!"); + case ISD::SETNE: + return AArch64CC::NE; + case ISD::SETEQ: + return AArch64CC::EQ; + case ISD::SETGT: + return AArch64CC::GT; + case ISD::SETGE: + return AArch64CC::GE; + case ISD::SETLT: + return AArch64CC::LT; + case ISD::SETLE: + return AArch64CC::LE; + case ISD::SETUGT: + return AArch64CC::HI; + case ISD::SETUGE: + return AArch64CC::HS; + case ISD::SETULT: + return AArch64CC::LO; + case ISD::SETULE: + return AArch64CC::LS; + } +} + +/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. +static void changeFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: + CondCode = AArch64CC::EQ; + break; + case ISD::SETGT: + case ISD::SETOGT: + CondCode = AArch64CC::GT; + break; + case ISD::SETGE: + case ISD::SETOGE: + CondCode = AArch64CC::GE; + break; + case ISD::SETOLT: + CondCode = AArch64CC::MI; + break; + case ISD::SETOLE: + CondCode = AArch64CC::LS; + break; + case ISD::SETONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case ISD::SETO: + CondCode = AArch64CC::VC; + break; + case ISD::SETUO: + CondCode = AArch64CC::VS; + break; + case ISD::SETUEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case ISD::SETUGT: + CondCode = AArch64CC::HI; + break; + case ISD::SETUGE: + CondCode = AArch64CC::PL; + break; + case ISD::SETLT: + case ISD::SETULT: + CondCode = AArch64CC::LT; + break; + case ISD::SETLE: + case ISD::SETULE: + CondCode = AArch64CC::LE; + break; + case ISD::SETNE: + case ISD::SETUNE: + CondCode = AArch64CC::NE; + break; + } +} + +/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 +/// CC usable with the vector instructions. Fewer operations are available +/// without a real NZCV register, so we have to use less efficient combinations +/// to get the same effect. 
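+/// As an informal example: SETUO has no direct compare-mask form, so it is
+/// handled as the inverse of SETO, which itself needs two compares (MI and GE)
+/// combined; see the switch below.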
+static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2, + bool &Invert) { + Invert = false; + switch (CC) { + default: + // Mostly the scalar mappings work fine. + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + break; + case ISD::SETUO: + Invert = true; // Fallthrough + case ISD::SETO: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GE; + break; + case ISD::SETUEQ: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETUGT: + case ISD::SETUGE: + // All of the compare-mask comparisons are ordered, but we can switch + // between the two by a double inversion. E.g. ULE == !OGT. + Invert = true; + changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); + break; + } +} + +static bool isLegalArithImmed(uint64_t C) { + // Matches AArch64DAGToDAGISel::SelectArithImmed(). + return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); +} + +static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) { + EVT VT = LHS.getValueType(); + + if (VT.isFloatingPoint()) + return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + + // The CMP instruction is just an alias for SUBS, and representing it as + // SUBS means that it's possible to get CSE with subtract operations. + // A later phase can perform the optimization of setting the destination + // register to WZR/XZR if it ends up being unused. + unsigned Opcode = AArch64ISD::SUBS; + + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on + // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags + // can be set differently by this operation. It comes down to whether + // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then + // everything is fine. If not then the optimization is wrong. Thus general + // comparisons are only valid if op2 != 0. + + // So, finally, the only LLVM-native comparisons that don't mention C and V + // are SETEQ and SETNE. They're the only ones we can safely use CMN for in + // the absence of information about op2. + Opcode = AArch64ISD::ADDS; + RHS = RHS.getOperand(1); + } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && + !isUnsignedIntSetCC(CC)) { + // Similarly, (CMP (and X, Y), 0) can be implemented with a TST + // (a.k.a. ANDS) except that the flags are only guaranteed to work for one + // of the signed comparisons. + Opcode = AArch64ISD::ANDS; + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + } + + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) + .getValue(1); +} + +/// \defgroup AArch64CCMP CMP;CCMP matching +/// +/// These functions deal with the formation of CMP;CCMP;... sequences. +/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of +/// a comparison. They set the NZCV flags to a predefined value if their +/// predicate is false. This allows to express arbitrary conjunctions, for +/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))" +/// expressed as: +/// cmp A +/// ccmp B, inv(CB), CA +/// check for CB flags +/// +/// In general we can create code for arbitrary "... (and (and A B) C)" +/// sequences. 
We can also implement some "or" expressions, because "(or A B)"
+/// is equivalent to "not (and (not A) (not B))" and we can implement some
+/// negation operations:
+/// We can negate the results of a single comparison by inverting the flags
+/// used when the predicate fails and inverting the flags tested in the next
+/// instruction; We can also negate the results of the whole previous
+/// conditional compare sequence by inverting the flags tested in the next
+/// instruction. However there is no way to negate the result of a partial
+/// sequence.
+///
+/// Therefore on encountering an "or" expression we can negate the subtree on
+/// one side and have to be able to push the negate to the leaves of the subtree
+/// on the other side (see also the comments in code). As a complete example:
+///     "or (or (setCA (cmp A)) (setCB (cmp B)))
+///         (and (setCC (cmp C)) (setCD (cmp D)))"
+/// is transformed to
+///     "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
+///               (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
+/// and implemented as:
+///     cmp C
+///     ccmp D, inv(CD), CC
+///     ccmp A, CA, inv(CD)
+///     ccmp B, CB, inv(CA)
+///     check for CB flags
+/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
+/// by conditional compare sequences.
+/// @{
+
+/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
+static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
+                                         ISD::CondCode CC, SDValue CCOp,
+                                         SDValue Condition, unsigned NZCV,
+                                         SDLoc DL, SelectionDAG &DAG) {
+  unsigned Opcode = 0;
+  if (LHS.getValueType().isFloatingPoint())
+    Opcode = AArch64ISD::FCCMP;
+  else if (RHS.getOpcode() == ISD::SUB) {
+    SDValue SubOp0 = RHS.getOperand(0);
+    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+      // See emitComparison() on why we can only do this for SETEQ and SETNE.
+      Opcode = AArch64ISD::CCMN;
+      RHS = RHS.getOperand(1);
+    }
+  }
+  if (Opcode == 0)
+    Opcode = AArch64ISD::CCMP;
+
+  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+}
+
+/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
+/// CanPushNegate is set to true if we can push a negate operation through
+/// the tree in a way that we are left with AND operations and negate operations
+/// at the leaves only. i.e. "not (or (or x y) z)" can be changed to
+/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
+/// brought into such a form.
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
+                                         unsigned Depth = 0) {
+  if (!Val.hasOneUse())
+    return false;
+  unsigned Opcode = Val->getOpcode();
+  if (Opcode == ISD::SETCC) {
+    CanPushNegate = true;
+    return true;
+  }
+  // Protect against stack overflow.
+  if (Depth > 15)
+    return false;
+  if (Opcode == ISD::AND || Opcode == ISD::OR) {
+    SDValue O0 = Val->getOperand(0);
+    SDValue O1 = Val->getOperand(1);
+    bool CanPushNegateL;
+    if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
+      return false;
+    bool CanPushNegateR;
+    if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
+      return false;
+    // We cannot push a negate through an AND operation (it would become an OR),
+    // we can however change a (not (or x y)) to (and (not x) (not y)) if we can
+    // push the negate through the x/y subtrees.
+    CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
+    return true;
+  }
+  return false;
+}
+
+/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
+/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
+/// Tries to transform the given i1 producing node @p Val to a series of compare
+/// and conditional compare operations. @returns an NZCV flags producing node
+/// and sets @p OutCC to the flags that should be tested, or returns SDValue()
+/// if the transformation was not possible.
+/// On recursive invocations @p PushNegate may be set to true to have negation
+/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
+/// for the comparisons in the current subtree; @p Depth limits the search
+/// depth to avoid stack overflow.
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+    AArch64CC::CondCode &OutCC, bool PushNegate = false,
+    SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
+    unsigned Depth = 0) {
+  // We're at a tree leaf, produce a conditional comparison operation.
+  unsigned Opcode = Val->getOpcode();
+  if (Opcode == ISD::SETCC) {
+    SDValue LHS = Val->getOperand(0);
+    SDValue RHS = Val->getOperand(1);
+    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
+    bool isInteger = LHS.getValueType().isInteger();
+    if (PushNegate)
+      CC = getSetCCInverse(CC, isInteger);
+    SDLoc DL(Val);
+    // Determine OutCC and handle FP special case.
+    if (isInteger) {
+      OutCC = changeIntCCToAArch64CC(CC);
+    } else {
+      assert(LHS.getValueType().isFloatingPoint());
+      AArch64CC::CondCode ExtraCC;
+      changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
+      // Surprisingly some floating point conditions can't be tested with a
+      // single condition code. Construct an additional comparison in this case.
+      // See comment below on how we deal with OR conditions.
+      if (ExtraCC != AArch64CC::AL) {
+        SDValue ExtraCmp;
+        if (!CCOp.getNode())
+          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
+        else {
+          SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
+          // Note that we want the inverse of ExtraCC, so NZCV is not inverted.
+          unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
+          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
+                                               NZCV, DL, DAG);
+        }
+        CCOp = ExtraCmp;
+        Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
+        OutCC = AArch64CC::getInvertedCondCode(OutCC);
+      }
+    }
+
+    // Produce a normal comparison if we are first in the chain
+    if (!CCOp.getNode())
+      return emitComparison(LHS, RHS, CC, DL, DAG);
+    // Otherwise produce a ccmp.
+    SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
+    AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+    unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
+    return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+                                     DAG);
+  } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse())
+    return SDValue();
+
+  assert((Opcode == ISD::OR || !PushNegate)
+         && "Can only push negate through OR operation");
+
+  // Check if both sides can be transformed.
+  SDValue LHS = Val->getOperand(0);
+  SDValue RHS = Val->getOperand(1);
+  bool CanPushNegateL;
+  if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
+    return SDValue();
+  bool CanPushNegateR;
+  if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
+    return SDValue();
+
+  // Do we need to negate our operands?
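+  // Informal summary of the code below: a top-level OR is handled via
+  // "or A B" == "not (and (not A) (not B))", i.e. the RHS chain is emitted and
+  // its condition inverted for use as the ccmp predicate, the LHS is emitted
+  // with the negate pushed to its leaves, and the flags to test are inverted
+  // at the end.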
+ bool NegateOperands = Opcode == ISD::OR; + // We can negate the results of all previous operations by inverting the + // predicate flags giving us a free negation for one side. For the other side + // we need to be able to push the negation to the leafs of the tree. + if (NegateOperands) { + if (!CanPushNegateL && !CanPushNegateR) + return SDValue(); + // Order the side where we can push the negate through to LHS. + if (!CanPushNegateL && CanPushNegateR) + std::swap(LHS, RHS); + } else { + bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; + bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return SDValue(); + // Order the side where we need to negate the output flags to RHS so it + // gets emitted first. + if (NeedsNegOutL) + std::swap(LHS, RHS); + } + + // Emit RHS. If we want to negate the tree we only need to push a negate + // through if we are already in a PushNegate case, otherwise we can negate + // the "flags to test" afterwards. + AArch64CC::CondCode RHSCC; + SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, + CCOp, Predicate, Depth+1); + if (NegateOperands && !PushNegate) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + // Emit LHS. We must push the negate through if we need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, + CmpR, RHSCC, Depth+1); + // If we transformed an OR to and AND then we have to negate the result + // (or absorb a PushNegate resulting in a double negation). + if (Opcode == ISD::OR && !PushNegate) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +/// @} + +static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { + EVT VT = RHS.getValueType(); + uint64_t C = RHSC->getZExtValue(); + if (!isLegalArithImmed(C)) { + // Constant does not fit, try adjusting it by one? + switch (CC) { + default: + break; + case ISD::SETLT: + case ISD::SETGE: + if ((VT == MVT::i32 && C != 0x80000000 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0x80000000ULL && + isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, dl, VT); + } + break; + case ISD::SETULT: + case ISD::SETUGE: + if ((VT == MVT::i32 && C != 0 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, dl, VT); + } + break; + case ISD::SETLE: + case ISD::SETGT: + if ((VT == MVT::i32 && C != INT32_MAX && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != INT64_MAX && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, dl, VT); + } + break; + case ISD::SETULE: + case ISD::SETUGT: + if ((VT == MVT::i32 && C != UINT32_MAX && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != UINT64_MAX && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + C = (VT == MVT::i32) ? 
(uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, dl, VT); + } + break; + } + } + } + SDValue Cmp; + AArch64CC::CondCode AArch64CC; + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { + const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); + + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the + // -1 constant. For example, + // movz w1, #65535 + // ldrh w0, [x0, #0] + // cmp w0, w1 + // > + // ldrsh w0, [x0, #0] + // cmn w0, #1 + // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) + // if and only if (sext LHS) == (sext RHS). The checks are in place to + // ensure both the LHS and RHS are truly zero extended and to make sure the + // transformation is profitable. + if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && + cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && + cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && + LHS.getNode()->hasNUsesOfValue(1, 0)) { + int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, + DAG.getValueType(MVT::i16)); + Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, + RHS.getValueType()), + CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + } + } + + if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { + if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { + if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) + AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); + } + } + } + + if (!Cmp) { + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + } + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); + return Cmp; +} + +static std::pair<SDValue, SDValue> +getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { + assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && + "Unsupported value type"); + SDValue Value, Overflow; + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + unsigned Opc = 0; + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown overflow instruction!"); + case ISD::SADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::VS; + break; + case ISD::UADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::HS; + break; + case ISD::SSUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::VS; + break; + case ISD::USUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::LO; + break; + // Multiply needs a little bit extra work. + case ISD::SMULO: + case ISD::UMULO: { + CC = AArch64CC::NE; + bool IsSigned = Op.getOpcode() == ISD::SMULO; + if (Op.getValueType() == MVT::i32) { + unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + // For a 32 bit multiply with overflow check we want the instruction + // selector to generate a widening multiply (SMADDL/UMADDL). 
For that we + // need to generate the following pattern: + // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) + LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); + RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); + SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, + DAG.getConstant(0, DL, MVT::i64)); + // On AArch64 the upper 32 bits are always zero extended for a 32 bit + // operation. We need to clear out the upper 32 bits, because we used a + // widening multiply that wrote all 64 bits. In the end this should be a + // noop. + Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); + if (IsSigned) { + // The signed overflow check requires more than just a simple check for + // any bit set in the upper 32 bits of the result. These bits could be + // just the sign bits of a negative number. To perform the overflow + // check we have to arithmetic shift right the 32nd bit of the result by + // 31 bits. Then we compare the result to the upper 32 bits. + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, + DAG.getConstant(32, DL, MVT::i64)); + UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, + DAG.getConstant(31, DL, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + // The overflow check for unsigned multiply is easy. We only need to + // check if any of the upper 32 bits are set. This can be done with a + // CMP (shifted register). For that we need to generate the following + // pattern: + // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, + DAG.getConstant(32, DL, MVT::i64)); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, + DAG.getConstant(0, DL, MVT::i64), + UpperBits).getValue(1); + } + break; + } + assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); + // For the 64 bit multiply + Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + if (IsSigned) { + SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, + DAG.getConstant(63, DL, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, + DAG.getConstant(0, DL, MVT::i64), + UpperBits).getValue(1); + } + break; + } + } // switch (...) + + if (Opc) { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); + + // Emit the AArch64 operation with overflow check. 
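+    // e.g. for i32 SADDO this emits (ADDS a, b) and, per the switch above,
+    // the overflow result is read from the V flag (condition VS). Sketch only.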
+ Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); + Overflow = Value.getValue(1); + } + return std::make_pair(Value, Overflow); +} + +SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); + return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; +} + +static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { + SDValue Sel = Op.getOperand(0); + SDValue Other = Op.getOperand(1); + + // If neither operand is a SELECT_CC, give up. + if (Sel.getOpcode() != ISD::SELECT_CC) + std::swap(Sel, Other); + if (Sel.getOpcode() != ISD::SELECT_CC) + return Op; + + // The folding we want to perform is: + // (xor x, (select_cc a, b, cc, 0, -1) ) + // --> + // (csel x, (xor x, -1), cc ...) + // + // The latter will get matched to a CSINV instruction. + + ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); + SDValue LHS = Sel.getOperand(0); + SDValue RHS = Sel.getOperand(1); + SDValue TVal = Sel.getOperand(2); + SDValue FVal = Sel.getOperand(3); + SDLoc dl(Sel); + + // FIXME: This could be generalized to non-integer comparisons. + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return Op; + + ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); + ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); + + // The values aren't constants, this isn't the pattern we're looking for. + if (!CFVal || !CTVal) + return Op; + + // We can commute the SELECT_CC by inverting the condition. This + // might be needed to make this fit into a CSINV pattern. + if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + + // If the constants line up, perform the transform! + if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + + FVal = Other; + TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, + DAG.getConstant(-1ULL, dl, Other.getValueType())); + + return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, + CCVal, Cmp); + } + + return Op; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: + llvm_unreachable("Invalid code"); + case ISD::ADDC: + Opc = AArch64ISD::ADDS; + break; + case ISD::SUBC: + Opc = AArch64ISD::SUBS; + break; + case ISD::ADDE: + Opc = AArch64ISD::ADCS; + ExtraOp = true; + break; + case ISD::SUBE: + Opc = AArch64ISD::SBCS; + ExtraOp = true; + break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), + Op.getOperand(2)); +} + +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) + return SDValue(); + + SDLoc dl(Op); + AArch64CC::CondCode CC; + // The actual operation that sets the overflow or carry flag. + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); + + // We use 0 and 1 as false and true values. 
+ SDValue TVal = DAG.getConstant(1, dl, MVT::i32); + SDValue FVal = DAG.getConstant(0, dl, MVT::i32); + + // We use an inverted condition, because the conditional select is inverted + // too. This will allow it to be selected to a single instruction: + // CSINC Wd, WZR, WZR, invert(cond). + SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, + CCVal, Overflow); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); +} + +// Prefetch operands are: +// 1: Address to prefetch +// 2: bool isWrite +// 3: int locality (0 = no locality ... 3 = extreme locality) +// 4: bool isDataCache +static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); + + bool IsStream = !Locality; + // When the locality number is set + if (Locality) { + // The front-end should have filtered out the out-of-range values + assert(Locality <= 3 && "Prefetch locality out-of-range"); + // The locality degree is the opposite of the cache speed. + // Put the number the other way around. + // The encoding starts at 0 for level 1 + Locality = 3 - Locality; + } + + // built the mask value encoding the expected behavior. + unsigned PrfOp = (IsWrite << 4) | // Load/Store bit + (!IsData << 3) | // IsDataCache bit + (Locality << 1) | // Cache level bits + (unsigned)IsStream; // Stream bit + return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), + DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); + + RTLIB::Libcall LC; + LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128Call(Op, DAG, LC); +} + +SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + + RTLIB::Libcall LC; + LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + + // FP_ROUND node has a second operand indicating whether it is known to be + // precise. That doesn't take part in the LibCall so we can't directly use + // LowerF128Call. + SDValue SrcVal = Op.getOperand(0); + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; +} + +static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT InVT = Op.getOperand(0).getValueType(); + EVT VT = Op.getValueType(); + unsigned NumElts = InVT.getVectorNumElements(); + + // f16 vectors are promoted to f32 before a conversion. 
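+  // e.g. (v4i16 (fp_to_sint (v4f16 x))) is rewritten to convert from
+  // (v4f32 (fp_extend x)) instead (sketch of the code below).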
+ if (InVT.getVectorElementType() == MVT::f16) { + MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); + } + + if (VT.getSizeInBits() < InVT.getSizeInBits()) { + SDLoc dl(Op); + SDValue Cv = + DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), + Op.getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); + } + + if (VT.getSizeInBits() > InVT.getSizeInBits()) { + SDLoc dl(Op); + MVT ExtVT = + MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), + VT.getVectorNumElements()); + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), dl, VT, Ext); + } + + // Type changing conversions are illegal. + return Op; +} + +SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType().isVector()) + return LowerVectorFP_TO_INT(Op, DAG); + + // f16 conversions are promoted to f32. + if (Op.getOperand(0).getValueType() == MVT::f16) { + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); + } + + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); + return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; +} + +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + EVT InVT = In.getValueType(); + + if (VT.getSizeInBits() < InVT.getSizeInBits()) { + MVT CastVT = + MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), + InVT.getVectorNumElements()); + In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); + return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); + } + + if (VT.getSizeInBits() > InVT.getSizeInBits()) { + unsigned CastOpc = + Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + EVT CastVT = VT.changeVectorElementTypeToInteger(); + In = DAG.getNode(CastOpc, dl, CastVT, In); + return DAG.getNode(Op.getOpcode(), dl, VT, In); + } + + return Op; +} + +SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType().isVector()) + return LowerVectorINT_TO_FP(Op, DAG); + + // f16 conversions are promoted to f32. + if (Op.getValueType() == MVT::f16) { + SDLoc dl(Op); + return DAG.getNode( + ISD::FP_ROUND, dl, MVT::f16, + DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), + DAG.getIntPtrConstant(0, dl)); + } + + // i128 conversions are libcalls. + if (Op.getOperand(0).getValueType() == MVT::i128) + return SDValue(); + + // Other conversions are legal, unless it's to the completely software-based + // fp128. 
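+ // As a concrete example, sitofp i64 -> f128 cannot be done in registers and
+ // is lowered below into a call to the soft-float support routine (typically
+ // __floatditf on AArch64 toolchains).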
+ if (Op.getValueType() != MVT::f128) + return Op; + + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::SINT_TO_FP) + LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128Call(Op, DAG, LC); +} + +SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, + SelectionDAG &DAG) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + const char *LibcallName = + (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + SDValue Callee = + DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); + + StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); + + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + return CallResult.first; +} + +static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() != MVT::f16) + return SDValue(); + + assert(Op.getOperand(0).getValueType() == MVT::i16); + SDLoc DL(Op); + + Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); + Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); + return SDValue( + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, + DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), + 0); +} + +static EVT getExtensionTo64Bits(const EVT &OrigVT) { + if (OrigVT.getSizeInBits() >= 64) + return OrigVT; + + assert(OrigVT.isSimple() && "Expecting a simple value type"); + + MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; + switch (OrigSimpleTy) { + default: llvm_unreachable("Unexpected Vector Type"); + case MVT::v2i8: + case MVT::v2i16: + return MVT::v2i32; + case MVT::v4i8: + return MVT::v4i16; + } +} + +static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, + const EVT &OrigTy, + const EVT &ExtTy, + unsigned ExtOpcode) { + // The vector originally had a size of OrigTy. It was then extended to ExtTy. + // We expect the ExtTy to be 128-bits total. If the OrigTy is less than + // 64-bits we need to insert a new extension so that it will be 64-bits. + assert(ExtTy.is128BitVector() && "Unexpected extension size"); + if (OrigTy.getSizeInBits() >= 64) + return N; + + // Must extend size to at least 64 bits to be used as an operand for VMULL. 
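+ // For example, a v4i8 operand is re-extended here to v4i16 (a 64-bit
+ // vector), so the SMULL/UMULL formed later sees a 64-bit source and
+ // produces a 128-bit result.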
+ EVT NewVT = getExtensionTo64Bits(OrigTy); + + return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); +} + +static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, + bool isSigned) { + EVT VT = N->getValueType(0); + + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + for (const SDValue &Elt : N->op_values()) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + unsigned HalfSize = EltSize / 2; + if (isSigned) { + if (!isIntN(HalfSize, C->getSExtValue())) + return false; + } else { + if (!isUIntN(HalfSize, C->getZExtValue())) + return false; + } + continue; + } + return false; + } + + return true; +} + +static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) + return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, + N->getOperand(0)->getValueType(0), + N->getValueType(0), + N->getOpcode()); + + assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); + EVT VT = N->getValueType(0); + SDLoc dl(N); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; + unsigned NumElts = VT.getVectorNumElements(); + MVT TruncVT = MVT::getIntegerVT(EltSize); + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i != NumElts; ++i) { + ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); + const APInt &CInt = C->getAPIntValue(); + // Element types smaller than 32 bits are not legal, so use i32 elements. + // The values are implicitly truncated so sext vs. zext doesn't matter. + Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); + } + return DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::getVectorVT(TruncVT, NumElts), Ops); +} + +static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, true)) + return true; + return false; +} + +static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::ZERO_EXTEND) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, false)) + return true; + return false; +} + +static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isSignExtended(N0, DAG) && isSignExtended(N1, DAG); + } + return false; +} + +static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); + } + return false; +} + +static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { + // Multiplications are only custom-lowered for 128-bit vectors so that + // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
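+ // For example, (mul (sext v2i32), (sext v2i32)) producing v2i64 is matched
+ // here and emitted as a single SMULL, whereas a plain v2i64 multiply has no
+ // AArch64 instruction and has to be expanded.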
+ EVT VT = Op.getValueType(); + assert(VT.is128BitVector() && VT.isInteger() && + "unexpected type for custom-lowering ISD::MUL"); + SDNode *N0 = Op.getOperand(0).getNode(); + SDNode *N1 = Op.getOperand(1).getNode(); + unsigned NewOpc = 0; + bool isMLA = false; + bool isN0SExt = isSignExtended(N0, DAG); + bool isN1SExt = isSignExtended(N1, DAG); + if (isN0SExt && isN1SExt) + NewOpc = AArch64ISD::SMULL; + else { + bool isN0ZExt = isZeroExtended(N0, DAG); + bool isN1ZExt = isZeroExtended(N1, DAG); + if (isN0ZExt && isN1ZExt) + NewOpc = AArch64ISD::UMULL; + else if (isN1SExt || isN1ZExt) { + // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these + // into (s/zext A * s/zext C) + (s/zext B * s/zext C) + if (isN1SExt && isAddSubSExt(N0, DAG)) { + NewOpc = AArch64ISD::SMULL; + isMLA = true; + } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { + NewOpc = AArch64ISD::UMULL; + isMLA = true; + } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { + std::swap(N0, N1); + NewOpc = AArch64ISD::UMULL; + isMLA = true; + } + } + + if (!NewOpc) { + if (VT == MVT::v2i64) + // Fall through to expand this. It is not legal. + return SDValue(); + else + // Other vector multiplications are legal. + return Op; + } + } + + // Legalize to a S/UMULL instruction + SDLoc DL(Op); + SDValue Op0; + SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); + if (!isMLA) { + Op0 = skipExtensionForVectorMULL(N0, DAG); + assert(Op0.getValueType().is64BitVector() && + Op1.getValueType().is64BitVector() && + "unexpected types for extended operands to VMULL"); + return DAG.getNode(NewOpc, DL, VT, Op0, Op1); + } + // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during + // isel lowering to take advantage of no-stall back to back s/umul + s/umla. + // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 + SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); + SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); + EVT Op1VT = Op1.getValueType(); + return DAG.getNode(N0->getOpcode(), DL, VT, + DAG.getNode(NewOpc, DL, VT, + DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), + DAG.getNode(NewOpc, DL, VT, + DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); +} + +SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. 
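+ // The cases below simply remap a few target intrinsics onto generic ISD
+ // nodes (e.g. aarch64.neon.smax becomes ISD::SMAX) so the common handling
+ // for those nodes can be reused.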
+ case Intrinsic::aarch64_thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::aarch64_neon_smax: + return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umax: + return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_smin: + return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umin: + return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } +} + +SDValue AArch64TargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented operand"); + return SDValue(); + case ISD::BITCAST: + return LowerBITCAST(Op, DAG); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: + return LowerGlobalTLSAddress(Op, DAG); + case ISD::SETCC: + return LowerSETCC(Op, DAG); + case ISD::BR_CC: + return LowerBR_CC(Op, DAG); + case ISD::SELECT: + return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: + return LowerSELECT_CC(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::ConstantPool: + return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: + return LowerBlockAddress(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + case ISD::VACOPY: + return LowerVACOPY(Op, DAG); + case ISD::VAARG: + return LowerVAARG(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: + return LowerXALUO(Op, DAG); + case ISD::FADD: + return LowerF128Call(Op, DAG, RTLIB::ADD_F128); + case ISD::FSUB: + return LowerF128Call(Op, DAG, RTLIB::SUB_F128); + case ISD::FMUL: + return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FDIV: + return LowerF128Call(Op, DAG, RTLIB::DIV_F128); + case ISD::FP_ROUND: + return LowerFP_ROUND(Op, DAG); + case ISD::FP_EXTEND: + return LowerFP_EXTEND(Op, DAG); + case ISD::FRAMEADDR: + return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: + return LowerRETURNADDR(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: + return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: + return LowerVectorSRA_SRL_SHL(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRL_PARTS: + case ISD::SRA_PARTS: + return LowerShiftRightParts(Op, DAG); + case ISD::CTPOP: + return LowerCTPOP(Op, DAG); + case ISD::FCOPYSIGN: + return LowerFCOPYSIGN(Op, DAG); + case ISD::AND: + return LowerVectorAND(Op, DAG); + case ISD::OR: + return LowerVectorOR(Op, DAG); + case ISD::XOR: + return LowerXOR(Op, DAG); + case ISD::PREFETCH: + return LowerPREFETCH(Op, DAG); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return LowerINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); + case ISD::FSINCOS: + return LowerFSINCOS(Op, DAG); + case ISD::MUL: + return LowerMUL(Op, DAG); + case 
ISD::INTRINSIC_WO_CHAIN: + return LowerINTRINSIC_WO_CHAIN(Op, DAG); + } +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "AArch64GenCallingConv.inc" + +/// Selects the correct CCAssignFn for a given CallingConvention value. +CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention."); + case CallingConv::WebKit_JS: + return CC_AArch64_WebKit_JS; + case CallingConv::GHC: + return CC_AArch64_GHC; + case CallingConv::C: + case CallingConv::Fast: + if (!Subtarget->isTargetDarwin()) + return CC_AArch64_AAPCS; + return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + } +} + +SDValue AArch64TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Assign locations to all of the incoming arguments. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + // At this point, Ins[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. + // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeFormalArguments to pass in ValVT and + // LocVT. + unsigned NumArgs = Ins.size(); + Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Ins[i].VT; + if (Ins[i].isOrigArg()) { + std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); + CurArgIdx = Ins[i].getOrigArgIndex(); + + // Get type of the original argument. + EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), + /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. + if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + ValVT = MVT::i8; + else if (ActualMVT == MVT::i16) + ValVT = MVT::i16; + } + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + assert(ArgLocs.size() == Ins.size()); + SmallVector<SDValue, 16> ArgValues; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + if (Ins[i].Flags.isByVal()) { + // Byval is used for HFAs in the PCS, but the system should work in a + // non-compliant manner for larger structs. + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + int Size = Ins[i].Flags.getByValSize(); + unsigned NumRegs = (Size + 7) / 8; + + // FIXME: This works on big-endian for composite byvals, which are the common + // case. It should also work for fundamental types too. 
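+ // For illustration: a 20-byte byval argument gives NumRegs = (20 + 7) / 8
+ // = 3, so a 24-byte fixed stack object is created below and its address is
+ // handed to the function as the incoming argument value.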
+ unsigned FrameIdx = + MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); + InVals.push_back(FrameIdxN); + + continue; + } + + if (VA.isRegLoc()) { + // Arguments stored in registers. + EVT RegVT = VA.getLocVT(); + + SDValue ArgValue; + const TargetRegisterClass *RC; + + if (RegVT == MVT::i32) + RC = &AArch64::GPR32RegClass; + else if (RegVT == MVT::i64) + RC = &AArch64::GPR64RegClass; + else if (RegVT == MVT::f16) + RC = &AArch64::FPR16RegClass; + else if (RegVT == MVT::f32) + RC = &AArch64::FPR32RegClass; + else if (RegVT == MVT::f64 || RegVT.is64BitVector()) + RC = &AArch64::FPR64RegClass; + else if (RegVT == MVT::f128 || RegVT.is128BitVector()) + RC = &AArch64::FPR128RegClass; + else + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); + + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); + + // If this is an 8, 16 or 32-bit value, it is really passed promoted + // to 64 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); + break; + case CCValAssign::AExt: + case CCValAssign::SExt: + case CCValAssign::ZExt: + // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt + // nodes after our lowering. + assert(RegVT == Ins[i].VT && "incorrect register location selected"); + break; + } + + InVals.push_back(ArgValue); + + } else { // VA.isRegLoc() + assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; + + uint32_t BEAlign = 0; + if (!Subtarget->isLittleEndian() && ArgSize < 8 && + !Ins[i].Flags.isInConsecutiveRegs()) + BEAlign = 8 - ArgSize; + + int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); + + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + SDValue ArgValue; + + // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) + ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; + MVT MemVT = VA.getValVT(); + + switch (VA.getLocInfo()) { + default: + break; + case CCValAssign::BCvt: + MemVT = VA.getLocVT(); + break; + case CCValAssign::SExt: + ExtType = ISD::SEXTLOAD; + break; + case CCValAssign::ZExt: + ExtType = ISD::ZEXTLOAD; + break; + case CCValAssign::AExt: + ExtType = ISD::EXTLOAD; + break; + } + + ArgValue = DAG.getExtLoad( + ExtType, DL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT, false, false, false, 0); + + InVals.push_back(ArgValue); + } + } + + // varargs + if (isVarArg) { + if (!Subtarget->isTargetDarwin()) { + // The AAPCS variadic function ABI is identical to the non-variadic + // one. As a result there may be more arguments in registers and we should + // save them for future reference. + saveVarArgRegisters(CCInfo, DAG, DL, Chain); + } + + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + // This will point to the next argument passed via stack. + unsigned StackOffset = CCInfo.getNextStackOffset(); + // We currently pass all varargs at 8-byte alignment. 
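+ // (e.g. a next-stack-offset of 12 is rounded up to 16 by the mask below.)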
+ StackOffset = ((StackOffset + 7) & ~7); + AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); + } + + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + unsigned StackArgSize = CCInfo.getNextStackOffset(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { + // This is a non-standard ABI so by fiat I say we're allowed to make full + // use of the stack area to be popped, which must be aligned to 16 bytes in + // any case: + StackArgSize = RoundUpToAlignment(StackArgSize, 16); + + // If we're expected to restore the stack (e.g. fastcc) then we'll be adding + // a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackArgSize); + + // This realignment carries over to the available bytes below. Our own + // callers will guarantee the space is free by giving an aligned value to + // CALLSEQ_START. + } + // Even if we're not expected to free up the space, it's useful to know how + // much is there while considering tail calls (because we can reuse it). + FuncInfo->setBytesInStackArgArea(StackArgSize); + + return Chain; +} + +void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, + SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + SmallVector<SDValue, 8> MemOps; + + static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7 }; + static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); + unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); + + unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); + int GPRIdx = 0; + if (GPRSaveSize != 0) { + GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); + + SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); + + for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, + false, 0); + MemOps.push_back(Store); + FIN = + DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); + } + } + FuncInfo->setVarArgsGPRIndex(GPRIdx); + FuncInfo->setVarArgsGPRSize(GPRSaveSize); + + if (Subtarget->hasFPARMv8()) { + static const MCPhysReg FPRArgRegs[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, + AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; + static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); + unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); + + unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); + int FPRIdx = 0; + if (FPRSaveSize != 0) { + FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); + + SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); + + for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); + + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, 
PtrVT, FIN, + DAG.getConstant(16, DL, PtrVT)); + } + } + FuncInfo->setVarArgsFPRIndex(FPRIdx); + FuncInfo->setVarArgsFPRSize(FPRSaveSize); + } + + if (!MemOps.empty()) { + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); + } +} + +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue AArch64TargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, bool isThisReturn, + SDValue ThisVal) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + // Pass 'this' value directly from the argument to return value, to avoid + // reg unit interference + if (i == 0 && isThisReturn) { + assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && + "unexpected return calling convention register assignment"); + InVals.push_back(ThisVal); + continue; + } + + SDValue Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); + } + + return Chain; +} + +bool AArch64TargetLowering::isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + // For CallingConv::C this function knows whether the ABI needs + // changing. That's not true for other conventions so they will have to opt in + // manually. + if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + return false; + + const MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = MF.getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible (see + // X86) but less efficient and uglier in LowerCall. + for (Function::const_arg_iterator i = CallerF->arg_begin(), + e = CallerF->arg_end(); + i != e; ++i) + if (i->hasByValAttr()) + return false; + + if (getTargetMachine().Options.GuaranteedTailCallOpt) { + if (IsTailCallConvention(CalleeCC) && CCMatch) + return true; + return false; + } + + // Externally-defined functions with weak linkage should not be + // tail-called on AArch64 when the OS does not support dynamic + // pre-emption of symbols, as the AAELF spec requires normal calls + // to undefined weak functions to be replaced with a NOP or jump to the + // next instruction. 
The behaviour of branch instructions in this + // situation (as used for tail calls) is implementation-defined, so we + // cannot rely on the linker replacing the tail call with a return. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + const Triple &TT = getTargetMachine().getTargetTriple(); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) + return false; + } + + // Now we search for cases where we can use a tail call without changing the + // ABI. Sibcall is used in some places (particularly gcc) to refer to this + // concept. + + // I want anyone implementing a new calling convention to think long and hard + // about this assert. + assert((!isVarArg || CalleeCC == CallingConv::C) && + "Unexpected variadic calling convention"); + + if (isVarArg && !Outs.empty()) { + // At least two cases here: if caller is fastcc then we can't have any + // memory arguments (we'd be expected to clean up the stack afterwards). If + // caller is C then we could potentially use its argument area. + + // FIXME: for now we take the most conservative of these in both cases: + // disallow all variadic memory operands. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); + for (const CCValAssign &ArgLoc : ArgLocs) + if (!ArgLoc.isRegLoc()) + return false; + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, + *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); + + SmallVector<CCValAssign, 16> RVLocs2; + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, + *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // Nothing more to check if the callee is taking no arguments + if (Outs.empty()) + return true; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); + + const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + + // If the stack arguments for this call would fit into our own save area then + // the call can be made tail. + return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); +} + +SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, + SelectionDAG &DAG, + MachineFrameInfo *MFI, + int ClobberedFI) const { + SmallVector<SDValue, 8> ArgChains; + int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; + + // Include the original chain at the beginning of the list. 
When this is + // used by target LowerCall hooks, this helps legalize find the + // CALLSEQ_BEGIN node. + ArgChains.push_back(Chain); + + // Add a chain value for each stack argument corresponding + for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), + UE = DAG.getEntryNode().getNode()->use_end(); + U != UE; ++U) + if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) + if (FI->getIndex() < 0) { + int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); + int64_t InLastByte = InFirstByte; + InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; + + if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || + (FirstByte <= InFirstByte && InFirstByte <= LastByte)) + ArgChains.push_back(SDValue(L, 1)); + } + + // Build a tokenfactor for all the chains. + return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); +} + +bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, + bool TailCallOpt) const { + return CallCC == CallingConv::Fast && TailCallOpt; +} + +bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { + return CallCC == CallingConv::Fast; +} + +/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, +/// and add input and output parameter nodes. +SDValue +AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &DL = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsThisReturn = false; + + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + bool IsSibCall = false; + + if (IsTailCall) { + // Check if it's really possible to do a tail call. + IsTailCall = isEligibleForTailCallOptimization( + Callee, CallConv, IsVarArg, IsStructRet, + MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + + // A sibling call is one where we're under the usual C ABI and not planning + // to change that but can still do a tail call: + if (!TailCallOpt && IsTailCall) + IsSibCall = true; + + if (IsTailCall) + ++NumTailCalls; + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + if (IsVarArg) { + // Handle fixed and variable vector arguments differently. + // Variable vector arguments always go into memory. 
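+ // That is, each fixed argument is assigned with the usual IsVarArg=false
+ // rules, while anything passed through the "..." part uses the variadic
+ // assignment function chosen below, which on Darwin sends variadic
+ // arguments to the stack.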
+ unsigned NumArgs = Outs.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, + /*IsVarArg=*/ !Outs[i].IsFixed); + bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } else { + // At this point, Outs[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. + // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeCallOperands to pass in ValVT and + // LocVT. + unsigned NumArgs = Outs.size(); + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Outs[i].VT; + // Get type of the original argument. + EVT ActualVT = getValueType(DAG.getDataLayout(), + CLI.getArgs()[Outs[i].OrigArgIndex].Ty, + /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. + if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + ValVT = MVT::i8; + else if (ActualMVT == MVT::i16) + ValVT = MVT::i16; + + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + if (IsSibCall) { + // Since we're not changing the ABI to make this a tail call, the memory + // operands are already available in the caller's incoming argument space. + NumBytes = 0; + } + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int FPDiff = 0; + + if (IsTailCall && !IsSibCall) { + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + + // Since callee will pop argument stack as a tail call, we must keep the + // popped size 16-byte aligned. + NumBytes = RoundUpToAlignment(NumBytes, 16); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // can actually shrink the stack. + FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + } + + // Adjust the stack pointer for the new arguments... 
+ // These operations are automatically eliminated by the prolog/epilog pass + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, + true), + DL); + + SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, + getPointerTy(DAG.getDataLayout())); + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + if (Outs[realArgIdx].ArgVT == MVT::i1) { + // AAPCS requires i1 to be zero-extended to 8-bits by the caller. + Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); + } + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::FPExt: + Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); + break; + } + + if (VA.isRegLoc()) { + if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { + assert(VA.getLocVT() == MVT::i64 && + "unexpected calling convention register assignment"); + assert(!Ins.empty() && Ins[0].VT == MVT::i64 && + "unexpected use of 'returned'"); + IsThisReturn = true; + } + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); + + SDValue DstAddr; + MachinePointerInfo DstInfo; + + // FIXME: This works on big-endian for composite byvals, which are the + // common case. It should also work for fundamental types too. + uint32_t BEAlign = 0; + unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 + : VA.getValVT().getSizeInBits(); + OpSize = (OpSize + 7) / 8; + if (!Subtarget->isLittleEndian() && !Flags.isByVal() && + !Flags.isInConsecutiveRegs()) { + if (OpSize < 8) + BEAlign = 8 - OpSize; + } + unsigned LocMemOffset = VA.getLocMemOffset(); + int32_t Offset = LocMemOffset + BEAlign; + SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); + PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); + + if (IsTailCall) { + Offset = Offset + FPDiff; + int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + + DstAddr = DAG.getFrameIndex(FI, PtrVT); + DstInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + + // Make sure any stack arguments overlapping with where we're storing + // are loaded before this eventual operation. Otherwise they'll be + // clobbered. 
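+ // For example, if this tail call writes its first stack argument into the
+ // slot that currently holds one of our own incoming stack arguments, any
+ // pending load of that incoming value must be chained in first; that is
+ // what addTokenForArgument arranges below.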
+ Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); + + DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), + LocMemOffset); + } + + if (Outs[i].Flags.isByVal()) { + SDValue SizeNode = + DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); + SDValue Cpy = DAG.getMemcpy( + Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + /*isVol = */ false, /*AlwaysInline = */ false, + /*isTailCall = */ false, + DstInfo, MachinePointerInfo()); + + MemOpChains.push_back(Cpy); + } else { + // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already + // promoted to a legal register type i32, we should truncate Arg back to + // i1/i8/i16. + if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || + VA.getValVT() == MVT::i16) + Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); + + SDValue Store = + DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + MemOpChains.push_back(Store); + } + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (auto &RegToPass : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, + RegToPass.second, InFlag); + InFlag = Chain.getValue(1); + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every + // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol + // node so that legalize doesn't hack it. + if (getTargetMachine().getCodeModel() == CodeModel::Large && + Subtarget->isTargetMachO()) { + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + bool InternalLinkage = GV->hasInternalLinkage(); + if (InternalLinkage) + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); + else { + Callee = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } + } else if (ExternalSymbolSDNode *S = + dyn_cast<ExternalSymbolSDNode>(Callee)) { + const char *Sym = S->getSymbol(); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); + Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); + } + } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + const char *Sym = S->getSymbol(); + Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); + } + + // We don't usually want to end the call-sequence here because we would tidy + // the frame up *after* the call, however in the ABI-changing tail-call case + // we've carefully laid out the parameters so that when sp is reset they'll be + // in the correct location. 
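+ // So for an ABI-changing (non-sibling) tail call the CALLSEQ_END is emitted
+ // now, ahead of the TC_RETURN below, rather than after the call as in the
+ // normal case.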
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+ DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+ }
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (auto &RegToPass : RegsToPass)
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const uint32_t *Mask;
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ if (IsThisReturn) {
+ // For 'this' returns, use the X0-preserving mask if applicable
+ Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
+ if (!Mask) {
+ IsThisReturn = false;
+ Mask = TRI->getCallPreservedMask(MF, CallConv);
+ }
+ } else
+ Mask = TRI->getCallPreservedMask(MF, CallConv);
+
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tail call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall) {
+ MF.getFrameInfo()->setHasTailCall();
+ return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+ }
+
+ // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+ InFlag = Chain.getValue(1);
+
+ uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
+ ? RoundUpToAlignment(NumBytes, 16)
+ : 0;
+
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
+ DAG.getIntPtrConstant(CalleePopBytes, DL, true),
+ InFlag, DL);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
+}
+
+bool AArch64TargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+ CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+ ? RetCC_AArch64_WebKit_JS
+ : RetCC_AArch64_AAPCS;
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC);
+}
+
+SDValue
+AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const {
+ CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+ ? RetCC_AArch64_WebKit_JS
+ : RetCC_AArch64_AAPCS;
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC);
+
+ // Copy the result values into the output registers.
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+ for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue Arg = OutVals[realRVLocIdx];
+
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ if (Outs[i].ArgVT == MVT::i1) {
+ // AAPCS requires i1 to be zero-extended to i8 by the producer of the
+ // value. This is strictly redundant on Darwin (which uses "zeroext
+ // i1"), but will be optimised out before ISel.
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ }
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (AArch64::GPR64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (AArch64::FPR64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDLoc DL(Op);
+ const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GN->getGlobal();
+ unsigned char OpFlags =
+ Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+
+ assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
+ "unexpected offset in global node");
+
+ // This also catches the large code model case for Darwin.
+ if ((OpFlags & AArch64II::MO_GOT) != 0) {
+ SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+ // FIXME: Once remat is capable of dealing with instructions with register
+ // operands, expand this into two nodes instead of using a wrapper node.
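+ // The LOADgot wrapper below typically materialises as an ADRP of the GOT
+ // page plus an LDR of the GOT slot, roughly:
+ //   adrp x0, :got:var
+ //   ldr x0, [x0, :got_lo12:var]
+ // (with the corresponding @GOTPAGE/@GOTPAGEOFF operators on MachO).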
+ return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } + + if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { + assert(getTargetMachine().getCodeModel() == CodeModel::Small && + "use of MO_CONSTPOOL only supported on small model"); + SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; + SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); + SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + SDValue GlobalAddr = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + /*isVolatile=*/false, + /*isNonTemporal=*/true, + /*isInvariant=*/true, 8); + if (GN->getOffset() != 0) + return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, + DAG.getConstant(GN->getOffset(), DL, PtrVT)); + return GlobalAddr; + } + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and + // the only correct model on Darwin. + SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + OpFlags | AArch64II::MO_PAGE); + unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; + SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address (for Darwin, currently) and +/// return an SDValue containing the final node. + +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. +/// +/// The general system is that each __thread variable has a [3 x i64] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first xword, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "x0". +/// +/// Since this descriptor may be in a different unit, in general even the +/// descriptor must be accessed via an indirect load. The "ideal" code sequence +/// is: +/// adrp x0, _var@TLVPPAGE +/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor +/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, +/// ; the function pointer +/// blr x1 ; Uses descriptor address in x0 +/// ; Address of _var is now in x0. +/// +/// If the address of _var's descriptor *is* known to the linker, then it can +/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for +/// a slight efficiency gain. 
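+///
+/// As a rough source-level illustration, a reference such as
+///   extern __thread int tls_var;
+///   int get(void) { return tls_var; }
+/// takes this path on Darwin: x0 is loaded with the address of tls_var's
+/// descriptor and the function pointer in its first entry is called.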
+SDValue +AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); + + SDLoc DL(Op); + MVT PtrVT = getPointerTy(DAG.getDataLayout()); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + + SDValue TLVPAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); + + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. + SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, + true, true, 8); + Chain = FuncTLVGet.getValue(1); + + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + const uint32_t *Mask = + Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: x0 takes the address of the descriptor, and + // returns the address of the variable in this thread. + Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); + Chain = + DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); +} + +/// When accessing thread-local variables under either the general-dynamic or +/// local-dynamic system, we make a "TLS-descriptor" call. The variable will +/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry +/// is a function pointer to carry out the resolution. +/// +/// The sequence is: +/// adrp x0, :tlsdesc:var +/// ldr x1, [x0, #:tlsdesc_lo12:var] +/// add x0, x0, #:tlsdesc_lo12:var +/// .tlsdesccall var +/// blr x1 +/// (TPIDR_EL0 offset now in x0) +/// +/// The above sequence must be produced unscheduled, to enable the linker to +/// optimize/relax this sequence. +/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the +/// above sequence, and expanded really late in the compilation flow, to ensure +/// the sequence is produced as per above. +SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + SDValue Chain = DAG.getEntryNode(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + SmallVector<SDValue, 2> Ops; + Ops.push_back(Chain); + Ops.push_back(SymAddr); + + Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); + SDValue Glue = Chain.getValue(1); + + return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); +} + +SDValue +AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetELF() && "This function expects an ELF target"); + assert(getTargetMachine().getCodeModel() == CodeModel::Small && + "ELF TLS only supported in small memory model"); + // Different choices can be made for the maximum size of the TLS area for a + // module. 
For the small address model, the default TLS size is 16MiB and the + // maximum TLS size is 4GiB. + // FIXME: add -mtls-size command line option and make it control the 16MiB + // vs. 4GiB code sequence generation. + const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + + TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + + if (!EnableAArch64ELFLocalDynamicTLSGeneration) { + if (Model == TLSModel::LocalDynamic) + Model = TLSModel::GeneralDynamic; + } + + SDValue TPOff; + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc DL(Op); + const GlobalValue *GV = GA->getGlobal(); + + SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); + + if (Model == TLSModel::LocalExec) { + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + SDValue TPWithOff_lo = + SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, + HiVar, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + SDValue TPWithOff = + SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, + LoVar, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + return TPWithOff; + } else if (Model == TLSModel::InitialExec) { + TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); + } else if (Model == TLSModel::LocalDynamic) { + // Local-dynamic accesses proceed in two phases. A general-dynamic TLS + // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate + // the beginning of the module's TLS region, followed by a DTPREL offset + // calculation. + + // These accesses will need deduplicating if there's more than one. + AArch64FunctionInfo *MFI = + DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); + MFI->incNumLocalDynamicTLSAccesses(); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS); + + // Now we can calculate the offset from TPIDR_EL0 to this module's + // thread-local area. + TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); + + // Now use :dtprel_whatever: operations to calculate this variable's offset + // in its thread-storage area. + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + } else if (Model == TLSModel::GeneralDynamic) { + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + + // Finally we can make a call to calculate the offset from tpidr_el0. 
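
[Editor's note] The four models handled above (the general-dynamic call follows immediately below) all produce "thread pointer plus offset"; they differ only in where the offset comes from. A rough host-side sketch of the arithmetic, with hypothetical helper names standing in for TPIDR_EL0, the GOT load and the TLSDESC call sequence (illustrative only, not part of this patch):

    #include <cstdint>

    // Stand-ins for the hardware/runtime pieces used by the lowering above.
    uint64_t readThreadPointer();              // MRS TPIDR_EL0
    uint64_t loadGOTEntry(const char *sym);    // initial-exec: offset kept in GOT
    uint64_t tlsdescResolve(const void *desc); // general/local-dynamic: TLSDESC call

    uint64_t tlsAddress(int model, uint64_t hi12, uint64_t lo12, const void *desc,
                        uint64_t dtprelHi12, uint64_t dtprelLo12) {
      uint64_t tp = readThreadPointer();
      switch (model) {
      case 0: // local-exec: link-time constant offset, added as two 12-bit chunks
        return tp + (hi12 << 12) + lo12;
      case 1: // initial-exec: offset loaded from the GOT
        return tp + loadGOTEntry("var");
      case 2: // general-dynamic: the TLSDESC call returns the full offset
        return tp + tlsdescResolve(desc);
      default: // local-dynamic: offset of _TLS_MODULE_BASE_, then DTPREL offsets
        return tp + tlsdescResolve(desc) + (dtprelHi12 << 12) + dtprelLo12;
      }
    }
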
+ TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); + } else + llvm_unreachable("Unsupported ELF TLS access model"); + + return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); +} + +SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerDarwinGlobalTLSAddress(Op, DAG); + else if (Subtarget->isTargetELF()) + return LowerELFGlobalTLSAddress(Op, DAG); + + llvm_unreachable("Unexpected platform trying to use TLS"); +} +SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + SDLoc dl(Op); + + // Handle f128 first, since lowering it will result in comparing the return + // value of a libcall against zero, which is just what the rest of LowerBR_CC + // is expecting to deal with. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, dl, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch + // instruction. + unsigned Opc = LHS.getOpcode(); + if (LHS.getResNo() == 1 && isOneConstant(RHS) && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && + "Unexpected condition code."); + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) + return SDValue(); + + // The actual operation with overflow check. + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); + + if (CC == ISD::SETNE) + OFCC = getInvertedCondCode(OFCC); + SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); + + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Overflow); + } + + if (LHS.getValueType().isInteger()) { + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + // If the RHS of the comparison is zero, we can potentially fold this + // to a specialized branch. + const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); + if (RHSC && RHSC->getZExtValue() == 0) { + if (CC == ISD::SETEQ) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa<ConstantSDNode>(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), dl, MVT::i64), + Dest); + } + + return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); + } else if (CC == ISD::SETNE) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. 
+ // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa<ConstantSDNode>(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), dl, MVT::i64), + Dest); + } + + return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); + } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { + // Don't combine AND since emitComparison converts the AND to an ANDS + // (a.k.a. TST) and the test in the test bit and branch instruction + // becomes redundant. This would also increase register pressure. + uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; + return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, + DAG.getConstant(Mask, dl, MVT::i64), Dest); + } + } + if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && + LHS.getOpcode() != ISD::AND) { + // Don't combine AND since emitComparison converts the AND to an ANDS + // (a.k.a. TST) and the test in the test bit and branch instruction + // becomes redundant. This would also increase register pressure. + uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; + return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, + DAG.getConstant(Mask, dl, MVT::i64), Dest); + } + + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); + } + + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); + SDValue BR1 = + DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); + if (CC2 != AArch64CC::AL) { + SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, + Cmp); + } + + return BR1; +} + +SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + + SDValue In1 = Op.getOperand(0); + SDValue In2 = Op.getOperand(1); + EVT SrcVT = In2.getValueType(); + + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); + + EVT VecVT; + EVT EltVT; + uint64_t EltMask; + SDValue VecVal1, VecVal2; + if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { + EltVT = MVT::i32; + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); + EltMask = 0x80000000ULL; + + if (!VT.isVector()) { + VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + DAG.getUNDEF(VecVT), In1); + VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, + DAG.getUNDEF(VecVT), In2); + } else { + VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); + VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); + } + } else if (VT == MVT::f64 || VT == MVT::v2f64) { + EltVT = MVT::i64; + VecVT = MVT::v2i64; + + // We want to materialize a mask with the high bit set, but the AdvSIMD + // immediate moves cannot materialize that in a single instruction for + // 64-bit elements. 
Instead, materialize zero and then negate it. + EltMask = 0; + + if (!VT.isVector()) { + VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, + DAG.getUNDEF(VecVT), In1); + VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, + DAG.getUNDEF(VecVT), In2); + } else { + VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); + VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); + } + } else { + llvm_unreachable("Invalid type for copysign!"); + } + + SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); + + // If we couldn't materialize the mask above, then the mask vector will be + // the zero vector, and we need to negate it here. + if (VT == MVT::f64 || VT == MVT::v2f64) { + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); + } + + SDValue Sel = + DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); + + if (VT == MVT::f32) + return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); + else if (VT == MVT::f64) + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); + else + return DAG.getNode(ISD::BITCAST, DL, VT, Sel); +} + +SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat)) + return SDValue(); + + if (!Subtarget->hasNEON()) + return SDValue(); + + // While there is no integer popcount instruction, it can + // be more efficiently lowered to the following sequence that uses + // AdvSIMD registers/instructions as long as the copies to/from + // the AdvSIMD registers are cheap. + // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd + // CNT V0.8B, V0.8B // 8xbyte pop-counts + // ADDV B0, V0.8B // sum 8xbyte pop-counts + // UMOV X0, V0.B[0] // copy byte result back to integer reg + SDValue Val = Op.getOperand(0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + if (VT == MVT::i32) + Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); + + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); + + if (VT == MVT::i64) + UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); + return UaddLV; +} + +SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + if (Op.getValueType().isVector()) + return LowerVSETCC(Op, DAG); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + SDLoc dl(Op); + + // We chose ZeroOrOneBooleanContents, so use zero and one. + EVT VT = Op.getValueType(); + SDValue TVal = DAG.getConstant(1, dl, VT); + SDValue FVal = DAG.getConstant(0, dl, VT); + + // Handle f128 first, since one possible outcome is a normal integer + // comparison which gets picked up by the next if statement. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, use it. 
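
[Editor's note] The FCOPYSIGN lowering above builds a mask with only the sign bit set (0x80000000 per 32-bit lane, or a negated zero for 64-bit lanes) and uses the BIT node to splice that bit from the second operand into the first. A minimal scalar illustration of the same bit-select, assuming IEEE-754 single precision (illustrative only, not part of this patch):

    #include <cstdint>
    #include <cstring>

    // copysign(mag, sgn): keep every bit of 'mag' except the sign, which comes
    // from 'sgn' -- exactly what the vector BIT select with an 0x80000000 mask does.
    float copySignViaBitSelect(float mag, float sgn) {
      uint32_t m, s;
      std::memcpy(&m, &mag, sizeof m);
      std::memcpy(&s, &sgn, sizeof s);
      const uint32_t SignMask = 0x80000000u;
      uint32_t r = (m & ~SignMask) | (s & SignMask);
      float out;
      std::memcpy(&out, &r, sizeof out);
      return out;
    }
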
+ if (!RHS.getNode()) { + assert(LHS.getValueType() == Op.getValueType() && + "Unexpected setcc expansion!"); + return LHS; + } + } + + if (LHS.getValueType().isInteger()) { + SDValue CCVal; + SDValue Cmp = + getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); + + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + } + + // Now we know we're dealing with FP values. + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + if (CC2 == AArch64CC::AL) { + changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); + + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); + } else { + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't + // totally clean. Some of them require two CSELs to implement. As is in + // this case, we emit the first CSEL and then emit a second using the output + // of the first as the RHS. We're effectively OR'ing the two CC's together. + + // FIXME: It would be nice if we could match the two CSELs to two CSINCs. + SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); + SDValue CS1 = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); + + SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + } +} + +SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, + SDValue RHS, SDValue TVal, + SDValue FVal, SDLoc dl, + SelectionDAG &DAG) const { + // Handle f128 first, because it will result in a comparison of some RTLIB + // call result against zero. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, dl, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + // Also handle f16, for which we need to do a f32 comparison. + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + } + + // Next, handle integers. + if (LHS.getValueType().isInteger()) { + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + unsigned Opcode = AArch64ISD::CSEL; + + // If both the TVal and the FVal are constants, see if we can swap them in + // order to for a CSINV or CSINC out of them. 
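
[Editor's note] The swapping logic that follows exists because the conditional-select family lets one constant be derived from the other: CSEL picks one of two registers, while CSINC/CSINV/CSNEG pick the first register or the incremented/inverted/negated second. A small reference model of those semantics (plain C++, illustrative only):

    #include <cstdint>

    // Reference semantics of the AArch64 conditional selects targeted below;
    // 'cond' stands for the evaluated condition code.
    int64_t csel (bool cond, int64_t a, int64_t b) { return cond ? a : b; }
    int64_t csinc(bool cond, int64_t a, int64_t b) { return cond ? a : b + 1; }
    int64_t csinv(bool cond, int64_t a, int64_t b) { return cond ? a : ~b; }
    int64_t csneg(bool cond, int64_t a, int64_t b) { return cond ? a : -b; }

    // e.g. (c ? 1 : 0)  becomes csinc(!c, xzr, xzr)  since TVal == FVal + 1
    //      (c ? -1 : 0) becomes csinv(!c, xzr, xzr)  since TVal == ~FVal
    //      (c ? 7 : -7) becomes csneg(c, r7, r7)     since TVal == -FVal,
    //                   where r7 is a register holding 7
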
+ ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); + ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); + + if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (TVal.getOpcode() == ISD::XOR) { + // If TVal is a NOT we want to swap TVal and FVal so that we can match + // with a CSINV rather than a CSEL. + if (isAllOnesConstant(TVal.getOperand(1))) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (TVal.getOpcode() == ISD::SUB) { + // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so + // that we can match with a CSNEG rather than a CSEL. + if (isNullConstant(TVal.getOperand(0))) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (CTVal && CFVal) { + const int64_t TrueVal = CTVal->getSExtValue(); + const int64_t FalseVal = CFVal->getSExtValue(); + bool Swap = false; + + // If both TVal and FVal are constants, see if FVal is the + // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC + // instead of a CSEL in that case. + if (TrueVal == ~FalseVal) { + Opcode = AArch64ISD::CSINV; + } else if (TrueVal == -FalseVal) { + Opcode = AArch64ISD::CSNEG; + } else if (TVal.getValueType() == MVT::i32) { + // If our operands are only 32-bit wide, make sure we use 32-bit + // arithmetic for the check whether we can use CSINC. This ensures that + // the addition in the check will wrap around properly in case there is + // an overflow (which would not be the case if we do the check with + // 64-bit arithmetic). + const uint32_t TrueVal32 = CTVal->getZExtValue(); + const uint32_t FalseVal32 = CFVal->getZExtValue(); + + if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal32 > FalseVal32) { + Swap = true; + } + } + // 64-bit check whether we can use CSINC. + } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal > FalseVal) { + Swap = true; + } + } + + // Swap TVal and FVal if necessary. + if (Swap) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + + if (Opcode != AArch64ISD::CSEL) { + // Drop FVal since we can get its value by simply inverting/negating + // TVal. + FVal = TVal; + } + } + + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + + EVT VT = TVal.getValueType(); + return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); + } + + // Now we know we're dealing with FP values. + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == RHS.getValueType()); + EVT VT = TVal.getValueType(); + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two CSELs to implement. + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); + SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); + + // If we need a second CSEL, emit it, using the output of the first as the + // RHS. We're effectively OR'ing the two CC's together. 
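
[Editor's note] The "OR two conditions together" trick above is needed because some LLVM FP predicates have no single AArch64 condition code after an FCMP; unordered-or-equal, for instance, needs an equality check plus an unordered (NaN) check. A small host-side reference of that equivalence (illustrative only; function names are made up):

    #include <cassert>
    #include <cmath>

    // "Unordered or equal" as a single expression.
    bool fcmpUEQ(double a, double b) {
      return a == b || std::isnan(a) || std::isnan(b);
    }

    // The same predicate split into the two checks that the second CSEL/branch
    // chains together: an equality test, then an unordered test ORed in.
    bool viaTwoConditions(double a, double b) {
      bool eq = (a == b);                              // first condition
      bool unordered = std::isnan(a) || std::isnan(b); // second condition
      return eq || unordered;
    }

    int main() {
      double n = std::nan("");
      assert(fcmpUEQ(1.0, 1.0) == viaTwoConditions(1.0, 1.0));
      assert(fcmpUEQ(1.0, 2.0) == viaTwoConditions(1.0, 2.0));
      assert(fcmpUEQ(n, 2.0)   == viaTwoConditions(n, 2.0));
      return 0;
    }
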
+ if (CC2 != AArch64CC::AL) { + SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + } + + // Otherwise, return the output of the first CSEL. + return CS1; +} + +SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue TVal = Op.getOperand(2); + SDValue FVal = Op.getOperand(3); + SDLoc DL(Op); + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); +} + +SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CCVal = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); + + unsigned Opc = CCVal.getOpcode(); + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select + // instruction. + if (CCVal.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) + return SDValue(); + + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); + SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); + + return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, + CCVal, Overflow); + } + + // Lower it the same way as we would lower a SELECT_CC node. + ISD::CondCode CC; + SDValue LHS, RHS; + if (CCVal.getOpcode() == ISD::SETCC) { + LHS = CCVal.getOperand(0); + RHS = CCVal.getOperand(1); + CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get(); + } else { + LHS = CCVal; + RHS = DAG.getConstant(0, DL, CCVal.getValueType()); + CC = ISD::SETNE; + } + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); +} + +SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + // Jump table entries as PC relative offsets. No additional tweaking + // is necessary here. Just get the address of the jump table. + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc DL(Op); + + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_G0 | MO_NC)); + } + + SDValue Hi = + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); +} + +SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc DL(Op); + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + // Use the GOT for the large code model on iOS. 
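
[Editor's note] The ADRP/ADDlow pairs used above for jump tables (and below for constant pools and block addresses) split an address into a 4 KiB page base, materialized PC-relatively by ADRP, and a 12-bit page offset folded into the ADD or the load. The arithmetic being relied on, as a small host-side check (illustrative only; the example address is arbitrary):

    #include <cassert>
    #include <cstdint>

    // MO_PAGE keeps bits [63:12]; MO_PAGEOFF keeps bits [11:0]. Their sum is the
    // original address, which is why adrp+add (or adrp+ldr) can reach any symbol
    // within roughly +/-4 GiB of the PC.
    int main() {
      uint64_t Addr = 0x00007fff12345678ull; // arbitrary example address
      uint64_t Page = Addr & ~0xfffull;      // what ADRP materializes
      uint64_t Lo12 = Addr & 0xfffull;       // the :lo12: page offset
      assert(Page + Lo12 == Addr);
      return 0;
    }
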
+ if (Subtarget->isTargetMachO()) { + SDValue GotAddr = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_GOT); + return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } + + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G3), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G2 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G1 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on + // ELF, the only valid one on Darwin. + SDValue Hi = + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc DL(Op); + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); + + SDLoc DL(Op); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + +SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, + SelectionDAG &DAG) const { + // The layout of the va_list struct is specified in the AArch64 Procedure Call + // Standard, section B.3. 
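
[Editor's note] The stores built below populate the five AAPCS64 va_list fields at offsets 0, 8, 16, 24 and 28. For reference, a C-level sketch of that layout, matching the offsets and sizes used by the code (illustrative only, not part of this patch):

    #include <cstdint>

    // AAPCS64 va_list as laid out by LowerAAPCS_VASTART below: 32 bytes total,
    // versus Darwin's single stack pointer (8 bytes) handled by LowerDarwin_VASTART.
    struct AAPCS64VaList {
      void *__stack;     // offset 0:  next stacked argument
      void *__gr_top;    // offset 8:  end of the GP register save area
      void *__vr_top;    // offset 16: end of the FP/SIMD register save area
      int32_t __gr_offs; // offset 24: negative offset to the next saved GP reg
      int32_t __vr_offs; // offset 28: negative offset to the next saved FP/SIMD reg
    };
    static_assert(sizeof(AAPCS64VaList) == 32,
                  "matches the 32-byte size copied by LowerVACOPY");
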
+ MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc DL(Op); + + SDValue Chain = Op.getOperand(0); + SDValue VAList = Op.getOperand(1); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + SmallVector<SDValue, 4> MemOps; + + // void *__stack at offset 0 + SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); + MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, + MachinePointerInfo(SV), false, false, 8)); + + // void *__gr_top at offset 8 + int GPRSize = FuncInfo->getVarArgsGPRSize(); + if (GPRSize > 0) { + SDValue GRTop, GRTopAddr; + + GRTopAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); + + GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); + GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, + DAG.getConstant(GPRSize, DL, PtrVT)); + + MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, + MachinePointerInfo(SV, 8), false, false, 8)); + } + + // void *__vr_top at offset 16 + int FPRSize = FuncInfo->getVarArgsFPRSize(); + if (FPRSize > 0) { + SDValue VRTop, VRTopAddr; + VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(16, DL, PtrVT)); + + VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); + VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, + DAG.getConstant(FPRSize, DL, PtrVT)); + + MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, + MachinePointerInfo(SV, 16), false, false, 8)); + } + + // int __gr_offs at offset 24 + SDValue GROffsAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); + MemOps.push_back(DAG.getStore(Chain, DL, + DAG.getConstant(-GPRSize, DL, MVT::i32), + GROffsAddr, MachinePointerInfo(SV, 24), false, + false, 4)); + + // int __vr_offs at offset 28 + SDValue VROffsAddr = + DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); + MemOps.push_back(DAG.getStore(Chain, DL, + DAG.getConstant(-FPRSize, DL, MVT::i32), + VROffsAddr, MachinePointerInfo(SV, 28), false, + false, 4)); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); +} + +SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) + : LowerAAPCS_VASTART(Op, DAG); +} + +SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, + SelectionDAG &DAG) const { + // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single + // pointer. + SDLoc DL(Op); + unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; + const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + + return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), + Op.getOperand(2), + DAG.getConstant(VaListSize, DL, MVT::i32), + 8, false, false, false, MachinePointerInfo(DestSV), + MachinePointerInfo(SrcSV)); +} + +SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && + "automatic va_arg instruction only works on Darwin"); + + const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + unsigned Align = Op.getConstantOperandVal(3); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), + false, false, false, 0); + Chain = VAList.getValue(1); + + if (Align > 8) { + assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); + VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(Align - 1, DL, PtrVT)); + VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, + DAG.getConstant(-(int64_t)Align, DL, PtrVT)); + } + + Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); + uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); + + // Scalar integer and FP values smaller than 64 bits are implicitly extended + // up to 64 bits. At the very least, we have to increase the striding of the + // vaargs list to match this, and for FP values we need to introduce + // FP_ROUND nodes as well. + if (VT.isInteger() && !VT.isVector()) + ArgSize = 8; + bool NeedFPTrunc = false; + if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { + ArgSize = 8; + NeedFPTrunc = true; + } + + // Increment the pointer, VAList, to the next vaarg + SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, + DAG.getConstant(ArgSize, DL, PtrVT)); + // Store the incremented VAList to the legalized pointer + SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), + false, false, 0); + + // Load the actual argument out of the pointer VAList + if (NeedFPTrunc) { + // Load the value as an f64. + SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, + MachinePointerInfo(), false, false, false, 0); + // Round the value down to an f32. + SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), + DAG.getIntPtrConstant(1, DL)); + SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; + // Merge the rounded value with the chain output of the load. + return DAG.getMergeValues(Ops, DL); + } + + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, + false, false, 0); +} + +SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDValue FrameAddr = + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), false, false, false, 0); + return FrameAddr; +} + +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. 
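
[Editor's note] Stepping back to the Darwin va_arg lowering above: when the requested alignment exceeds 8, it bumps the va_list pointer with an add-then-mask pair, (p + align - 1) & -align. A tiny host-side check of that idiom (illustrative only; alignUp is a made-up name):

    #include <cassert>
    #include <cstdint>

    // Round 'p' up to 'align' (a power of two), exactly as the ADD/AND pair
    // emitted above; for powers of two, -align == ~(align - 1).
    uint64_t alignUp(uint64_t p, uint64_t align) {
      return (p + align - 1) & ~(align - 1);
    }

    int main() {
      assert(alignUp(0x1001, 16) == 0x1010);
      assert(alignUp(0x1010, 16) == 0x1010); // already aligned: unchanged
      return 0;
    }
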
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("sp", AArch64::SP) + .Default(0); + if (Reg) + return Reg; + report_fatal_error(Twine("Invalid register name \"" + + StringRef(RegName) + "\".")); +} + +SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + if (Depth) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); + return DAG.getLoad(VT, DL, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); + } + + // Return LR, which contains the return address. Mark it an implicit live-in. + unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); + return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); +} + +/// LowerShiftRightParts - Lower SRA_PARTS, which returns two +/// i64 values and take a 2 x i64 value to shift plus a shift amount. +SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; + + assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); + + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, + DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); + SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + HiBitsForLo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + HiBitsForLo, CCVal, Cmp); + + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i64)); + + SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue LoForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); + + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); + + // AArch64 shifts larger than the register width are wrapped rather than + // clamped, so we can't just emit "hi >> x". + SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue HiForBigShift = + Opc == ISD::SRA + ? 
DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i64)) + : DAG.getConstant(0, dl, VT); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} + + +/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two +/// i64 values and take a 2 x i64 value to shift plus a shift amount. +SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + + assert(Op.getOpcode() == ISD::SHL_PARTS); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, + DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); + SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + LoBitsForHi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + LoBitsForHi, CCVal, Cmp); + + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i64)); + SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue HiForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); + + SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); + + // AArch64 shifts of larger than register sizes are wrapped rather than + // clamped, so we can't just emit "lo << a" if a is too big. + SDValue LoForBigShift = DAG.getConstant(0, dl, VT); + SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} + +bool AArch64TargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode *GA) const { + // The AArch64 target doesn't support folding offsets into global addresses. + return false; +} + +bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. + // FIXME: We should be able to handle f128 as well with a clever lowering. 
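
[Editor's note] The SRA/SRL/SHL_PARTS lowerings above stitch a 128-bit shift out of 64-bit pieces and use CSELs to patch up the two awkward cases: a shift amount of 0 (where "64 - amt" would itself be an out-of-range shift) and amounts of 64 or more (where AArch64 shifts wrap rather than clamp). A plain C++ reference of the intended semantics for the logical right-shift case (illustrative only; this is not the DAG code):

    #include <cassert>
    #include <cstdint>

    // Logical 128-bit right shift of (hi:lo) by amt, 0 <= amt < 128, written the
    // way the PARTS lowering above reasons about it.
    void lshr128(uint64_t lo, uint64_t hi, unsigned amt,
                 uint64_t &outLo, uint64_t &outHi) {
      if (amt == 0) {        // "hi << (64 - 0)" would be undefined; CSEL picks 0
        outLo = lo;
        outHi = hi;
      } else if (amt < 64) { // normal case: low gets the spill-over bits from high
        outLo = (lo >> amt) | (hi << (64 - amt));
        outHi = hi >> amt;
      } else {               // big shift: low comes entirely from high
        outLo = hi >> (amt - 64);
        outHi = 0;           // for SRA_PARTS this would be hi >> 63 instead
      }
    }

    int main() {
      uint64_t lo, hi;
      lshr128(0x1, 0x1, 1, lo, hi);
      assert(lo == 0x8000000000000000ull && hi == 0);
      lshr128(0xdead, 0xbeef, 64, lo, hi);
      assert(lo == 0xbeef && hi == 0);
      return 0;
    }
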
+ if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) + return true; + + if (VT == MVT::f64) + return AArch64_AM::getFP64Imm(Imm) != -1; + else if (VT == MVT::f32) + return AArch64_AM::getFP32Imm(Imm) != -1; + return false; +} + +//===----------------------------------------------------------------------===// +// AArch64 Optimization Hooks +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AArch64 Inline Assembly Support +//===----------------------------------------------------------------------===// + +// Table of Constraints +// TODO: This is the current set of constraints supported by ARM for the +// compiler, not all of them may make sense, e.g. S may be difficult to support. +// +// r - A general register +// w - An FP/SIMD register of some size in the range v0-v31 +// x - An FP/SIMD register of some size in the range v0-v15 +// I - Constant that can be used with an ADD instruction +// J - Constant that can be used with a SUB instruction +// K - Constant that can be used with a 32-bit logical instruction +// L - Constant that can be used with a 64-bit logical instruction +// M - Constant that can be used as a 32-bit MOV immediate +// N - Constant that can be used as a 64-bit MOV immediate +// Q - A memory reference with base register and no offset +// S - A symbolic address +// Y - Floating point constant zero +// Z - Integer constant zero +// +// Note that general register operands will be output using their 64-bit x +// register name, whatever the size of the variable, unless the asm operand +// is prefixed by the %w modifier. Floating-point and SIMD register operands +// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or +// %q modifier. + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +AArch64TargetLowering::ConstraintType +AArch64TargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'z': + return C_Other; + case 'x': + case 'w': + return C_RegisterClass; + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as 'r'. + case 'Q': + return C_Memory; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +AArch64TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. 
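
[Editor's note] The constraint letters tabulated above are what user inline assembly can request: 'r' names a general register (printed as x<n> unless the %w modifier is used), 'w' an FP/SIMD register, and 'I' an ADD-style immediate in the range validated further below. A short usage sketch, assuming GCC/Clang extended asm targeting AArch64 (illustrative only; the operand modifiers follow the table above):

    #include <cstdint>

    // 'r' and 'I' constraints: add an immediate in the 0..4095 range.
    uint64_t addImm(uint64_t x) {
      uint64_t r;
      asm("add %0, %1, %2" : "=r"(r) : "r"(x), "I"(42));
      return r;
    }

    // 'w' constraint with the %s modifier to print single-precision registers.
    float addFloats(float a, float b) {
      float r;
      asm("fadd %s0, %s1, %s2" : "=w"(r) : "w"(a), "w"(b));
      return r;
    }
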
+ switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'x': + case 'w': + if (type->isFloatingPointTy() || type->isVectorTy()) + weight = CW_Register; + break; + case 'z': + weight = CW_Constant; + break; + } + return weight; +} + +std::pair<unsigned, const TargetRegisterClass *> +AArch64TargetLowering::getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::GPR64commonRegClass); + return std::make_pair(0U, &AArch64::GPR32commonRegClass); + case 'w': + if (VT == MVT::f32) + return std::make_pair(0U, &AArch64::FPR32RegClass); + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::FPR64RegClass); + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128RegClass); + break; + // The instructions that this constraint is designed for can + // only take 128-bit registers so just use that regclass. + case 'x': + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128_loRegClass); + break; + } + } + if (StringRef("{cc}").equals_lower(Constraint)) + return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); + + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair<unsigned, const TargetRegisterClass *> Res; + Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + + // Not found as a standard register? + if (!Res.second) { + unsigned Size = Constraint.size(); + if ((Size == 4 || Size == 5) && Constraint[0] == '{' && + tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { + int RegNo; + bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); + if (!Failed && RegNo >= 0 && RegNo <= 31) { + // v0 - v31 are aliases of q0 - q31. + // By default we'll emit v0-v31 for this unless there's a modifier where + // we'll emit the correct register as well. + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } + } + } + + return Res; +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void AArch64TargetLowering::LowerAsmOperandForConstraint( + SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, + SelectionDAG &DAG) const { + SDValue Result; + + // Currently only support length 1 constraints. + if (Constraint.length() != 1) + return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: + break; + + // This set of constraints deal with valid constants for various instructions. + // Validate and return a target constant for them if we can. + case 'z': { + // 'z' maps to xzr or wzr so it needs an input of 0. + if (!isNullConstant(Op)) + return; + + if (Op.getValueType() == MVT::i64) + Result = DAG.getRegister(AArch64::XZR, MVT::i64); + else + Result = DAG.getRegister(AArch64::WZR, MVT::i32); + break; + } + + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); + if (!C) + return; + + // Grab the value and do some validation. + uint64_t CVal = C->getZExtValue(); + switch (ConstraintLetter) { + // The I constraint applies only to simple ADD or SUB immediate operands: + // i.e. 
0 to 4095 with optional shift by 12 + // The J constraint applies only to ADD or SUB immediates that would be + // valid when negated, i.e. if [an add pattern] were to be output as a SUB + // instruction [or vice versa], in other words -1 to -4095 with optional + // left shift by 12. + case 'I': + if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) + break; + return; + case 'J': { + uint64_t NVal = -C->getSExtValue(); + if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { + CVal = C->getSExtValue(); + break; + } + return; + } + // The K and L constraints apply *only* to logical immediates, including + // what used to be the MOVI alias for ORR (though the MOVI alias has now + // been removed and MOV should be used). So these constraints have to + // distinguish between bit patterns that are valid 32-bit or 64-bit + // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but + // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice + // versa. + case 'K': + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + return; + case 'L': + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + return; + // The M and N constraints are a superset of K and L respectively, for use + // with the MOV (immediate) alias. As well as the logical immediates they + // also match 32 or 64-bit immediates that can be loaded either using a + // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca + // (M) or 64-bit 0x1234000000000000 (N) etc. + // As a note some of this code is liberally stolen from the asm parser. + case 'M': { + if (!isUInt<32>(CVal)) + return; + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + if ((CVal & 0xFFFF) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + uint64_t NCVal = ~(uint32_t)CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + return; + } + case 'N': { + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + if ((CVal & 0xFFFFULL) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + if ((CVal & 0xFFFF00000000ULL) == CVal) + break; + if ((CVal & 0xFFFF000000000000ULL) == CVal) + break; + uint64_t NCVal = ~CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF00000000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF000000000000ULL) == NCVal) + break; + return; + } + default: + return; + } + + // All assembler immediates are 64-bit integers. + Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); + break; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +//===----------------------------------------------------------------------===// +// AArch64 Advanced SIMD Support +//===----------------------------------------------------------------------===// + +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. 
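
[Editor's note] The 'M' validation above accepts 32-bit values that are either logical (bitmask) immediates or reachable by a single MOVZ/MOVN, i.e. one 16-bit halfword (possibly in the upper position) of the value or of its complement. Leaving out the bitmask-immediate part, which needs the full AArch64_AM::isLogicalImmediate encoder, the halfword tests reduce to the following host-side sketch (illustrative only; the function name is made up):

    #include <cstdint>

    // Mirrors the halfword checks in the 'M' constraint case above: true when a
    // single 32-bit MOVZ or MOVN can produce 'v'. The logical-immediate escape
    // hatch handled by isLogicalImmediate is deliberately omitted here.
    bool singleMovzOrMovn32(uint32_t v) {
      auto oneHalfword = [](uint32_t x) {
        return (x & 0xFFFFu) == x || (x & 0xFFFF0000u) == x;
      };
      return oneHalfword(v) || oneHalfword(~v);
    }

    // e.g. singleMovzOrMovn32(0x12340000) and singleMovzOrMovn32(0xffffedca) are
    // true (the latter via MOVN of 0x1235), while 0x12345678 needs MOVZ+MOVK.
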
+static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), + V64Reg, DAG.getConstant(0, DL, MVT::i32)); +} + +/// getExtFactor - Determine the adjustment factor for the position when +/// generating an "extract from vector registers" instruction. +static unsigned getExtFactor(SDValue &V) { + EVT EltType = V.getValueType().getVectorElementType(); + return EltType.getSizeInBits() / 8; +} + +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. +static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + SDLoc DL(V128Reg); + + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); +} + +// Gather data to see if the operation can be modelled as a +// shuffle in combination with VEXTs. +SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + struct ShuffleSourceInfo { + SDValue Vec; + unsigned MinElt; + unsigned MaxElt; + + // We may insert some combination of BITCASTs and VEXT nodes to force Vec to + // be compatible with the shuffle we intend to construct. As a result + // ShuffleVec will be some sliding window into the original Vec. + SDValue ShuffleVec; + + // Code should guarantee that element i in Vec starts at element "WindowBase + // + i * WindowScale in ShuffleVec". + int WindowBase; + int WindowScale; + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } + ShuffleSourceInfo(SDValue Vec) + : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), + WindowScale(1) {} + }; + + // First gather all vectors used as an immediate source for this BUILD_VECTOR + // node. + SmallVector<ShuffleSourceInfo, 2> Sources; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + // A shuffle can only come from building a vector from various + // elements of other vectors. + return SDValue(); + } + + // Add this element source to the list if it's not already there. + SDValue SourceVec = V.getOperand(0); + auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + if (Source == Sources.end()) + Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); + + // Update the minimum and maximum lane number seen. + unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); + Source->MinElt = std::min(Source->MinElt, EltNo); + Source->MaxElt = std::max(Source->MaxElt, EltNo); + } + + // Currently only do something sane when at most two source vectors + // are involved. + if (Sources.size() > 2) + return SDValue(); + + // Find out the smallest element size among result and two sources, and use + // it as element size to build the shuffle_vector. 
+ EVT SmallestEltTy = VT.getVectorElementType(); + for (auto &Source : Sources) { + EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); + if (SrcEltTy.bitsLT(SmallestEltTy)) { + SmallestEltTy = SrcEltTy; + } + } + unsigned ResMultiplier = + VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); + EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); + + // If the source vector is too wide or too narrow, we may nevertheless be able + // to construct a compatible shuffle either by concatenating it with UNDEF or + // extracting a suitable range of elements. + for (auto &Src : Sources) { + EVT SrcVT = Src.ShuffleVec.getValueType(); + + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) + continue; + + // This stage of the search produces a source with the same element type as + // the original, but with a total width matching the BUILD_VECTOR output. + EVT EltVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); + + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + Src.ShuffleVec = + DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, + DAG.getUNDEF(Src.ShuffleVec.getValueType())); + continue; + } + + assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); + + if (Src.MaxElt - Src.MinElt >= NumSrcElts) { + // Span too large for a VEXT to cope + return SDValue(); + } + + if (Src.MinElt >= NumSrcElts) { + // The extraction can just take the second half + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i64)); + Src.WindowBase = -NumSrcElts; + } else if (Src.MaxElt < NumSrcElts) { + // The extraction can just take the first half + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i64)); + } else { + // An actual VEXT is needed + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i64)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i64)); + unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); + + Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, + VEXTSrc2, + DAG.getConstant(Imm, dl, MVT::i32)); + Src.WindowBase = -Src.MinElt; + } + } + + // Another possible incompatibility occurs from the vector element types. We + // can fix this by bitcasting the source vectors to the same type we intend + // for the shuffle. + for (auto &Src : Sources) { + EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); + if (SrcEltTy == SmallestEltTy) + continue; + assert(ShuffleVT.getVectorElementType() == SmallestEltTy); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); + Src.WindowBase *= Src.WindowScale; + } + + // Final sanity check before we try to actually produce a shuffle. + DEBUG( + for (auto Src : Sources) + assert(Src.ShuffleVec.getValueType() == ShuffleVT); + ); + + // The stars all align, our next step is to produce the mask for the shuffle. 
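
[Editor's note] The EXT node built above works at byte granularity: its immediate is the byte index of the first lane kept from the concatenated pair of sources, which is why the code multiplies the starting lane by getExtFactor (the element size in bytes). A trivial host-side restatement of that formula (illustrative only; the function name is made up):

    // Byte immediate for EXT when the window starts at lane MinElt of a vector
    // whose elements are EltSizeInBits wide, as computed above.
    unsigned extImmediate(unsigned MinElt, unsigned EltSizeInBits) {
      return MinElt * (EltSizeInBits / 8);
    }
    // e.g. a window starting at lane 3 of a v4i32 source -> ext #12 (bytes).
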
+ SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); + int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { + SDValue Entry = Op.getOperand(i); + if (Entry.getOpcode() == ISD::UNDEF) + continue; + + auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); + + // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit + // trunc. So only std::min(SrcBits, DestBits) actually get defined in this + // segment. + EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), + VT.getVectorElementType().getSizeInBits()); + int LanesDefined = BitsDefined / BitsPerShuffleLane; + + // This source is expected to fill ResMultiplier lanes of the final shuffle, + // starting at the appropriate offset. + int *LaneMask = &Mask[i * ResMultiplier]; + + int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; + ExtractBase += NumElts * (Src - Sources.begin()); + for (int j = 0; j < LanesDefined; ++j) + LaneMask[j] = ExtractBase + j; + } + + // Final check before we try to produce nonsense... + if (!isShuffleMaskLegal(Mask, ShuffleVT)) + return SDValue(); + + SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; + for (unsigned i = 0; i < Sources.size(); ++i) + ShuffleOps[i] = Sources[i].ShuffleVec; + + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], &Mask[0]); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); +} + +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are the same. +static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, just follow it + // back to index zero and keep going. + ++ExpectedElt; + if (ExpectedElt == NumElts) + ExpectedElt = 0; + + if (M[i] < 0) + continue; // ignore UNDEF indices + if (ExpectedElt != static_cast<unsigned>(M[i])) + return false; + } + + return true; +} + +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are different. +static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, + unsigned &Imm) { + // Look for the first non-undef element. + const int *FirstRealElt = std::find_if(M.begin(), M.end(), + [](int Elt) {return Elt >= 0;}); + + // Benefit form APInt to handle overflow when calculating expected element. + unsigned NumElts = VT.getVectorNumElements(); + unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + // The following shuffle indices must be the successive elements after the + // first real element. + const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), + [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); + if (FirstWrongElt != M.end()) + return false; + + // The index of an EXT is the first element if it is not UNDEF. 
+  // Watch out for the beginning UNDEFs. The EXT index should be the expected
+  // value of the first element.  E.g.
+  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+  // ExpectedElt is the last mask index plus 1.
+  Imm = ExpectedElt.getZExtValue();
+
+  // There are two different cases that require reversing the input vectors.
+  // For example, for the vector <4 x i32> we have the following cases:
+  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+  // For both cases, we end up using the mask <5, 6, 7, 0>, which requires
+  // the two input vectors to be reversed.
+  if (Imm < NumElts)
+    ReverseEXT = true;
+  else
+    Imm -= NumElts;
+
+  return true;
+}
+
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+         "Only possible block sizes for REV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
+      return false;
+    Idx += 1;
+  }
+
+  return true;
+}
+
+static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != 2 * i + WhichResult)
+      return false;
+  }
+
+  return true;
+}
+
+static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ?
0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) + return false; + Idx += 1; + } + + return true; +} + +/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, +static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { + unsigned Half = VT.getVectorNumElements() / 2; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned j = 0; j != 2; ++j) { + unsigned Idx = WhichResult; + for (unsigned i = 0; i != Half; ++i) { + int MIdx = M[i + j * Half]; + if (MIdx >= 0 && (unsigned)MIdx != Idx) + return false; + Idx += 2; + } + } + + return true; +} + +/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. +static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) + return false; + } + return true; +} + +static bool isINSMask(ArrayRef<int> M, int NumInputElements, + bool &DstIsLeft, int &Anomaly) { + if (M.size() != static_cast<size_t>(NumInputElements)) + return false; + + int NumLHSMatch = 0, NumRHSMatch = 0; + int LastLHSMismatch = -1, LastRHSMismatch = -1; + + for (int i = 0; i < NumInputElements; ++i) { + if (M[i] == -1) { + ++NumLHSMatch; + ++NumRHSMatch; + continue; + } + + if (M[i] == i) + ++NumLHSMatch; + else + LastLHSMismatch = i; + + if (M[i] == i + NumInputElements) + ++NumRHSMatch; + else + LastRHSMismatch = i; + } + + if (NumLHSMatch == NumInputElements - 1) { + DstIsLeft = true; + Anomaly = LastLHSMismatch; + return true; + } else if (NumRHSMatch == NumInputElements - 1) { + DstIsLeft = false; + Anomaly = LastRHSMismatch; + return true; + } + + return false; +} + +static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { + if (VT.getSizeInBits() != 128) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + + for (int I = 0, E = NumElts / 2; I != E; I++) { + if (Mask[I] != I) + return false; + } + + int Offset = NumElts / 2; + for (int I = NumElts / 2, E = NumElts; I != E; I++) { + if (Mask[I] != I + SplitLHS * Offset) + return false; + } + + return true; +} + +static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue V0 = Op.getOperand(0); + SDValue V1 = Op.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); + + if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || + VT.getVectorElementType() != V1.getValueType().getVectorElementType()) + return SDValue(); + + bool SplitV0 = V0.getValueType().getSizeInBits() == 128; + + if (!isConcatMask(Mask, VT, SplitV0)) + return SDValue(); + + EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + if (SplitV0) { + V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, + DAG.getConstant(0, DL, MVT::i64)); + } + if (V1.getValueType().getSizeInBits() == 128) { + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, + DAG.getConstant(0, DL, MVT::i64)); 
+ } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. +static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + SDLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); + + enum { + OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1 * 9 + 2) * 9 + 3) + return LHS; + assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); + + switch (OpNum) { + default: + llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + // VREV divides the vector in half and swaps within the half. + if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) + return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); + // vrev <4 x i16> -> REV32 + if (VT.getVectorElementType() == MVT::i16 || + VT.getVectorElementType() == MVT::f16) + return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); + // vrev <4 x i8> -> REV16 + assert(VT.getVectorElementType() == MVT::i8); + return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: { + EVT EltTy = VT.getVectorElementType(); + unsigned Opcode; + if (EltTy == MVT::i8) + Opcode = AArch64ISD::DUPLANE8; + else if (EltTy == MVT::i16 || EltTy == MVT::f16) + Opcode = AArch64ISD::DUPLANE16; + else if (EltTy == MVT::i32 || EltTy == MVT::f32) + Opcode = AArch64ISD::DUPLANE32; + else if (EltTy == MVT::i64 || EltTy == MVT::f64) + Opcode = AArch64ISD::DUPLANE64; + else + llvm_unreachable("Invalid vector element type?"); + + if (VT.getSizeInBits() == 64) + OpLHS = WidenVector(OpLHS, DAG); + SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); + return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); + } + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: { + unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); + return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, + DAG.getConstant(Imm, dl, MVT::i32)); + } + case OP_VUZPL: + return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VUZPR: + return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPL: + return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPR: + return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNL: + return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNR: + return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + } +} + +static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the TBL instruction. 
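// [Editor's note: illustrative sketch only, not part of the imported LLVM
//  source.]  For a hypothetical two-input v8i16 shuffle that none of the
//  patterns above matched, BytesPerElt is 2, so a mask element of 9
//  (element 1 of the second input) expands to the TBL byte indices 18 and 19
//  below; with 128-bit inputs those indices select from the pair of table
//  registers handed to the tbl2 intrinsic.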
+ SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc DL(Op); + + EVT EltVT = Op.getValueType().getVectorElementType(); + unsigned BytesPerElt = EltVT.getSizeInBits() / 8; + + SmallVector<SDValue, 8> TBLMask; + for (int Val : ShuffleMask) { + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); + } + } + + MVT IndexVT = MVT::v8i8; + unsigned IndexLen = 8; + if (Op.getValueType().getSizeInBits() == 128) { + IndexVT = MVT::v16i8; + IndexLen = 16; + } + + SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); + SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); + + SDValue Shuffle; + if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (IndexLen == 8) + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + if (IndexLen == 8) { + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + // FIXME: We cannot, for the moment, emit a TBL2 instruction because we + // cannot currently represent the register constraints on the input + // table registers. + // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, + // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + // &TBLMask[0], IndexLen)); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), + V1Cst, V2Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } + } + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); +} + +static unsigned getDUPLANEOp(EVT EltType) { + if (EltType == MVT::i8) + return AArch64ISD::DUPLANE8; + if (EltType == MVT::i16 || EltType == MVT::f16) + return AArch64ISD::DUPLANE16; + if (EltType == MVT::i32 || EltType == MVT::f32) + return AArch64ISD::DUPLANE32; + if (EltType == MVT::i64 || EltType == MVT::f64) + return AArch64ISD::DUPLANE64; + + llvm_unreachable("Invalid vector element type?"); +} + +SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + ArrayRef<int> ShuffleMask = SVN->getMask(); + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], + V1.getValueType().getSimpleVT())) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. 
+    if (Lane == -1)
+      Lane = 0;
+
+    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
+                         V1.getOperand(0));
+    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
+    // constant. If so, we can just reference the lane's definition directly.
+    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+        !isa<ConstantSDNode>(V1.getOperand(Lane)))
+      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
+
+    // Otherwise, duplicate from the lane of the input vector.
+    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
+
+    // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
+    // to make a vector of the same size as this SHUFFLE. We can ignore the
+    // extract entirely, and canonicalise the concat using WidenVector.
+    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+      V1 = V1.getOperand(0);
+    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+      Lane -= Idx * VT.getVectorNumElements() / 2;
+      V1 = WidenVector(V1.getOperand(Idx), DAG);
+    } else if (VT.getSizeInBits() == 64)
+      V1 = WidenVector(V1, DAG);
+
+    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
+  }
+
+  if (isREVMask(ShuffleMask, VT, 64))
+    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
+  if (isREVMask(ShuffleMask, VT, 32))
+    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
+  if (isREVMask(ShuffleMask, VT, 16))
+    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
+
+  bool ReverseEXT = false;
+  unsigned Imm;
+  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
+    if (ReverseEXT)
+      std::swap(V1, V2);
+    Imm *= getExtFactor(V1);
+    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
+                       DAG.getConstant(Imm, dl, MVT::i32));
+  } else if (V2->getOpcode() == ISD::UNDEF &&
+             isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+    Imm *= getExtFactor(V1);
+    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
+                       DAG.getConstant(Imm, dl, MVT::i32));
+  }
+
+  unsigned WhichResult;
+  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+
+  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ?
AArch64ISD::TRN1 : AArch64ISD::TRN2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + + SDValue Concat = tryFormConcatFromShuffle(Op, DAG); + if (Concat.getNode()) + return Concat; + + bool DstIsLeft; + int Anomaly; + int NumInputElements = V1.getValueType().getVectorNumElements(); + if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { + SDValue DstVec = DstIsLeft ? V1 : V2; + SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); + + SDValue SrcVec = V1; + int SrcLane = ShuffleMask[Anomaly]; + if (SrcLane >= NumInputElements) { + SrcVec = V2; + SrcLane -= VT.getVectorNumElements(); + } + SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); + + EVT ScalarVT = VT.getVectorElementType(); + + if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) + ScalarVT = MVT::i32; + + return DAG.getNode( + ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), + DstLaneV); + } + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + return GenerateTBL(Op, ShuffleMask, DAG); +} + +static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, + APInt &UndefBits) { + EVT VT = BVN->getValueType(0); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; + + for (unsigned i = 0; i < NumSplats; ++i) { + CnstBits <<= SplatBitSize; + UndefBits <<= SplatBitSize; + CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); + UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); + } + + return true; + } + + return false; +} + +SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, + SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = + dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); + SDValue LHS = Op.getOperand(0); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + if (!BVN) + return Op; + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We only have BIC vector immediate instruction, which is and-not. + CnstBits = ~CnstBits; + + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(16, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(24, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = ~UndefBits; + goto AttemptModImm; + } + +// We can always fall back to a non-immediate AND. 
+FailedModImm: + return Op; +} + +// Specialized code to quickly find if PotentialBVec is a BuildVector that +// consists of only the same constant int value, returned in reference arg +// ConstVal +static bool isAllConstantBuildVector(const SDValue &PotentialBVec, + uint64_t &ConstVal) { + BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); + if (!Bvec) + return false; + ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); + if (!FirstElt) + return false; + EVT VT = Bvec->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 1; i < NumElts; ++i) + if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) + return false; + ConstVal = FirstElt->getZExtValue(); + return true; +} + +static unsigned getIntrinsicID(const SDNode *N) { + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + default: + return Intrinsic::not_intrinsic; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + if (IID < Intrinsic::num_intrinsics) + return IID; + return Intrinsic::not_intrinsic; + } + } +} + +// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), +// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a +// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. +// Also, logical shift right -> sri, with the same structure. +static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + if (!VT.isVector()) + return SDValue(); + + SDLoc DL(N); + + // Is the first op an AND? + const SDValue And = N->getOperand(0); + if (And.getOpcode() != ISD::AND) + return SDValue(); + + // Is the second op an shl or lshr? + SDValue Shift = N->getOperand(1); + // This will have been turned into: AArch64ISD::VSHL vector, #shift + // or AArch64ISD::VLSHR vector, #shift + unsigned ShiftOpc = Shift.getOpcode(); + if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) + return SDValue(); + bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; + + // Is the shift amount constant? + ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); + if (!C2node) + return SDValue(); + + // Is the and mask vector all constant? + uint64_t C1; + if (!isAllConstantBuildVector(And.getOperand(1), C1)) + return SDValue(); + + // Is C1 == ~C2, taking into account how much one can shift elements of a + // particular size? + uint64_t C2 = C2node->getZExtValue(); + unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); + if (C2 > ElemSizeInBits) + return SDValue(); + unsigned ElemMask = (1 << ElemSizeInBits) - 1; + if ((C1 & ElemMask) != (~C2 & ElemMask)) + return SDValue(); + + SDValue X = And.getOperand(0); + SDValue Y = Shift.getOperand(0); + + unsigned Intrin = + IsShiftRight ? 
Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; + SDValue ResultSLI = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrin, DL, MVT::i32), X, Y, + Shift.getOperand(1)); + + DEBUG(dbgs() << "aarch64-lower: transformed: \n"); + DEBUG(N->dump(&DAG)); + DEBUG(dbgs() << "into: \n"); + DEBUG(ResultSLI->dump(&DAG)); + + ++NumShiftInserts; + return ResultSLI; +} + +SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, + SelectionDAG &DAG) const { + // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) + if (EnableAArch64SlrGeneration) { + SDValue Res = tryLowerToSLI(Op.getNode(), DAG); + if (Res.getNode()) + return Res; + } + + BuildVectorSDNode *BVN = + dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); + SDValue LHS = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + // OR commutes, so try swapping the operands. + if (!BVN) { + LHS = Op.getOperand(0); + BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); + } + if (!BVN) + return Op; + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(16, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(24, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } + +// We can always fall back to a non-immediate OR. +FailedModImm: + return Op; +} + +// Normalize the operands of BUILD_VECTOR. The value of constant operands will +// be truncated to fit element width. +static SDValue NormalizeBuildVector(SDValue Op, + SelectionDAG &DAG) { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT EltTy= VT.getVectorElementType(); + + if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) + return Op; + + SmallVector<SDValue, 16> Ops; + for (SDValue Lane : Op->ops()) { + if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { + APInt LowBits(EltTy.getSizeInBits(), + CstLane->getZExtValue()); + Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); + } + Ops.push_back(Lane); + } + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); +} + +SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + Op = NormalizeBuildVector(Op, DAG); + BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + // Certain magic vector constants (used to express things like NOT + // and NEG) are passed through unmodified. This allows codegen patterns + // for these operations to match. Special-purpose patterns will lower + // these immediates to MOVIs if it proves necessary. + if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) + return Op; + + // The many faces of MOVI... + if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); + if (VT.getSizeInBits() == 128) { + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, + DAG.getConstant(CnstVal, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + // Support the V64 version via subregister insertion. + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, + DAG.getConstant(CnstVal, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(16, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(24, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(264, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(272, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; + SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + // The few faces of FMOV... + if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4f32 : MVT::v2f32; + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && + VT.getSizeInBits() == 128) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, + DAG.getConstant(CnstVal, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + // The many faces of MVNI... + CnstVal = ~CnstVal; + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(16, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(24, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(8, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(264, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, dl, MVT::i32), + DAG.getConstant(272, dl, MVT::i32)); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } +FailedModImm: + + // Scan through the operands to find some interesting properties we can + // exploit: + // 1) If only one value is used, we can use a DUP, or + // 2) if only the low element is not undef, we can just insert that, or + // 3) if only one constant value is used (w/ some non-constant lanes), + // we can splat the constant value into the whole vector then fill + // in the non-constant lanes. + // 4) FIXME: If different constant values are used, but we can intelligently + // select the values we'll be overwriting for the non-constant + // lanes such that we can directly materialize the vector + // some other way (MOVI, e.g.), we can be sneaky. + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool usesOnlyOneConstantValue = true; + bool isConstant = true; + unsigned NumConstantLanes = 0; + SDValue Value; + SDValue ConstantValue; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) + isConstant = false; + + if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) { + ++NumConstantLanes; + if (!ConstantValue.getNode()) + ConstantValue = V; + else if (ConstantValue != V) + usesOnlyOneConstantValue = false; + } + + if (!Value.getNode()) + Value = V; + else if (V != Value) + usesOnlyOneValue = false; + } + + if (!Value.getNode()) + return DAG.getUNDEF(VT); + + if (isOnlyLowElement) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + + // Use DUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue) { + if (!isConstant) { + if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Value.getValueType() != VT) + return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); + + // This is actually a DUPLANExx operation, which keeps everything vectory. + + // DUPLANE works on 128-bit vectors, widen it if necessary. + SDValue Lane = Value.getOperand(1); + Value = Value.getOperand(0); + if (Value.getValueType().getSizeInBits() == 64) + Value = WidenVector(Value, DAG); + + unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); + return DAG.getNode(Opcode, dl, VT, Value, Lane); + } + + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector<SDValue, 8> Ops; + EVT EltTy = VT.getVectorElementType(); + assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && + "Unsupported floating-point vector type"); + MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + Val = LowerBUILD_VECTOR(Val, DAG); + if (Val.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + } + + // If there was only one constant value used and for more than one lane, + // start by splatting that value, then replace the non-constant lanes. 
This + // is better than the default, which will perform a separate initialization + // for each lane. + if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { + SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); + // Now insert the non-constant lanes. + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); + if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) { + // Note that type legalization likely mucked about with the VT of the + // source operand, so we may have to convert it here before inserting. + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); + } + } + return Val; + } + + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. + if (isConstant) + return SDValue(); + + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) + return shuffle; + } + + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we + // know the default expansion would otherwise fall back on something even + // worse. For a vector with one or two non-undef values, that's + // scalar_to_vector for the elements followed by a shuffle (provided the + // shuffle is valid for the target) and materialization element by element + // on the stack followed by a load for everything else. + if (!isConstant && !usesOnlyOneValue) { + SDValue Vec = DAG.getUNDEF(VT); + SDValue Op0 = Op.getOperand(0); + unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); + unsigned i = 0; + // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to + // a) Avoid a RMW dependency on the full vector register, and + // b) Allow the register coalescer to fold away the copy if the + // value is already in an S or D register. + // Do not do this for UNDEF/LOAD nodes because we have better patterns + // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. + if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + (ElemSize == 32 || ElemSize == 64)) { + unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; + MachineSDNode *N = + DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, + DAG.getTargetConstant(SubIdx, dl, MVT::i32)); + Vec = SDValue(N, 0); + ++i; + } + for (; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); + } + return Vec; + } + + // Just use the default expansion. We failed to find a better alternative. + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); + + // Check for non-constant or out of range lane. + EVT VT = Op.getOperand(0).getValueType(); + ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) + return SDValue(); + + + // Insertion/extraction are legal for V128 types. 
+ if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || + VT == MVT::v8f16) + return Op; + + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + return SDValue(); + + // For V64 types, we perform insertion by expanding the value + // to a V128 type and perform the insertion on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); + + SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, + Op.getOperand(1), Op.getOperand(2)); + // Re-narrow the resultant vector. + return NarrowVector(Node, DAG); +} + +SDValue +AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); + + // Check for non-constant or out of range lane. + EVT VT = Op.getOperand(0).getValueType(); + ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) + return SDValue(); + + + // Insertion/extraction are legal for V128 types. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || + VT == MVT::v8f16) + return Op; + + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) + return SDValue(); + + // For V64 types, we perform extraction by expanding the value + // to a V128 type and perform the extraction on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); + + EVT ExtrTy = WideTy.getVectorElementType(); + if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) + ExtrTy = MVT::i32; + + // For extractions, we just return the result directly. + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, + Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + // Just in case... + if (!VT.isVector()) + return SDValue(); + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Cst) + return SDValue(); + unsigned Val = Cst->getZExtValue(); + + unsigned Size = Op.getValueType().getSizeInBits(); + + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. + if (Val == 0) + return Op; + + // If this is extracting the upper 64-bits of a 128-bit vector, we match + // that directly. + if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) + return Op; + + return SDValue(); +} + +bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. 
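// [Editor's note: illustrative sketch only, not part of the imported LLVM
//  source.]  Each mask entry becomes a base-9 digit (0-7 for a real lane, 8
//  for the undef case handled above), so e.g. the mask <0, 0, 1, 1> yields
//  the table index 0*729 + 0*81 + 1*9 + 1 = 10, and the top two bits of the
//  corresponding PerfectShuffleTable entry hold the cost compared against 4
//  below.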
+ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return true; + } + + bool DummyBool; + int DummyInt; + unsigned DummyUnsigned; + + return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || + isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || + isEXTMask(M, VT, DummyBool, DummyUnsigned) || + // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. + isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || + isZIPMask(M, VT, DummyUnsigned) || + isTRN_v_undef_Mask(M, VT, DummyUnsigned) || + isUZP_v_undef_Mask(M, VT, DummyUnsigned) || + isZIP_v_undef_Mask(M, VT, DummyUnsigned) || + isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || + isConcatMask(M, VT, VT.getSizeInBits() == 128)); +} + +/// getVShiftImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits for a right shift; or +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 1 && Cnt <= (isNarrow ? 
ElementBits / 2 : ElementBits);
+}
+
+SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  int64_t Cnt;
+
+  if (!Op.getOperand(1).getValueType().isVector())
+    return Op;
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
+      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
+                         DAG.getConstant(Cnt, DL, MVT::i32));
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
+                                       MVT::i32),
+                       Op.getOperand(0), Op.getOperand(1));
+  case ISD::SRA:
+  case ISD::SRL:
+    // Right shift immediate
+    if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
+      unsigned Opc =
+          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
+      return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
+                         DAG.getConstant(Cnt, DL, MVT::i32));
+    }
+
+    // Right shift register.  Note, there is not a shift right register
+    // instruction, but the shift left register instruction takes a signed
+    // value, where negative numbers specify a right shift.
+    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
+                                                : Intrinsic::aarch64_neon_ushl;
+    // negate the shift amount
+    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
+    SDValue NegShiftLeft =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
+                    NegShift);
+    return NegShiftLeft;
+  }
+
+  return SDValue();
+}
+
+static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                    SDLoc dl, SelectionDAG &DAG) {
+  EVT SrcVT = LHS.getValueType();
+  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+         "function only supposed to emit natural comparisons");
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
+  bool IsZero = IsCnst && (CnstBits == 0);
+
+  if (SrcVT.getVectorElementType().isFloatingPoint()) {
+    switch (CC) {
+    default:
+      return SDValue();
+    case AArch64CC::NE: {
+      SDValue Fcmeq;
+      if (IsZero)
+        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+      else
+        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
+    }
+    case AArch64CC::EQ:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+    case AArch64CC::GE:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
+    case AArch64CC::GT:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
+    case AArch64CC::LS:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
+    case AArch64CC::LT:
+      if (!NoNans)
+        return SDValue();
+      // If we ignore NaNs then we can use the MI implementation.
+      // Fallthrough.
+ case AArch64CC::MI: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); + } + } + + switch (CC) { + default: + return SDValue(); + case AArch64CC::NE: { + SDValue Cmeq; + if (IsZero) + Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + else + Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); + } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); + case AArch64CC::LE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); + case AArch64CC::LS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); + case AArch64CC::LO: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); + case AArch64CC::HI: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); + case AArch64CC::HS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); + } +} + +SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); + SDLoc dl(Op); + + if (LHS.getValueType().getVectorElementType().isInteger()) { + assert(LHS.getValueType() == RHS.getValueType()); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + SDValue Cmp = + EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); + return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); + } + + assert(LHS.getValueType().getVectorElementType() == MVT::f32 || + LHS.getValueType().getVectorElementType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. + AArch64CC::CondCode CC1, CC2; + bool ShouldInvert; + changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); + + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; + SDValue Cmp = + EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); + if (!Cmp.getNode()) + return SDValue(); + + if (CC2 != AArch64CC::AL) { + SDValue Cmp2 = + EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); + if (!Cmp2.getNode()) + return SDValue(); + + Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); + } + + Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); + + if (ShouldInvert) + return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); + + return Cmp; +} + +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. 
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + auto &DL = I.getModule()->getDataLayout(); + switch (Intrinsic) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxr: + case Intrinsic::aarch64_ldxr: { + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxr: + case Intrinsic::aarch64_stxr: { + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = false; + Info.writeMem = 
true; + return true; + } + default: + break; + } + + return false; +} + +// Truncations from 64-bit GPR to 32-bit GPR is free. +bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + return NumBits1 > NumBits2; +} +bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + return NumBits1 > NumBits2; +} + +/// Check if it is profitable to hoist instruction in then/else to if. +/// Not profitable if I and it's user can form a FMA instruction +/// because we prefer FMSUB/FMADD. +bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { + if (I->getOpcode() != Instruction::FMul) + return true; + + if (I->getNumUses() != 1) + return true; + + Instruction *User = I->user_back(); + + if (User && + !(User->getOpcode() == Instruction::FSub || + User->getOpcode() == Instruction::FAdd)) + return true; + + const TargetOptions &Options = getTargetMachine().Options; + const DataLayout &DL = I->getModule()->getDataLayout(); + EVT VT = getValueType(DL, User->getOperand(0)->getType()); + + if (isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) + return false; + + return true; +} + +// All 32-bit GPR operations implicitly zero the high-half of the corresponding +// 64-bit GPR. +bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + return NumBits1 == 32 && NumBits2 == 64; +} +bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { + if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + return NumBits1 == 32 && NumBits2 == 64; +} + +bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + EVT VT1 = Val.getValueType(); + if (isZExtFree(VT1, VT2)) { + return true; + } + + if (Val.getOpcode() != ISD::LOAD) + return false; + + // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. + return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && + VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && + VT1.getSizeInBits() <= 32); +} + +bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { + if (isa<FPExtInst>(Ext)) + return false; + + // Vector types are next free. + if (Ext->getType()->isVectorTy()) + return false; + + for (const Use &U : Ext->uses()) { + // The extension is free if we can fold it with a left shift in an + // addressing mode or an arithmetic operation: add, sub, and cmp. + + // Is there a shift? + const Instruction *Instr = cast<Instruction>(U.getUser()); + + // Is this a constant shift? 
+ switch (Instr->getOpcode()) { + case Instruction::Shl: + if (!isa<ConstantInt>(Instr->getOperand(1))) + return false; + break; + case Instruction::GetElementPtr: { + gep_type_iterator GTI = gep_type_begin(Instr); + auto &DL = Ext->getModule()->getDataLayout(); + std::advance(GTI, U.getOperandNo()); + Type *IdxTy = *GTI; + // This extension will end up with a shift because of the scaling factor. + // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. + // Get the shift amount based on the scaling factor: + // log2(sizeof(IdxTy)) - log2(8). + uint64_t ShiftAmt = + countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; + // Is the constant foldable in the shift of the addressing mode? + // I.e., shift amount is between 1 and 4 inclusive. + if (ShiftAmt == 0 || ShiftAmt > 4) + return false; + break; + } + case Instruction::Trunc: + // Check if this is a noop. + // trunc(sext ty1 to ty2) to ty1. + if (Instr->getType() == Ext->getOperand(0)->getType()) + continue; + // FALL THROUGH. + default: + return false; + } + + // At this point we can use the bfm family, so this extension is free + // for that use. + } + return true; +} + +bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType.isSimple() || + (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType.getSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +/// \brief Lower an interleaved load into a ldN intrinsic. +/// +/// E.g. Lower an interleaved load (Factor = 2): +/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr +/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements +/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements +/// +/// Into: +/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) +/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 +/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 +bool AArch64TargetLowering::lowerInterleavedLoad( + LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + ArrayRef<unsigned> Indices, unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + assert(!Shuffles.empty() && "Empty shufflevector input"); + assert(Shuffles.size() == Indices.size() && + "Unmatched number of shufflevectors and indices"); + + const DataLayout &DL = LI->getModule()->getDataLayout(); + + VectorType *VecTy = Shuffles[0]->getType(); + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) + return false; + + // A pointer vector can not be the return type of the ldN intrinsics. Need to + // load integer vectors first and then convert to pointer vectors. 
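+  // For example (illustrative, assuming 64-bit pointers): a <2 x i8*>
+  // result would be loaded below as <2 x i64> and converted back to a
+  // pointer vector with inttoptr after the ldN call.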
+ Type *EltTy = VecTy->getVectorElementType(); + if (EltTy->isPointerTy()) + VecTy = + VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + + Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); + Type *Tys[2] = {VecTy, PtrTy}; + static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, + Intrinsic::aarch64_neon_ld3, + Intrinsic::aarch64_neon_ld4}; + Function *LdNFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + + IRBuilder<> Builder(LI); + Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy); + + CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN"); + + // Replace uses of each shufflevector with the corresponding vector loaded + // by ldN. + for (unsigned i = 0; i < Shuffles.size(); i++) { + ShuffleVectorInst *SVI = Shuffles[i]; + unsigned Index = Indices[i]; + + Value *SubVec = Builder.CreateExtractValue(LdN, Index); + + // Convert the integer vector to pointer vector if the element is pointer. + if (EltTy->isPointerTy()) + SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType()); + + SVI->replaceAllUsesWith(SubVec); + } + + return true; +} + +/// \brief Get a mask consisting of sequential integers starting from \p Start. +/// +/// I.e. <Start, Start + 1, ..., Start + NumElts - 1> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start, + unsigned NumElts) { + SmallVector<Constant *, 16> Mask; + for (unsigned i = 0; i < NumElts; i++) + Mask.push_back(Builder.getInt32(Start + i)); + + return ConstantVector::get(Mask); +} + +/// \brief Lower an interleaved store into a stN intrinsic. +/// +/// E.g. Lower an interleaved store (Factor = 3): +/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, +/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> +/// store <12 x i32> %i.vec, <12 x i32>* %ptr +/// +/// Into: +/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> +/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> +/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> +/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) +/// +/// Note that the new shufflevectors will be removed and we'll only generate one +/// st3 instruction in CodeGen. +bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, + ShuffleVectorInst *SVI, + unsigned Factor) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); + + VectorType *VecTy = SVI->getType(); + assert(VecTy->getVectorNumElements() % Factor == 0 && + "Invalid interleaved store"); + + unsigned NumSubElts = VecTy->getVectorNumElements() / Factor; + Type *EltTy = VecTy->getVectorElementType(); + VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); + + const DataLayout &DL = SI->getModule()->getDataLayout(); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) + return false; + + Value *Op0 = SVI->getOperand(0); + Value *Op1 = SVI->getOperand(1); + IRBuilder<> Builder(SI); + + // StN intrinsics don't support pointer vectors as arguments. Convert pointer + // vectors to integer vectors. + if (EltTy->isPointerTy()) { + Type *IntTy = DL.getIntPtrType(EltTy); + unsigned NumOpElts = + dyn_cast<VectorType>(Op0->getType())->getVectorNumElements(); + + // Convert to the corresponding integer vector. 
+ Type *IntVecTy = VectorType::get(IntTy, NumOpElts); + Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); + Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); + + SubVecTy = VectorType::get(IntTy, NumSubElts); + } + + Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); + Type *Tys[2] = {SubVecTy, PtrTy}; + static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, + Intrinsic::aarch64_neon_st3, + Intrinsic::aarch64_neon_st4}; + Function *StNFunc = + Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); + + SmallVector<Value *, 5> Ops; + + // Split the shufflevector operands into sub vectors for the new stN call. + for (unsigned i = 0; i < Factor; i++) + Ops.push_back(Builder.CreateShuffleVector( + Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts))); + + Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy)); + Builder.CreateCall(StNFunc, Ops); + return true; +} + +static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, + unsigned AlignCheck) { + return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && + (DstAlign == 0 || DstAlign % AlignCheck == 0)); +} + +EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // Don't use AdvSIMD to implement 16-byte memset. It would have taken one + // instruction to materialize the v2i64 zero and one store (with restrictive + // addressing mode). Just do two i64 store of zero-registers. + bool Fast; + const Function *F = MF.getFunction(); + if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && + !F->hasFnAttribute(Attribute::NoImplicitFloat) && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) + return MVT::f128; + + if (Size >= 8 && + (memOpAlign(SrcAlign, DstAlign, 8) || + (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast))) + return MVT::i64; + + if (Size >= 4 && + (memOpAlign(SrcAlign, DstAlign, 4) || + (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast))) + return MVT::i32; + + return MVT::Other; +} + +// 12-bit optionally shifted immediates are legal for adds. +bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { + if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) + return true; + return false; +} + +// Integer comparisons are implemented with ADDS/SUBS, so the range of valid +// immediates is the same as for an add or a sub. +bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { + if (Immed < 0) + Immed *= -1; + return isLegalAddImmediate(Immed); +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + // AArch64 has five basic addressing modes: + // reg + // reg + 9-bit signed offset + // reg + SIZE_IN_BYTES * 12-bit unsigned offset + // reg1 + reg2 + // reg + SIZE_IN_BYTES * reg + + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + // No reg+reg+imm addressing. 
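+  // (Illustrative: "ldr x0, [x1, #24]" and "ldr x0, [x1, x2, lsl #3]" are
+  // representable, but a hypothetical "[x1, x2, #8]" form is not.)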
+ if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) + return false; + + // check reg + imm case: + // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 + uint64_t NumBytes = 0; + if (Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (!AM.Scale) { + int64_t Offset = AM.BaseOffs; + + // 9-bit signed offset + if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) + return true; + + // 12-bit unsigned offset + unsigned shift = Log2_64(NumBytes); + if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && + // Must be a multiple of NumBytes (NumBytes is a power of 2) + (Offset >> shift) << shift == Offset) + return true; + return false; + } + + // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 + + if (!AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) + return true; + return false; +} + +int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + // Scaling factors are not free at all. + // Operands | Rt Latency + // ------------------------------------------- + // Rt, [Xn, Xm] | 4 + // ------------------------------------------- + // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 + // Rt, [Xn, Wm, <extend> #imm] | + if (isLegalAddressingMode(DL, AM, Ty, AS)) + // Scale represents reg2 * scale, thus account for 1 if + // it is not equal to 0 or 1. + return AM.Scale != 0 && AM.Scale != 1; + return -1; +} + +bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +const MCPhysReg * +AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { + // LR is a callee-save register, but we must treat it as clobbered by any call + // site. Hence we include LR in the scratch registers, which are in turn added + // as implicit-defs for stackmaps and patchpoints. + static const MCPhysReg ScratchRegs[] = { + AArch64::X16, AArch64::X17, AArch64::LR, 0 + }; + return ScratchRegs; +} + +bool +AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { + EVT VT = N->getValueType(0); + // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine + // it with shift to let it be lowered to UBFX. + if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && + isa<ConstantSDNode>(N->getOperand(1))) { + uint64_t TruncMask = N->getConstantOperandVal(1); + if (isMask_64(TruncMask) && + N->getOperand(0).getOpcode() == ISD::SRL && + isa<ConstantSDNode>(N->getOperand(0)->getOperand(1))) + return false; + } + return true; +} + +bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return false; + + int64_t Val = Imm.getSExtValue(); + if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) + return true; + + if ((int64_t)Val < 0) + Val = ~Val; + if (BitSize == 32) + Val &= (1LL << 32) - 1; + + unsigned LZ = countLeadingZeros((uint64_t)Val); + unsigned Shift = (63 - LZ) / 16; + // MOVZ is free so return true for one or fewer MOVK. + return Shift < 3; +} + +// Generate SUBS and CSEL for integer abs. 
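+//
+// Illustrative i32 IR pattern being matched:
+//   %sra = ashr i32 %x, 31
+//   %add = add i32 %x, %sra
+//   %abs = xor i32 %add, %sra
+// which is rewritten as a compare of %x against zero (SUBS) feeding a
+// conditional select (CSEL) of %x or 0-%x.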
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CSEL. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + N0.getOperand(0)); + // Generate SUBS & CSEL. + SDValue Cmp = + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), + N0.getOperand(0), DAG.getConstant(0, DL, VT)); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, + DAG.getConstant(AArch64CC::PL, DL, MVT::i32), + SDValue(Cmp.getNode(), 1)); + } + return SDValue(); +} + +// performXorCombine - Attempts to handle integer ABS. +static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + return performIntegerAbsCombine(N, DAG); +} + +SDValue +AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + std::vector<SDNode *> *Created) const { + // fold (sdiv X, pow2) + EVT VT = N->getValueType(0); + if ((VT != MVT::i32 && VT != MVT::i64) || + !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + unsigned Lg2 = Divisor.countTrailingZeros(); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); + + // Add (N0 < 0) ? Pow2 - 1 : 0; + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); + SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); + + if (Created) { + Created->push_back(Cmp.getNode()); + Created->push_back(Add.getNode()); + Created->push_back(CSel.getNode()); + } + + // Divide by pow2. + SDValue SRA = + DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (Divisor.isNonNegative()) + return SRA; + + if (Created) + Created->push_back(SRA.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); +} + +static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Multiplication of a power of two plus/minus one can be done more + // cheaply as as shift+add/sub. For now, this is true unilaterally. If + // future CPUs have a cheaper MADD instruction, this may need to be + // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and + // 64-bit is 5 cycles, so this is always a win. 
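+  // Illustrative rewrites performed below:
+  //   (mul x, 9)  -> (add (shl x, 3), x)
+  //   (mul x, 7)  -> (sub (shl x, 3), x)
+  //   (mul x, -7) -> (sub x, (shl x, 3))
+  //   (mul x, -9) -> (sub 0, (add (shl x, 3), x))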
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + APInt Value = C->getAPIntValue(); + EVT VT = N->getValueType(0); + SDLoc DL(N); + if (Value.isNonNegative()) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + APInt VM1 = Value - 1; + if (VM1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(VM1.logBase2(), DL, MVT::i64)); + return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, + N->getOperand(0)); + } + // (mul x, 2^N - 1) => (sub (shl x, N), x) + APInt VP1 = Value + 1; + if (VP1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(VP1.logBase2(), DL, MVT::i64)); + return DAG.getNode(ISD::SUB, DL, VT, ShiftedVal, + N->getOperand(0)); + } + } else { + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + APInt VNP1 = -Value + 1; + if (VNP1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(VNP1.logBase2(), DL, MVT::i64)); + return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), + ShiftedVal); + } + // (mul x, -(2^N + 1)) => - (add (shl x, N), x) + APInt VNM1 = -Value - 1; + if (VNM1.isPowerOf2()) { + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(VNM1.logBase2(), DL, MVT::i64)); + SDValue Add = + DAG.getNode(ISD::ADD, DL, VT, ShiftedVal, N->getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Add); + } + } + } + return SDValue(); +} + +static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, + SelectionDAG &DAG) { + // Take advantage of vector comparisons producing 0 or -1 in each lane to + // optimize away operation when it's from a constant. + // + // The general transformation is: + // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> + // AND(VECTOR_CMP(x,y), constant2) + // constant2 = UNARYOP(constant) + + // Early exit if this isn't a vector operation, the operand of the + // unary operation isn't a bitwise AND, or if the sizes of the operations + // aren't the same. + EVT VT = N->getValueType(0); + if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || + N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || + VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) + return SDValue(); + + // Now check that the other operand of the AND is a constant. We could + // make the transformation for non-constant splats as well, but it's unclear + // that would be a benefit as it would not eliminate any operations, just + // perform one more step in scalar code before moving to the vector unit. + if (BuildVectorSDNode *BV = + dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { + // Bail out if the vector isn't a constant. + if (!BV->isConstant()) + return SDValue(); + + // Everything checks out. Build up the new and improved node. + SDLoc DL(N); + EVT IntVT = BV->getValueType(0); + // Create a new constant of the appropriate type for the transformed + // DAG. + SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); + // The AND node needs bitcasts to/from an integer vector type around it. 
+ SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, + N->getOperand(0)->getOperand(0), MaskConst); + SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); + return Res; + } + + return SDValue(); +} + +static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + // First try to optimize away the conversion when it's conditionally from + // a constant. Vectors only. + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) + return Res; + + EVT VT = N->getValueType(0); + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Only optimize when the source and destination types have the same width. + if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) + return SDValue(); + + // If the result of an integer load is only used by an integer-to-float + // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. + // This eliminates an "integer-to-vector-move" UOP and improves throughput. + SDValue N0 = N->getOperand(0); + if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + // Do not change the width of a volatile load. + !cast<LoadSDNode>(N0)->isVolatile()) { + LoadSDNode *LN0 = cast<LoadSDNode>(N0); + SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), + LN0->getPointerInfo(), LN0->isVolatile(), + LN0->isNonTemporal(), LN0->isInvariant(), + LN0->getAlignment()); + + // Make sure successors of the original load stay after it by updating them + // to use the new Chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); + + unsigned Opcode = + (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; + return DAG.getNode(Opcode, SDLoc(N), VT, Load); + } + + return SDValue(); +} + +/// Fold a floating-point multiply by power of two into floating-point to +/// fixed-point conversion. +static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + return SDValue(); + + SDValue ConstVec = Op->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., float -> i64). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t Bits = IntBits == 64 ? 64 : 32; + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); + if (C == -1 || C == 0 || C > Bits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + unsigned IntrinsicOpcode = IsSigned ? 
Intrinsic::aarch64_neon_vcvtfp2fxs + : Intrinsic::aarch64_neon_vcvtfp2fxu; + SDValue FixConv = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); + // We can handle smaller integers by generating an extra trunc. + if (IntBits < FloatBits) + FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); + + return FixConv; +} + +/// Fold a floating-point divide by power of two into fixed-point to +/// floating-point conversion. +static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + unsigned Opc = Op->getOpcode(); + if (!Op.getValueType().isVector() || + (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) + return SDValue(); + + SDValue ConstVec = N->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); + int32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + int32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., i64 -> float). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); + if (C == -1 || C == 0 || C > FloatBits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + SDValue ConvInput = Op.getOperand(0); + bool IsSigned = Opc == ISD::SINT_TO_FP; + if (IntBits < FloatBits) + ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + ResTy, ConvInput); + + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp + : Intrinsic::aarch64_neon_vcvtfxu2fp; + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, + DAG.getConstant(C, DL, MVT::i32)); +} + +/// An EXTR instruction is made up of two shifts, ORed together. This helper +/// searches for and classifies those shifts. +static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, + bool &FromHi) { + if (N.getOpcode() == ISD::SHL) + FromHi = false; + else if (N.getOpcode() == ISD::SRL) + FromHi = true; + else + return false; + + if (!isa<ConstantSDNode>(N.getOperand(1))) + return false; + + ShiftAmount = N->getConstantOperandVal(1); + Src = N->getOperand(0); + return true; +} + +/// EXTR instruction extracts a contiguous chunk of bits from two existing +/// registers viewed as a high/low pair. This function looks for the pattern: +/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an +/// EXTR. Can't quite be done in TableGen because the two immediates aren't +/// independent. 
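+///
+/// Illustrative i64 example:
+///   (or (shl x, #16), (srl y, #48)) --> (AArch64ISD::EXTR x, y, #48),
+/// i.e. an EXTR of the x:y register pair starting at bit 48 of y.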
+static SDValue tryCombineToEXTR(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + EVT VT = N->getValueType(0); + + assert(N->getOpcode() == ISD::OR && "Unexpected root"); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue LHS; + uint32_t ShiftLHS = 0; + bool LHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) + return SDValue(); + + SDValue RHS; + uint32_t ShiftRHS = 0; + bool RHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) + return SDValue(); + + // If they're both trying to come from the high part of the register, they're + // not really an EXTR. + if (LHSFromHi == RHSFromHi) + return SDValue(); + + if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) + return SDValue(); + + if (LHSFromHi) { + std::swap(LHS, RHS); + std::swap(ShiftLHS, ShiftRHS); + } + + return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, + DAG.getConstant(ShiftRHS, DL, MVT::i64)); +} + +static SDValue tryCombineToBSL(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getVectorElementType().getSizeInBits(); + uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + +static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) + if (!EnableAArch64ExtrGeneration) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDValue Res = tryCombineToEXTR(N, DCI); + if (Res.getNode()) + return Res; + + Res = tryCombineToBSL(N, DCI); + if (Res.getNode()) + return Res; + + return SDValue(); +} + +static SDValue performBitcastCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Remove extraneous bitcasts around an extract_subvector. + // For example, + // (v4i16 (bitconvert + // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) + // becomes + // (extract_subvector ((v8i16 ...), (i64 4))) + + // Only interested in 64-bit vectors as the ultimate result. 
+ EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + if (VT.getSimpleVT().getSizeInBits() != 64) + return SDValue(); + // Is the operand an extract_subvector starting at the beginning or halfway + // point of the vector? A low half may also come through as an + // EXTRACT_SUBREG, so look for that, too. + SDValue Op0 = N->getOperand(0); + if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && + !(Op0->isMachineOpcode() && + Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) + return SDValue(); + uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue(); + if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { + if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) + return SDValue(); + } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { + if (idx != AArch64::dsub) + return SDValue(); + // The dsub reference is equivalent to a lane zero subvector reference. + idx = 0; + } + // Look through the bitcast of the input to the extract. + if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue Source = Op0->getOperand(0)->getOperand(0); + // If the source type has twice the number of elements as our destination + // type, we know this is an extract of the high or low half of the vector. + EVT SVT = Source->getValueType(0); + if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); + + // Create the simplified form to just extract the low or high half of the + // vector directly rather than bothering with the bitcasts. + SDLoc dl(N); + unsigned NumElements = VT.getVectorNumElements(); + if (idx) { + SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); + } else { + SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); + return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, + Source, SubReg), + 0); + } +} + +static SDValue performConcatVectorsCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); + + // Optimize concat_vectors of truncated vectors, where the intermediate + // type is illegal, to avoid said illegality, e.g., + // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), + // (v2i16 (truncate (v2i64))))) + // -> + // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), + // (v4i32 (bitcast (v2i64))), + // <0, 2, 4, 6>))) + // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed + // on both input and result type, so we might generate worse code. + // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. + if (N->getNumOperands() == 2 && + N0->getOpcode() == ISD::TRUNCATE && + N1->getOpcode() == ISD::TRUNCATE) { + SDValue N00 = N0->getOperand(0); + SDValue N10 = N1->getOperand(0); + EVT N00VT = N00.getValueType(); + + if (N00VT == N10.getValueType() && + (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && + N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { + MVT MidVT = (N00VT == MVT::v2i64 ? 
MVT::v4i32 : MVT::v8i16); + SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); + for (size_t i = 0; i < Mask.size(); ++i) + Mask[i] = i * 2; + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getVectorShuffle( + MidVT, dl, + DAG.getNode(ISD::BITCAST, dl, MidVT, N00), + DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); + } + } + + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector + // splat. The indexed instructions are going to be expecting a DUPLANE64, so + // canonicalise to that. + if (N0 == N1 && VT.getVectorNumElements() == 2) { + assert(VT.getVectorElementType().getSizeInBits() == 64); + return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), + DAG.getConstant(0, dl, MVT::i64)); + } + + // Canonicalise concat_vectors so that the right-hand vector has as few + // bit-casts as possible before its real operation. The primary matching + // destination for these operations will be the narrowing "2" instructions, + // which depend on the operation being performed on this right-hand vector. + // For example, + // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) + // becomes + // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) + + if (N1->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue RHS = N1->getOperand(0); + MVT RHSTy = RHS.getValueType().getSimpleVT(); + // If the RHS is not a vector, this is not the pattern we're looking for. + if (!RHSTy.isVector()) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); + + MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), + RHSTy.getVectorNumElements() * 2); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, + DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), + RHS)); +} + +static SDValue tryCombineFixedPointConvert(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + // Transform a scalar conversion of a value from a lane extract into a + // lane extract of a vector conversion. E.g., from foo1 to foo2: + // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } + // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } + // + // The second form interacts better with instruction selection and the + // register allocator to avoid cross-class register copies that aren't + // coalescable due to a lane reference. + + // Check the operand and see if it originates from a lane extract. + SDValue Op1 = N->getOperand(1); + if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // Yep, no additional predication needed. Perform the transform. + SDValue IID = N->getOperand(0); + SDValue Shift = N->getOperand(2); + SDValue Vec = Op1.getOperand(0); + SDValue Lane = Op1.getOperand(1); + EVT ResTy = N->getValueType(0); + EVT VecResTy; + SDLoc DL(N); + + // The vector width should be 128 bits by the time we get here, even + // if it started as 64 bits (the extract_vector handling will have + // done so). 
+ assert(Vec.getValueType().getSizeInBits() == 128 && + "unexpected vector size on extract_vector_elt!"); + if (Vec.getValueType() == MVT::v4i32) + VecResTy = MVT::v4f32; + else if (Vec.getValueType() == MVT::v2i64) + VecResTy = MVT::v2f64; + else + llvm_unreachable("unexpected vector type!"); + + SDValue Convert = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); + } + return SDValue(); +} + +// AArch64 high-vector "long" operations are formed by performing the non-high +// version on an extract_subvector of each operand which gets the high half: +// +// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) +// +// However, there are cases which don't have an extract_high explicitly, but +// have another operation that can be made compatible with one for free. For +// example: +// +// (dupv64 scalar) --> (extract_high (dup128 scalar)) +// +// This routine does the actual conversion of such DUPs, once outer routines +// have determined that everything else is in order. +// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold +// similarly here. +static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { + switch (N.getOpcode()) { + case AArch64ISD::DUP: + case AArch64ISD::DUPLANE8: + case AArch64ISD::DUPLANE16: + case AArch64ISD::DUPLANE32: + case AArch64ISD::DUPLANE64: + case AArch64ISD::MOVI: + case AArch64ISD::MOVIshift: + case AArch64ISD::MOVIedit: + case AArch64ISD::MOVImsl: + case AArch64ISD::MVNIshift: + case AArch64ISD::MVNImsl: + break; + default: + // FMOV could be supported, but isn't very useful, as it would only occur + // if you passed a bitcast' floating point immediate to an eligible long + // integer op (addl, smull, ...). + return SDValue(); + } + + MVT NarrowTy = N.getSimpleValueType(); + if (!NarrowTy.is64BitVector()) + return SDValue(); + + MVT ElementTy = NarrowTy.getVectorElementType(); + unsigned NumElems = NarrowTy.getVectorNumElements(); + MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); + + SDLoc dl(N); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, + DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), + DAG.getConstant(NumElems, dl, MVT::i64)); +} + +static bool isEssentiallyExtractSubvector(SDValue N) { + if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) + return true; + + return N.getOpcode() == ISD::BITCAST && + N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; +} + +/// \brief Helper structure to keep track of ISD::SET_CC operands. +struct GenericSetCCInfo { + const SDValue *Opnd0; + const SDValue *Opnd1; + ISD::CondCode CC; +}; + +/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. +struct AArch64SetCCInfo { + const SDValue *Cmp; + AArch64CC::CondCode CC; +}; + +/// \brief Helper structure to keep track of SetCC information. +union SetCCInfo { + GenericSetCCInfo Generic; + AArch64SetCCInfo AArch64; +}; + +/// \brief Helper structure to be able to read SetCC information. If set to +/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a +/// GenericSetCCInfo. +struct SetCCInfoAndKind { + SetCCInfo Info; + bool IsAArch64; +}; + +/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or +/// an +/// AArch64 lowered one. +/// \p SetCCInfo is filled accordingly. +/// \post SetCCInfo is meanginfull only when this function returns true. +/// \return True when Op is a kind of SET_CC operation. 
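+///
+/// Illustrative forms recognized here: a generic (setcc x, y, cc), or a
+/// boolean that has already been lowered as (AArch64ISD::CSEL 1, 0, cc, cmp)
+/// (equivalently (AArch64ISD::CSEL 0, 1, !cc, cmp)).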
+static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { + // If this is a setcc, this is straight forward. + if (Op.getOpcode() == ISD::SETCC) { + SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); + SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); + SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + SetCCInfo.IsAArch64 = false; + return true; + } + // Otherwise, check if this is a matching csel instruction. + // In other words: + // - csel 1, 0, cc + // - csel 0, 1, !cc + if (Op.getOpcode() != AArch64ISD::CSEL) + return false; + // Set the information about the operands. + // TODO: we want the operands of the Cmp not the csel + SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); + SetCCInfo.IsAArch64 = true; + SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>( + cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); + + // Check that the operands matches the constraints: + // (1) Both operands must be constants. + // (2) One must be 1 and the other must be 0. + ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0)); + ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + + // Check (1). + if (!TValue || !FValue) + return false; + + // Check (2). + if (!TValue->isOne()) { + // Update the comparison when we are interested in !cc. + std::swap(TValue, FValue); + SetCCInfo.Info.AArch64.CC = + AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); + } + return TValue->isOne() && FValue->isNullValue(); +} + +// Returns true if Op is setcc or zext of setcc. +static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { + if (isSetCC(Op, Info)) + return true; + return ((Op.getOpcode() == ISD::ZERO_EXTEND) && + isSetCC(Op->getOperand(0), Info)); +} + +// The folding we want to perform is: +// (add x, [zext] (setcc cc ...) ) +// --> +// (csel x, (add x, 1), !cc ...) +// +// The latter will get matched to a CSINC instruction. +static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { + assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); + SDValue LHS = Op->getOperand(0); + SDValue RHS = Op->getOperand(1); + SetCCInfoAndKind InfoAndKind; + + // If neither operand is a SET_CC, give up. + if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { + std::swap(LHS, RHS); + if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) + return SDValue(); + } + + // FIXME: This could be generatized to work for FP comparisons. + EVT CmpVT = InfoAndKind.IsAArch64 + ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() + : InfoAndKind.Info.Generic.Opnd0->getValueType(); + if (CmpVT != MVT::i32 && CmpVT != MVT::i64) + return SDValue(); + + SDValue CCVal; + SDValue Cmp; + SDLoc dl(Op); + if (InfoAndKind.IsAArch64) { + CCVal = DAG.getConstant( + AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, + MVT::i32); + Cmp = *InfoAndKind.Info.AArch64.Cmp; + } else + Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, + *InfoAndKind.Info.Generic.Opnd1, + ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), + CCVal, DAG, dl); + + EVT VT = Op->getValueType(0); + LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); +} + +// The basic add/sub long vector instructions have variants with "2" on the end +// which act on the high-half of their inputs. 
They are normally matched by +// patterns like: +// +// (add (zeroext (extract_high LHS)), +// (zeroext (extract_high RHS))) +// -> uaddl2 vD, vN, vM +// +// However, if one of the extracts is something like a duplicate, this +// instruction can still be used profitably. This function puts the DAG into a +// more appropriate form for those patterns to trigger. +static SDValue performAddSubLongCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); + if (!VT.is128BitVector()) { + if (N->getOpcode() == ISD::ADD) + return performSetccAddFolding(N, DAG); + return SDValue(); + } + + // Make sure both branches are extended in the same way. + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if ((LHS.getOpcode() != ISD::ZERO_EXTEND && + LHS.getOpcode() != ISD::SIGN_EXTEND) || + LHS.getOpcode() != RHS.getOpcode()) + return SDValue(); + + unsigned ExtType = LHS.getOpcode(); + + // It's not worth doing if at least one of the inputs isn't already an + // extract, but we don't know which it'll be so we have to try both. + if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { + RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); + if (!RHS.getNode()) + return SDValue(); + + RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); + } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { + LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); + if (!LHS.getNode()) + return SDValue(); + + LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); + } + + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); +} + +// Massage DAGs which we can use the high-half "long" operations on into +// something isel will recognize better. E.g. +// +// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> +// (aarch64_neon_umull (extract_high (v2i64 vec))) +// (extract_high (v2i64 (dup128 scalar))))) +// +static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + assert(LHS.getValueType().is64BitVector() && + RHS.getValueType().is64BitVector() && + "unexpected shape for long operation"); + + // Either node could be a DUP, but it's not worth doing both of them (you'd + // just as well use the non-high version) so look for a corresponding extract + // operation on the other "wing". 
+ if (isEssentiallyExtractSubvector(LHS)) { + RHS = tryExtendDUPToExtractHigh(RHS, DAG); + if (!RHS.getNode()) + return SDValue(); + } else if (isEssentiallyExtractSubvector(RHS)) { + LHS = tryExtendDUPToExtractHigh(LHS, DAG); + if (!LHS.getNode()) + return SDValue(); + } + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), + N->getOperand(0), LHS, RHS); +} + +static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { + MVT ElemTy = N->getSimpleValueType(0).getScalarType(); + unsigned ElemBits = ElemTy.getSizeInBits(); + + int64_t ShiftAmount; + if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs, ElemBits) || + SplatBitSize != ElemBits) + return SDValue(); + + ShiftAmount = SplatValue.getSExtValue(); + } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { + ShiftAmount = CVN->getSExtValue(); + } else + return SDValue(); + + unsigned Opcode; + bool IsRightShift; + switch (IID) { + default: + llvm_unreachable("Unknown shift intrinsic"); + case Intrinsic::aarch64_neon_sqshl: + Opcode = AArch64ISD::SQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_uqshl: + Opcode = AArch64ISD::UQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_srshl: + Opcode = AArch64ISD::SRSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_urshl: + Opcode = AArch64ISD::URSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_sqshlu: + Opcode = AArch64ISD::SQSHLU_I; + IsRightShift = false; + break; + } + + if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { + SDLoc dl(N); + return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), + DAG.getConstant(-ShiftAmount, dl, MVT::i32)); + } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { + SDLoc dl(N); + return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), + DAG.getConstant(ShiftAmount, dl, MVT::i32)); + } + + return SDValue(); +} + +// The CRC32[BH] instructions ignore the high bits of their data operand. Since +// the intrinsics must be legal and take an i32, this means there's almost +// certainly going to be a zext in the DAG which we can eliminate. 
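+//
+// Illustrative example: (int_aarch64_crc32b %crc, (and %data, 255)) is
+// rebuilt as (int_aarch64_crc32b %crc, %data), since CRC32B only reads the
+// low 8 bits of the data operand anyway.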
+static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { + SDValue AndN = N->getOperand(2); + if (AndN.getOpcode() != ISD::AND) + return SDValue(); + + ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); + if (!CMask || CMask->getZExtValue() != Mask) + return SDValue(); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, + N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); +} + +static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, + SelectionDAG &DAG) { + SDLoc dl(N); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), + DAG.getNode(Opc, dl, + N->getOperand(1).getSimpleValueType(), + N->getOperand(1)), + DAG.getConstant(0, dl, MVT::i64)); +} + +static SDValue performIntrinsicCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + unsigned IID = getIntrinsicID(N); + switch (IID) { + default: + break; + case Intrinsic::aarch64_neon_vcvtfxs2fp: + case Intrinsic::aarch64_neon_vcvtfxu2fp: + return tryCombineFixedPointConvert(N, DCI, DAG); + case Intrinsic::aarch64_neon_saddv: + return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); + case Intrinsic::aarch64_neon_uaddv: + return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); + case Intrinsic::aarch64_neon_sminv: + return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); + case Intrinsic::aarch64_neon_uminv: + return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); + case Intrinsic::aarch64_neon_smaxv: + return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); + case Intrinsic::aarch64_neon_umaxv: + return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); + case Intrinsic::aarch64_neon_fmax: + return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmin: + return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmaxnm: + return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fminnm: + return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_sqdmull: + return tryCombineLongOpWithDup(IID, N, DCI, DAG); + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_urshl: + return tryCombineShiftImm(IID, N, DAG); + case Intrinsic::aarch64_crc32b: + case Intrinsic::aarch64_crc32cb: + return tryCombineCRC32(0xff, N, DAG); + case Intrinsic::aarch64_crc32h: + case Intrinsic::aarch64_crc32ch: + return tryCombineCRC32(0xffff, N, DAG); + } + return SDValue(); +} + +static SDValue performExtendCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then + // we can convert that DUP into another extract_high (of a bigger DUP), which + // helps the backend to decide that an sabdl2 would be useful, saving a real + // extract_high operation. 
+ if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + SDNode *ABDNode = N->getOperand(0).getNode(); + unsigned IID = getIntrinsicID(ABDNode); + if (IID == Intrinsic::aarch64_neon_sabd || + IID == Intrinsic::aarch64_neon_uabd) { + SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); + if (!NewABD.getNode()) + return SDValue(); + + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), + NewABD); + } + } + + // This is effectively a custom type legalization for AArch64. + // + // Type legalization will split an extend of a small, legal, type to a larger + // illegal type by first splitting the destination type, often creating + // illegal source types, which then get legalized in isel-confusing ways, + // leading to really terrible codegen. E.g., + // %result = v8i32 sext v8i8 %value + // becomes + // %losrc = extract_subreg %value, ... + // %hisrc = extract_subreg %value, ... + // %lo = v4i32 sext v4i8 %losrc + // %hi = v4i32 sext v4i8 %hisrc + // Things go rapidly downhill from there. + // + // For AArch64, the [sz]ext vector instructions can only go up one element + // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 + // take two instructions. + // + // This implies that the most efficient way to do the extend from v8i8 + // to two v4i32 values is to first extend the v8i8 to v8i16, then do + // the normal splitting to happen for the v8i16->v8i32. + + // This is pre-legalization to catch some cases where the default + // type legalization will create ill-tempered code. + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + // We're only interested in cleaning things up for non-legal vector types + // here. If both the source and destination are legal, things will just + // work naturally without any fiddling. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT ResVT = N->getValueType(0); + if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) + return SDValue(); + // If the vector type isn't a simple VT, it's beyond the scope of what + // we're worried about here. Let legalization do its thing and hope for + // the best. + SDValue Src = N->getOperand(0); + EVT SrcVT = Src->getValueType(0); + if (!ResVT.isSimple() || !SrcVT.isSimple()) + return SDValue(); + + // If the source VT is a 64-bit vector, we can play games and get the + // better results we want. + if (SrcVT.getSizeInBits() != 64) + return SDValue(); + + unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); + unsigned ElementCount = SrcVT.getVectorNumElements(); + SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); + SDLoc DL(N); + Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); + + // Now split the rest of the operation into two halves, each with a 64 + // bit source. 
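// Sketch of the sequence built below, for a v8i8 -> v8i32 sign_extend
// (a hedged illustration; %value/%mid are not real node names):
//   %mid = v8i16 sign_extend %value                      ; one sshll
//   %lo  = v4i32 sign_extend (extract_subvector %mid, 0)
//   %hi  = v4i32 sign_extend (extract_subvector %mid, 4)
//   %res = v8i32 concat_vectors %lo, %hi                 ; sshll / sshll2 of %mid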
+ EVT LoVT, HiVT; + SDValue Lo, Hi; + unsigned NumElements = ResVT.getVectorNumElements(); + assert(!(NumElements & 1) && "Splitting vector, but not in half!"); + LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), + ResVT.getVectorElementType(), NumElements / 2); + + EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), + LoVT.getVectorNumElements()); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getConstant(0, DL, MVT::i64)); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); + + // Now combine the parts back together so we still have a single result + // like the combiner expects. + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); +} + +/// Replace a splat of a scalar to a vector store by scalar stores of the scalar +/// value. The load store optimizer pass will merge them to store pair stores. +/// This has better performance than a splat of the scalar followed by a split +/// vector store. Even if the stores are not merged it is four stores vs a dup, +/// followed by an ext.b and two stores. +static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + + // Don't replace floating point stores, they possibly won't be transformed to + // stp because of the store pair suppress pass. + if (VT.isFloatingPoint()) + return SDValue(); + + // Check for insert vector elements. + if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + // We can express a splat as store pair(s) for 2 or 4 elements. + unsigned NumVecElts = VT.getVectorNumElements(); + if (NumVecElts != 4 && NumVecElts != 2) + return SDValue(); + SDValue SplatVal = StVal.getOperand(1); + unsigned RemainInsertElts = NumVecElts - 1; + + // Check that this is a splat. + while (--RemainInsertElts) { + SDValue NextInsertElt = StVal.getOperand(0); + if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + if (NextInsertElt.getOperand(1) != SplatVal) + return SDValue(); + StVal = NextInsertElt; + } + unsigned OrigAlignment = St->getAlignment(); + unsigned EltOffset = NumVecElts == 4 ? 4 : 8; + unsigned Alignment = std::min(OrigAlignment, EltOffset); + + // Create scalar stores. This is at least as good as the code sequence for a + // split unaligned store which is a dup.s, ext.b, and two stores. + // Most of the time the three stores should be replaced by store pair + // instructions (stp). 
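// Hedged sketch of the expected output for a v4i32 splat of w0 stored at x1
// (register names are purely illustrative):
//   str w0, [x1]
//   str w0, [x1, #4]    ; likely merged into stp w0, w0, [x1]
//   str w0, [x1, #8]
//   str w0, [x1, #12]   ; likely merged into stp w0, w0, [x1, #8]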
+ SDLoc DL(St); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = + DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + + unsigned Offset = EltOffset; + while (--NumVecElts) { + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(Offset, DL, MVT::i64)); + NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), Alignment); + Offset += EltOffset; + } + return NewST1; +} + +static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *S = cast<StoreSDNode>(N); + if (S->isVolatile()) + return SDValue(); + + // FIXME: The logic for deciding if an unaligned store should be split should + // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be + // a call to that function here. + + // Cyclone has bad performance on unaligned 16B stores when crossing line and + // page boundaries. We want to split such stores. + if (!Subtarget->isCyclone()) + return SDValue(); + + // Don't split at -Oz. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue StVal = S->getValue(); + EVT VT = StVal.getValueType(); + + // Don't split v2i64 vectors. Memcpy lowering produces those and splitting + // those up regresses performance on micro-benchmarks and olden/bh. + if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) + return SDValue(); + + // Split unaligned 16B stores. They are terrible for performance. + // Don't split stores with alignment of 1 or 2. Code that uses clang vector + // extensions can use this to mark that it does not want splitting to happen + // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of + // eliminating alignment hazards is only 1 in 8 for alignment of 2. + if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || + S->getAlignment() <= 2) + return SDValue(); + + // If we get a splat of a scalar convert this vector store to a store of + // scalars. They will be merged into store pairs thereby removing two + // instructions. + if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) + return ReplacedSplat; + + SDLoc DL(S); + unsigned NumElts = VT.getVectorNumElements() / 2; + // Split VT into two. + EVT HalfVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, + DAG.getConstant(0, DL, MVT::i64)); + SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, + DAG.getConstant(NumElts, DL, MVT::i64)); + SDValue BasePtr = S->getBasePtr(); + SDValue NewST1 = + DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), + S->isVolatile(), S->isNonTemporal(), S->getAlignment()); + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(8, DL, MVT::i64)); + return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, + S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), + S->getAlignment()); +} + +/// Target-specific DAG combine function for post-increment LD1 (lane) and +/// post-increment LD1R. 
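// Hedged sketch of the lane form handled below (names are illustrative):
//   %elt   = load i32, %addr
//   %vec2  = insert_vector_elt %vec, %elt, %lane
//   %addr2 = add %addr, 4                  ; increment equals the element size
// becomes a single LD1LANEpost node producing the updated vector, the
// written-back address and the chain; the DUP form maps to LD1DUPpost likewise.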
+static SDValue performPostLD1Combine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + bool IsLaneOp) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + unsigned LoadIdx = IsLaneOp ? 1 : 0; + SDNode *LD = N->getOperand(LoadIdx).getNode(); + // If it is not LOAD, can not do such combine. + if (LD->getOpcode() != ISD::LOAD) + return SDValue(); + + LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); + EVT MemVT = LoadSDN->getMemoryVT(); + // Check if memory operand is the same type as the vector element. + if (MemVT != VT.getVectorElementType()) + return SDValue(); + + // Check if there are other uses. If so, do not combine as it will introduce + // an extra load. + for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; + ++UI) { + if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. + continue; + if (*UI != N) + return SDValue(); + } + + SDValue Addr = LD->getOperand(1); + SDValue Vector = N->getOperand(0); + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = + Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD + || UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load. Otherwise, folding it + // would create a cycle. + if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) + continue; + // Also check that add is not used in the vector operand. This would also + // create a cycle. + if (User->isPredecessorOf(Vector.getNode())) + continue; + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = VT.getScalarSizeInBits() / 8; + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + + // Finally, check that the vector doesn't depend on the load. + // Again, this would create a cycle. + // The load depending on the vector is fine, as that's the case for the + // LD1*post we'll eventually generate anyway. + if (LoadSDN->isPredecessorOf(Vector.getNode())) + continue; + + SmallVector<SDValue, 8> Ops; + Ops.push_back(LD->getOperand(0)); // Chain + if (IsLaneOp) { + Ops.push_back(Vector); // The vector to be inserted + Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector + } + Ops.push_back(Addr); + Ops.push_back(Inc); + + EVT Tys[3] = { VT, MVT::i64, MVT::Other }; + SDVTList SDTys = DAG.getVTList(Tys); + unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; + SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, + MemVT, + LoadSDN->getMemOperand()); + + // Update the uses. + SmallVector<SDValue, 2> NewResults; + NewResults.push_back(SDValue(LD, 0)); // The result of load + NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain + DCI.CombineTo(LD, NewResults); + DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result + DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register + + break; + } + return SDValue(); +} + +/// Simplify \Addr given that the top byte of it is ignored by HW during +/// address translation. 
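// e.g. (sketch): with top-byte-ignore, bits [63:56] of a load/store address do
// not take part in translation, so
//   (load (and %addr, 0x00FFFFFFFFFFFFFF))
// may be simplified to (load %addr); asking SimplifyDemandedBits for only the
// low 56 bits below is what performs that simplification.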
+static bool performTBISimplification(SDValue Addr, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + APInt DemandedMask = APInt::getLowBitsSet(64, 56); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + return true; + } + return false; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + SDValue Split = split16BStores(N, DCI, DAG, Subtarget); + if (Split.getNode()) + return Split; + + if (Subtarget->supportsAddressTopByteIgnored() && + performTBISimplification(N->getOperand(2), DCI, DAG)) + return SDValue(N, 0); + + return SDValue(); +} + + /// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) + return SDValue(); + + int NumVecElts = VTy.getVectorNumElements(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (NumVecElts != 4) + return SDValue(); + } else { + if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) + return SDValue(); + } + + int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); + SDValue PreOp = OpV; + // Iterate over each step of the across vector reduction. + for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { + SDValue CurOp = PreOp.getOperand(0); + SDValue Shuffle = PreOp.getOperand(1); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. + CurOp = PreOp.getOperand(1); + Shuffle = PreOp.getOperand(0); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + + // Check if it forms one step of the across vector reduction. + // E.g., + // %cur = add %1, %0 + // %shuffle = vector_shuffle %cur, <2, 3, u, u> + // %pre = add %cur, %shuffle + if (Shuffle.getOperand(0) != CurOp) + return SDValue(); + + int NumMaskElts = 1 << CurStep; + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); + // Check mask values in each step. + // We expect the shuffle mask in each step follows a specific pattern + // denoted here by the <M, U> form, where M is a sequence of integers + // starting from NumMaskElts, increasing by 1, and the number integers + // in M should be NumMaskElts. U is a sequence of UNDEFs and the number + // of undef in U should be NumVecElts - NumMaskElts. 
+ // E.g., for <8 x i16>, mask values in each step should be : + // step 0 : <1,u,u,u,u,u,u,u> + // step 1 : <2,3,u,u,u,u,u,u> + // step 2 : <4,5,6,7,u,u,u,u> + for (int i = 0; i < NumVecElts; ++i) + if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || + (i >= NumMaskElts && !(Mask[i] < 0))) + return SDValue(); + + PreOp = CurOp; + } + unsigned Opcode; + bool IsIntrinsic = false; + + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + case ISD::FMAXNUM: + Opcode = Intrinsic::aarch64_neon_fmaxnmv; + IsIntrinsic = true; + break; + case ISD::FMINNUM: + Opcode = Intrinsic::aarch64_neon_fminnmv; + IsIntrinsic = true; + break; + } + SDLoc DL(N); + + return IsIntrinsic + ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), + DAG.getConstant(Opcode, DL, MVT::i32), PreOp) + : DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector : +/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt $smax2, #1 +/// %result = select %n0, %n1, n2 +/// becomes : +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 is fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. 
+ if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && + Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (EltTy != MVT::f32) + return SDValue(); + } else { + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + } + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt $vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || + (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && + CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && + CC != ISD::SETGE) || + (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && + CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && + CC != ISD::SETLE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isNullConstant(N0.getOperand(1))) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isNullConstant(IfTrue.getOperand(1))) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isOneConstant(IfFalse.getOperand(1))) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector : +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes : +/// %0 = uaddv %0 +/// %result = extract_vector_elt %0, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. + if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must constant zero because we only expect the final + // result of the reduction is placed in lane 0. + if (!isNullConstant(N1)) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); +} + +/// Target-specific DAG combine function for NEON load/store intrinsics +/// to merge base address updates. 
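// Hedged example of the base-update merge for a structured load
// (operand names are illustrative):
//   %vals  = aarch64_neon_ld2 %addr        ; two v4i32 registers, 32 bytes
//   %addr2 = add %addr, 32
// becomes one LD2post node whose extra result is the written-back address; the
// store, lane and dup variants follow the same scheme with their own opcodes.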
+static SDValue performNEONPostLDSTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + unsigned AddrOpIdx = N->getNumOperands() - 1; + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. + bool IsStore = false; + bool IsLaneOp = false; + bool IsDupOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; + NumVecs = 2; IsStore = true; break; + case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; + NumVecs = 3; IsStore = true; break; + case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; + NumVecs = 4; IsStore = true; break; + case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; + NumVecs = 2; IsStore = true; break; + case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; + NumVecs = 3; IsStore = true; break; + case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; + NumVecs = 4; IsStore = true; break; + case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; + NumVecs = 2; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; + NumVecs = 3; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; + NumVecs = 4; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; + NumVecs = 2; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; + NumVecs = 3; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; + NumVecs = 4; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; + NumVecs = 2; IsStore = true; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; + NumVecs = 3; IsStore = true; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; + NumVecs = 4; IsStore = true; IsLaneOp = true; break; + } + + EVT VecTy; + if (IsStore) + VecTy = N->getOperand(2).getValueType(); + else + VecTy = N->getValueType(0); + + // If the increment is a constant, it must match the memory ref size. 
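// For example (sketch): ld2 of two v4i32 vectors must be advanced by 32 bytes,
// while ld2lane / ld2r of the same type load only two elements and so advance
// by 32 / 4 = 8 bytes, which is what the per-element division below computes.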
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (IsLaneOp || IsDupOp) + NumBytes /= VecTy.getVectorNumElements(); + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + SmallVector<SDValue, 8> Ops; + Ops.push_back(N->getOperand(0)); // Incoming chain + // Load lane and store have vector list as input. + if (IsLaneOp || IsStore) + for (unsigned i = 2; i < AddrOpIdx; ++i) + Ops.push_back(N->getOperand(i)); + Ops.push_back(Addr); // Base register + Ops.push_back(Inc); + + // Return Types. + EVT Tys[6]; + unsigned NumResultVecs = (IsStore ? 0 : NumVecs); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i64; // Type of write back register + Tys[n] = MVT::Other; // Type of the chain + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); + + MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector<SDValue> NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +// Checks to see if the value is the prescribed width and returns information +// about its extension mode. +static +bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { + ExtType = ISD::NON_EXTLOAD; + switch(V.getNode()->getOpcode()) { + default: + return false; + case ISD::LOAD: { + LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); + if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) + || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { + ExtType = LoadNode->getExtensionType(); + return true; + } + return false; + } + case ISD::AssertSext: { + VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); + if ((TypeNode->getVT() == MVT::i8 && width == 8) + || (TypeNode->getVT() == MVT::i16 && width == 16)) { + ExtType = ISD::SEXTLOAD; + return true; + } + return false; + } + case ISD::AssertZext: { + VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); + if ((TypeNode->getVT() == MVT::i8 && width == 8) + || (TypeNode->getVT() == MVT::i16 && width == 16)) { + ExtType = ISD::ZEXTLOAD; + return true; + } + return false; + } + case ISD::Constant: + case ISD::TargetConstant: { + if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < + 1LL << (width - 1)) + return true; + return false; + } + } + + return true; +} + +// This function does a whole lot of voodoo to determine if the tests are +// equivalent without and with a mask. 
Essentially what happens is that given a +// DAG resembling: +// +// +-------------+ +-------------+ +-------------+ +-------------+ +// | Input | | AddConstant | | CompConstant| | CC | +// +-------------+ +-------------+ +-------------+ +-------------+ +// | | | | +// V V | +----------+ +// +-------------+ +----+ | | +// | ADD | |0xff| | | +// +-------------+ +----+ | | +// | | | | +// V V | | +// +-------------+ | | +// | AND | | | +// +-------------+ | | +// | | | +// +-----+ | | +// | | | +// V V V +// +-------------+ +// | CMP | +// +-------------+ +// +// The AND node may be safely removed for some combinations of inputs. In +// particular we need to take into account the extension type of the Input, +// the exact values of AddConstant, CompConstant, and CC, along with the nominal +// width of the input (this can work for any width inputs, the above graph is +// specific to 8 bits. +// +// The specific equations were worked out by generating output tables for each +// AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The +// problem was simplified by working with 4 bit inputs, which means we only +// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero +// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 +// patterns present in both extensions (0,7). For every distinct set of +// AddConstant and CompConstants bit patterns we can consider the masked and +// unmasked versions to be equivalent if the result of this function is true for +// all 16 distinct bit patterns of for the current extension type of Input (w0). +// +// sub w8, w0, w1 +// and w10, w8, #0x0f +// cmp w8, w2 +// cset w9, AArch64CC +// cmp w10, w2 +// cset w11, AArch64CC +// cmp w9, w11 +// cset w0, eq +// ret +// +// Since the above function shows when the outputs are equivalent it defines +// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and +// would be expensive to run during compiles. The equations below were written +// in a test harness that confirmed they gave equivalent outputs to the above +// for all inputs function, so they can be used determine if the removal is +// legal instead. +// +// isEquivalentMaskless() is the code for testing if the AND can be removed +// factored out of the DAG recognition as the DAG can take several forms. + +static +bool isEquivalentMaskless(unsigned CC, unsigned width, + ISD::LoadExtType ExtType, signed AddConstant, + signed CompConstant) { + // By being careful about our equations and only writing the in term + // symbolic values and well known constants (0, 1, -1, MaxUInt) we can + // make them generally applicable to all bit widths. + signed MaxUInt = (1 << width); + + // For the purposes of these comparisons sign extending the type is + // equivalent to zero extending the add and displacing it by half the integer + // width. Provided we are careful and make sure our equations are valid over + // the whole range we can just adjust the input and avoid writing equations + // for sign extended inputs. 
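// Numeric sketch for width == 8: a sign-extended input covers [-128, 127];
// viewing it as the zero-extended range [0, 255] shifts the input up by 128,
// and subtracting 128 from AddConstant below compensates, so the value that
// reaches the comparison is unchanged.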
+ if (ExtType == ISD::SEXTLOAD) + AddConstant -= (1 << (width-1)); + + switch(CC) { + case AArch64CC::LE: + case AArch64CC::GT: { + if ((AddConstant == 0) || + (CompConstant == MaxUInt - 1 && AddConstant < 0) || + (AddConstant >= 0 && CompConstant < 0) || + (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) + return true; + } break; + case AArch64CC::LT: + case AArch64CC::GE: { + if ((AddConstant == 0) || + (AddConstant >= 0 && CompConstant <= 0) || + (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) + return true; + } break; + case AArch64CC::HI: + case AArch64CC::LS: { + if ((AddConstant >= 0 && CompConstant < 0) || + (AddConstant <= 0 && CompConstant >= -1 && + CompConstant < AddConstant + MaxUInt)) + return true; + } break; + case AArch64CC::PL: + case AArch64CC::MI: { + if ((AddConstant == 0) || + (AddConstant > 0 && CompConstant <= 0) || + (AddConstant < 0 && CompConstant <= AddConstant)) + return true; + } break; + case AArch64CC::LO: + case AArch64CC::HS: { + if ((AddConstant >= 0 && CompConstant <= 0) || + (AddConstant <= 0 && CompConstant >= 0 && + CompConstant <= AddConstant + MaxUInt)) + return true; + } break; + case AArch64CC::EQ: + case AArch64CC::NE: { + if ((AddConstant > 0 && CompConstant < 0) || + (AddConstant < 0 && CompConstant >= 0 && + CompConstant < AddConstant + MaxUInt) || + (AddConstant >= 0 && CompConstant >= 0 && + CompConstant >= AddConstant) || + (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) + + return true; + } break; + case AArch64CC::VS: + case AArch64CC::VC: + case AArch64CC::AL: + case AArch64CC::NV: + return true; + case AArch64CC::Invalid: + break; + } + + return false; +} + +static +SDValue performCONDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, unsigned CCIndex, + unsigned CmpIndex) { + unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); + SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); + unsigned CondOpcode = SubsNode->getOpcode(); + + if (CondOpcode != AArch64ISD::SUBS) + return SDValue(); + + // There is a SUBS feeding this condition. Is it fed by a mask we can + // use? + + SDNode *AndNode = SubsNode->getOperand(0).getNode(); + unsigned MaskBits = 0; + + if (AndNode->getOpcode() != ISD::AND) + return SDValue(); + + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { + uint32_t CNV = CN->getZExtValue(); + if (CNV == 255) + MaskBits = 8; + else if (CNV == 65535) + MaskBits = 16; + } + + if (!MaskBits) + return SDValue(); + + SDValue AddValue = AndNode->getOperand(0); + + if (AddValue.getOpcode() != ISD::ADD) + return SDValue(); + + // The basic dag structure is correct, grab the inputs and validate them. + + SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); + SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); + SDValue SubsInputValue = SubsNode->getOperand(1); + + // The mask is present and the provenance of all the values is a smaller type, + // lets see if the mask is superfluous. 
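// Hedged sketch of the DAG being validated here (names are illustrative):
//   %add  = add %x, C1        ; %x is known to be an 8- or 16-bit quantity
//   %and  = and %add, 0xff    ; or 0xffff
//   flags = SUBS %and, C2
// If isEquivalentMaskless(CC, MaskBits, ext, C1, C2) holds, the AND is dropped
// and SUBS compares %add directly.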
+ + if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || + !isa<ConstantSDNode>(SubsInputValue.getNode())) + return SDValue(); + + ISD::LoadExtType ExtType; + + if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || + !checkValueWidth(AddInputValue2, MaskBits, ExtType) || + !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) + return SDValue(); + + if(!isEquivalentMaskless(CC, MaskBits, ExtType, + cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), + cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) + return SDValue(); + + // The AND is not necessary, remove it. + + SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), + SubsNode->getValueType(1)); + SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; + + SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); + DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); + + return SDValue(N, 0); +} + +// Optimize compare with zero and branch. +static SDValue performBRCONDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); + if (NV.getNode()) + N = NV.getNode(); + SDValue Chain = N->getOperand(0); + SDValue Dest = N->getOperand(1); + SDValue CCVal = N->getOperand(2); + SDValue Cmp = N->getOperand(3); + + assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); + unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return SDValue(); + + unsigned CmpOpc = Cmp.getOpcode(); + if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) + return SDValue(); + + // Only attempt folding if there is only one use of the flag and no use of the + // value. + if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) + return SDValue(); + + SDValue LHS = Cmp.getOperand(0); + SDValue RHS = Cmp.getOperand(1); + + assert(LHS.getValueType() == RHS.getValueType() && + "Expected the value type to be the same for both operands!"); + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return SDValue(); + + if (isNullConstant(LHS)) + std::swap(LHS, RHS); + + if (!isNullConstant(RHS)) + return SDValue(); + + if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || + LHS.getOpcode() == ISD::SRL) + return SDValue(); + + // Fold the compare into the branch instruction. + SDValue BR; + if (CC == AArch64CC::EQ) + BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); + else + BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, BR, false); + + return SDValue(); +} + +// vselect (v1i1 setcc) -> +// vselect (v1iXX setcc) (XX is the size of the compared operand type) +// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as +// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine +// such VSELECT. +static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT CCVT = N0.getValueType(); + + if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || + CCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ResVT = N->getValueType(0); + EVT CmpVT = N0.getOperand(0).getValueType(); + // Only combine when the result type is of the same size as the compared + // operands. 
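// e.g. (sketch, illustrative types):
//   (vselect (v1i1 setcc v1f64 %a, %b, setolt), v1i64 %t, v1i64 %f)
// is rewritten so the condition is a v1i64 setcc of the same operands, which
// the rest of the lowering handles directly.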
+ if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) + return SDValue(); + + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + SDValue SetCC = + DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), + N0.getOperand(0), N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, + IfTrue, IfFalse); +} + +/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with +/// the compare-mask instructions rather than going via NZCV, even if LHS and +/// RHS are really scalar. This replaces any scalar setcc in the above pattern +/// with a vector one followed by a DUP shuffle on the result. +static SDValue performSelectCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + EVT ResVT = N->getValueType(0); + + if (N0.getOpcode() != ISD::SETCC) + return SDValue(); + + // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered + // scalar SetCCResultType. We also don't expect vectors, because we assume + // that selects fed by vector SETCCs are canonicalized to VSELECT. + assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && + "Scalar-SETCC feeding SELECT has unexpected result type!"); + + // If NumMaskElts == 0, the comparison is larger than select result. The + // largest real NEON comparison is 64-bits per lane, which means the result is + // at most 32-bits and an illegal vector. Just bail out for now. + EVT SrcVT = N0.getOperand(0).getValueType(); + + // Don't try to do this optimization when the setcc itself has i1 operands. + // There are no legal vectors of i1, so this would be pointless. + if (SrcVT == MVT::i1) + return SDValue(); + + int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); + if (!ResVT.isVector() || NumMaskElts == 0) + return SDValue(); + + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); + EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); + + // Also bail out if the vector CCVT isn't the same size as ResVT. + // This can happen if the SETCC operand size doesn't divide the ResVT size + // (e.g., f64 vs v3f32). + if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) + return SDValue(); + + // Make sure we didn't create illegal types, if we're not supposed to. + assert(DCI.isBeforeLegalize() || + DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); + + // First perform a vector comparison, where lane 0 is the one we're interested + // in. + SDLoc DL(N0); + SDValue LHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); + SDValue RHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); + SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); + + // Now duplicate the comparison mask we want across all other lanes. + SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + Mask = DAG.getNode(ISD::BITCAST, DL, + ResVT.changeVectorElementTypeToInteger(), Mask); + + return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); +} + +/// Get rid of unnecessary NVCASTs (that don't change the type). 
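// e.g. (sketch): (NVCAST v2f32 %x) where %x already has type v2f32 is a no-op
// and folds to %x.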
+static SDValue performNVCASTCombine(SDNode *N) { + if (N->getValueType(0) == N->getOperand(0).getValueType()) + return N->getOperand(0); + + return SDValue(); +} + +SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: + break; + case ISD::ADD: + case ISD::SUB: + return performAddSubLongCombine(N, DCI, DAG); + case ISD::XOR: + return performXorCombine(N, DAG, DCI, Subtarget); + case ISD::MUL: + return performMulCombine(N, DAG, DCI, Subtarget); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return performIntToFpCombine(N, DAG, Subtarget); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performFpToIntCombine(N, DAG, Subtarget); + case ISD::FDIV: + return performFDivCombine(N, DAG, Subtarget); + case ISD::OR: + return performORCombine(N, DCI, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicCombine(N, DCI, Subtarget); + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + return performExtendCombine(N, DCI, DAG); + case ISD::BITCAST: + return performBitcastCombine(N, DCI, DAG); + case ISD::CONCAT_VECTORS: + return performConcatVectorsCombine(N, DCI, DAG); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } + case ISD::VSELECT: + return performVSelectCombine(N, DCI.DAG); + case ISD::LOAD: + if (performTBISimplification(N->getOperand(1), DCI, DAG)) + return SDValue(N, 0); + break; + case ISD::STORE: + return performSTORECombine(N, DCI, DAG, Subtarget); + case AArch64ISD::BRCOND: + return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::CSEL: + return performCONDCombine(N, DCI, DAG, 2, 3); + case AArch64ISD::DUP: + return performPostLD1Combine(N, DCI, false); + case AArch64ISD::NVCAST: + return performNVCASTCombine(N); + case ISD::INSERT_VECTOR_ELT: + return performPostLD1Combine(N, DCI, true); + case ISD::EXTRACT_VECTOR_ELT: + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: + return performNEONPostLDSTCombine(N, DCI, DAG); + default: + break; + } + } + return SDValue(); +} + +// Check if the return value is used as only a return value, as otherwise +// we can't perform a tail-call. In particular, we need to check for +// target ISD nodes that are returns and any other "odd" constructs +// that the generic analysis code won't necessarily catch. 
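// Sketch of what qualifies (hedged): the sole use of the value is an unglued
// CopyToReg (or an FP_EXTEND) all of whose users are RET_FLAG nodes; a glued
// copy or any other kind of user disables the tail call.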
+bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, + SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == + MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode *Node : Copy->uses()) { + if (Node->getOpcode() != AArch64ISD::RET_FLAG) + return false; + HasRet = true; + } + + if (!HasRet) + return false; + + Chain = TCChain; + return true; +} + +// Return whether the an instruction can potentially be optimized to a tail +// call. This will cause the optimizers to attempt to move, or duplicate, +// return instructions to help enable tail call optimizations for this +// instruction. +bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!CI->isTailCall()) + return false; + + return true; +} + +bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + bool &IsInc, + SelectionDAG &DAG) const { + if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) + return false; + + Base = Op->getOperand(0); + // All of the indexed addressing mode instructions take a signed + // 9 bit immediate offset. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + if (RHSC >= 256 || RHSC <= -256) + return false; + IsInc = (Op->getOpcode() == ISD::ADD); + Offset = Op->getOperand(1); + return true; + } + return false; +} + +bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + EVT VT; + SDValue Ptr; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool IsInc; + if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) + return false; + AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; + return true; +} + +bool AArch64TargetLowering::getPostIndexedAddressParts( + SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { + EVT VT; + SDValue Ptr; + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool IsInc; + if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) + return false; + // Post-indexing updates the base, so it's not a valid transform + // if that's not the same as the load's pointer. + if (Ptr != Base) + return false; + AM = IsInc ? 
ISD::POST_INC : ISD::POST_DEC; + return true; +} + +static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) { + SDLoc DL(N); + SDValue Op = N->getOperand(0); + + if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) + return; + + Op = SDValue( + DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, + DAG.getUNDEF(MVT::i32), Op, + DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), + 0); + Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); +} + +static void ReplaceReductionResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG, unsigned InterOp, + unsigned AcrossOp) { + EVT LoVT, HiVT; + SDValue Lo, Hi; + SDLoc dl(N); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); + SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); + Results.push_back(SplitVal); +} + +void AArch64TargetLowering::ReplaceNodeResults( + SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + llvm_unreachable("Don't know how to custom expand this"); + case ISD::BITCAST: + ReplaceBITCASTResults(N, Results, DAG); + return; + case AArch64ISD::SADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); + return; + case AArch64ISD::UADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); + return; + case AArch64ISD::SMINV: + ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); + return; + case AArch64ISD::UMINV: + ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); + return; + case AArch64ISD::SMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); + return; + case AArch64ISD::UMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); + return; + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: + assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); + // Let normal code take care of it by not adding anything to Results. + return; + } +} + +bool AArch64TargetLowering::useLoadStackGuardNode() const { + return true; +} + +unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { + // Combine multiple FDIVs with the same divisor into multiple FMULs by the + // reciprocal if there are three or more FDIVs. + return 3; +} + +TargetLoweringBase::LegalizeTypeAction +AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { + MVT SVT = VT.getSimpleVT(); + // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, + // v4i16, v2i32 instead of to promote. + if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 + || SVT == MVT::v1f32) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +// Loads and stores less than 128-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. +bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return Size == 128; +} + +// Loads and stores less than 128-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. 
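// e.g. (sketch): an i128 atomic load is expanded in IR via the ldxp/ldaxp
// intrinsics (see emitLoadLinked below), while i64 and smaller atomic loads
// are left alone and selected as ordinary ldar/ldr.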
+TargetLowering::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + unsigned Size = LI->getType()->getPrimitiveSizeInBits(); + return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; +} + +// For the real atomic operations, we have ldxr/stxr up to 128 bits, +TargetLowering::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; +} + +bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { + return true; +} + +Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); + bool IsAcquire = isAtLeastAcquire(Ord); + + // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd + // intrinsic must return {i64, i64} and we have to recombine them into a + // single i128 here. + if (ValTy->getPrimitiveSizeInBits() == 128) { + Intrinsic::ID Int = + IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; + Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); + + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); + + Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); + Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); + return Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); + } + + Type *Tys[] = { Addr->getType() }; + Intrinsic::ID Int = + IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; + Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateTruncOrBitCast( + Builder.CreateCall(Ldxr, Addr), + cast<PointerType>(Addr->getType())->getElementType()); +} + +void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall( + llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); +} + +Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, + Value *Val, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + bool IsRelease = isAtLeastRelease(Ord); + + // Since the intrinsics must have legal type, the i128 intrinsics take two + // parameters: "i64, i64". We must marshal Val into the appropriate form + // before the call. + if (Val->getType()->getPrimitiveSizeInBits() == 128) { + Intrinsic::ID Int = + IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; + Function *Stxr = Intrinsic::getDeclaration(M, Int); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + + Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); + Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); + } + + Intrinsic::ID Int = + IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; + Type *Tys[] = { Addr->getType() }; + Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateCall(Stxr, + {Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr}); +} + +bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { + return Ty->isArrayTy(); +} + +bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, + EVT) const { + return false; +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x48; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + +void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in AArch64unctionInfo. + AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void AArch64TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AArch64::GPR64RegClass.contains(*I)) + RC = &AArch64::GPR64RegClass; + else if (AArch64::FPR64RegClass.contains(*I)) + RC = &AArch64::FPR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h new file mode 100644 index 0000000..e99616c --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -0,0 +1,560 @@ +//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that AArch64 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H + +#include "AArch64.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Instruction.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +namespace AArch64ISD { + +enum NodeType : unsigned { + FIRST_NUMBER = ISD::BUILTIN_OP_END, + WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. + CALL, // Function call. + + // Produces the full sequence of instructions for getting the thread pointer + // offset of a variable into X0, using the TLSDesc model. + TLSDESC_CALLSEQ, + ADRP, // Page address of a TargetGlobalAddress operand. + ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. + LOADgot, // Load from automatically generated descriptor (e.g. Global + // Offset Table, TLS record). + RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. + BRCOND, // Conditional branch instruction; "b.cond". + CSEL, + FCSEL, // Conditional move instruction. + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. + + // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on + // ELF. + THREAD_POINTER, + ADC, + SBC, // adc, sbc instructions + + // Arithmetic instructions which write flags. + ADDS, + SUBS, + ADCS, + SBCS, + ANDS, + + // Conditional compares. Operands: left,right,falsecc,cc,flags + CCMP, + CCMN, + FCCMP, + + // Floating point comparison + FCMP, + + // Scalar extract + EXTR, + + // Scalar-to-vector duplication + DUP, + DUPLANE8, + DUPLANE16, + DUPLANE32, + DUPLANE64, + + // Vector immedate moves + MOVI, + MOVIshift, + MOVIedit, + MOVImsl, + FMOV, + MVNIshift, + MVNImsl, + + // Vector immediate ops + BICi, + ORRi, + + // Vector bit select: similar to ISD::VSELECT but not all bits within an + // element must be identical. + BSL, + + // Vector arithmetic negation + NEG, + + // Vector shuffles + ZIP1, + ZIP2, + UZP1, + UZP2, + TRN1, + TRN2, + REV16, + REV32, + REV64, + EXT, + + // Vector shift by scalar + VSHL, + VLSHR, + VASHR, + + // Vector shift by scalar (again) + SQSHL_I, + UQSHL_I, + SQSHLU_I, + SRSHR_I, + URSHR_I, + + // Vector comparisons + CMEQ, + CMGE, + CMGT, + CMHI, + CMHS, + FCMEQ, + FCMGE, + FCMGT, + + // Vector zero comparisons + CMEQz, + CMGEz, + CMGTz, + CMLEz, + CMLTz, + FCMEQz, + FCMGEz, + FCMGTz, + FCMLEz, + FCMLTz, + + // Vector across-lanes addition + // Only the lower result lane is defined. + SADDV, + UADDV, + + // Vector across-lanes min/max + // Only the lower result lane is defined. + SMINV, + UMINV, + SMAXV, + UMAXV, + + // Vector bitwise negation + NOT, + + // Vector bitwise selection + BIT, + + // Compare-and-branch + CBZ, + CBNZ, + TBZ, + TBNZ, + + // Tail calls + TC_RETURN, + + // Custom prefetch handling + PREFETCH, + + // {s|u}int to FP within a FP register. + SITOF, + UITOF, + + /// Natural vector cast. ISD::BITCAST is not natural in the big-endian + /// world w.r.t vectors; which causes additional REV instructions to be + /// generated to compensate for the byte-swapping. 
But sometimes we do + /// need to re-interpret the data in SIMD vector registers in big-endian + /// mode without emitting such REV instructions. + NVCAST, + + SMULL, + UMULL, + + // NEON Load/Store with post-increment base updates + LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, + LD3post, + LD4post, + ST2post, + ST3post, + ST4post, + LD1x2post, + LD1x3post, + LD1x4post, + ST1x2post, + ST1x3post, + ST1x4post, + LD1DUPpost, + LD2DUPpost, + LD3DUPpost, + LD4DUPpost, + LD1LANEpost, + LD2LANEpost, + LD3LANEpost, + LD4LANEpost, + ST2LANEpost, + ST3LANEpost, + ST4LANEpost +}; + +} // end namespace AArch64ISD + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64TargetLowering : public TargetLowering { +public: + explicit AArch64TargetLowering(const TargetMachine &TM, + const AArch64Subtarget &STI); + + /// Selects the correct CCAssignFn for a given CallingConvention value. + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; + + /// Determine which of the bits specified in Mask are known to be either zero + /// or one and return them in the KnownZero/KnownOne bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, + APInt &KnownOne, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; + + /// Returns true if the target allows unaligned memory accesses of the + /// specified type. + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, + unsigned Align = 1, + bool *Fast = nullptr) const override; + + /// Provide custom lowering hooks for some operations. + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + + const char *getTargetNodeName(unsigned Opcode) const override; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. + return true; + } + + /// This method returns a target specific FastISel object, or null if the + /// target does not support "fast" ISel. + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; + + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + + /// Return true if the given shuffle mask can be codegen'd directly, or if it + /// should be stack expanded. + bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; + + /// Return the ISD::SETCC ValueType. 
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; + + SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; + + MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; + + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; + + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; + + bool isProfitableToHoist(Instruction *I) const override; + + bool isZExtFree(Type *Ty1, Type *Ty2) const override; + bool isZExtFree(EVT VT1, EVT VT2) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const override; + bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; + + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } + + bool lowerInterleavedLoad(LoadInst *LI, + ArrayRef<ShuffleVectorInst *> Shuffles, + ArrayRef<unsigned> Indices, + unsigned Factor) const override; + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + unsigned Factor) const override; + + bool isLegalAddImmediate(int64_t) const override; + bool isLegalICmpImmediate(int64_t) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + MachineFunction &MF) const override; + + /// Return true if the addressing mode represented by AM is legal for this + /// target, for a load/store of the specified type. + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, + unsigned AS) const override; + + /// \brief Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. + int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, + unsigned AS) const override; + + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this method + /// returns true, otherwise fmuladd is expanded to fmul + fadd. + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + + /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. + bool isDesirableToCommuteWithShift(const SDNode *N) const override; + + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. 
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; + + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + + bool useLoadStackGuardNode() const override; + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; + + /// If the target has a standard location for the unsafe stack pointer, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X1; + } + + bool isCheapToSpeculateCttz() const override { + return true; + } + + bool isCheapToSpeculateCtlz() const override { + return true; + } + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + +private: + bool isExtFreeImpl(const Instruction *Ext) const override; + + /// Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. 
+ const AArch64Subtarget *Subtarget; + + void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); + void addDRTypeForNEON(MVT VT); + void addQRTypeForNEON(MVT VT); + + SDValue + LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerCall(CallLoweringInfo & /*CLI*/, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, + bool isThisReturn, SDValue ThisVal) const; + + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + + bool isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const; + + /// Finds the incoming stack arguments which overlap the given fixed stack + /// object and incorporates their load into the current chain. This prevents + /// an upcoming store from clobbering the stack argument before it's used. + SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo *MFI, int ClobberedFI) const; + + bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; + + bool IsTailCallConvention(CallingConv::ID CallCC) const; + + void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const; + + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, + SelectionDAG &DAG) const override; + + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, + SDValue TVal, SDValue FVal, SDLoc dl, + SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, 
SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + std::vector<SDNode *> *Created) const override; + unsigned combineRepeatedFPDivisors() const override; + + ConstraintType getConstraintType(StringRef Constraint) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight + getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const override; + + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const override; + + unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { + if (ConstraintCode == "Q") + return InlineAsm::Constraint_Q; + // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are + // followed by llvm_unreachable so we'll leave them unimplemented in + // the backend for now. 
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, bool &IsInc, + SelectionDAG &DAG) const; + bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, + SDValue &Offset, ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + + bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, + CallingConv::ID CallConv, + bool isVarArg) const override; + + bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; +}; + +namespace AArch64 { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace AArch64 + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td new file mode 100644 index 0000000..4923a11 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -0,0 +1,363 @@ +//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Atomic operand code-gen constructs. +// +//===----------------------------------------------------------------------===// + +//===---------------------------------- +// Atomic fences +//===---------------------------------- +def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; +def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; + +//===---------------------------------- +// Atomic loads +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A atomic load operation that actually needs acquire semantics. +class acquiring_load<PatFrag base> + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); + return isAtLeastAcquire(Ordering); +}]>; + +// An atomic load operation that does not need either acquire or release +// semantics. 
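+// Illustrative mapping, not part of this patch (registers chosen
+// arbitrarily); it only sketches the intent of the acquiring/relaxed
+// fragments defined here:
+//   %v = load atomic i32, i32* %p acquire, align 4
+//     -> ldar w0, [x1]                    ; acquiring_load, base register only
+//   %v = load atomic i32, i32* %p monotonic, align 4
+//     -> ldr  w0, [x1, x2, lsl #2]        ; relaxed_load, any ordinary LDR form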
+class relaxed_load<PatFrag base> + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); + return !isAtLeastAcquire(Ordering); +}]>; + +// 8-bit loads +def : Pat<(acquiring_load<atomic_load_8> GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend8:$offset)), + (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; +def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$offset)), + (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; +def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_load<atomic_load_8> + (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit loads +def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)), + (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), + (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_load<atomic_load_16> + (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit loads +def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; +def : Pat<(relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend)), + (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_load<atomic_load_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend)), + (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_load<atomic_load_32> (am_indexed32 GPR64sp:$Rn, + uimm12s4:$offset)), + (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_load<atomic_load_32> + (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), + (LDURWi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit loads +def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; +def : Pat<(relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend)), + (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_load<atomic_load_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend)), + (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_load<atomic_load_64> (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset)), + (LDRXui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(relaxed_load<atomic_load_64> + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (LDURXi GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Atomic stores +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A store operation that actually needs release semantics. 
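+// Illustrative mapping, not part of this patch (registers chosen
+// arbitrarily):
+//   store atomic i32 %v, i32* %p release, align 4    -> stlr w0, [x1]
+//   store atomic i32 %v, i32* %p monotonic, align 4  -> str  w0, [x1, #8]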
+class releasing_store<PatFrag base> + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected store ordering"); + return isAtLeastRelease(Ordering); +}]>; + +// An atomic store operation that doesn't actually need to be atomic on AArch64. +class relaxed_store<PatFrag base> + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); + return !isAtLeastRelease(Ordering); +}]>; + +// 8-bit stores +def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val), + (STLRB GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store<atomic_store_8> + (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), + GPR32:$val), + (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>; +def : Pat<(relaxed_store<atomic_store_8> + (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), + GPR32:$val), + (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>; +def : Pat<(relaxed_store<atomic_store_8> + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val), + (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_store<atomic_store_8> + (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit stores +def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val), + (STLRH GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store<atomic_store_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR32:$val), + (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_store<atomic_store_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR32:$val), + (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_store<atomic_store_16> + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val), + (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_store<atomic_store_16> + (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit stores +def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val), + (STLRW GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store<atomic_store_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend), + GPR32:$val), + (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_store<atomic_store_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend), + GPR32:$val), + (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_store<atomic_store_32> + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val), + (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store<atomic_store_32> + (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit stores +def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val), + (STLRX GPR64:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR64:$val), + (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR64:$val), + (STRXroX GPR64:$val, GPR64sp:$Rn, 
GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_store<atomic_store_64> + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val), + (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(relaxed_store<atomic_store_64> + (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), + (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Low-level exclusive operations +//===---------------------------------- + +// Load-exclusives. + +def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>; + +def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; + +// Load-exclusives. + +def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldaxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>; + +def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; + +// Store-exclusives. 
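+// The stxr_N fragments below select on the memory width recorded by the
+// intrinsic node, since the IR-level llvm.aarch64.stxr always carries an i64
+// value operand. Illustrative example, not part of this patch (registers
+// chosen arbitrarily):
+//   %ok = call i32 @llvm.aarch64.stxr.p0i8(i64 %val, i8* %addr)
+//     -> stxrb w0, w1, [x2]        ; only the low byte of %val is stored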
+ +def stxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def stxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def stxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def stxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr), + (STXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + +// Store-release-exclusives. 
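+// Same structure as the store-exclusive patterns above, but selecting the
+// store-release forms. Illustrative example, not part of this patch
+// (registers chosen arbitrarily):
+//   %ok = call i32 @llvm.aarch64.stlxr.p0i32(i64 %val, i32* %addr)
+//     -> stlxr w0, w1, [x2]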
+ +def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; + +def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; + +def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr), + (STLXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STLXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STLXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STLXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + + +// And clear exclusive. + +def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td new file mode 100644 index 0000000..6ac2175 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -0,0 +1,9512 @@ +//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe AArch64 instructions format here +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format<bits<2> val> { + bits<2> Value = val; +} + +def PseudoFrm : Format<0>; +def NormalFrm : Format<1>; // Do we need any others? + +// AArch64 Instruction Format +class AArch64Inst<Format f, string cstr> : Instruction { + field bits<32> Inst; // Instruction encoding. + // Mask of bits that cause an encoding to be UNPREDICTABLE. + // If a bit is set, then if the corresponding bit in the + // target encoding differs from its value in the "Inst" field, + // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). + field bits<32> Unpredictable = 0; + // SoftFail is the generic name for this field, but we alias it so + // as to make it more obvious what it means in ARM-land. 
+ field bits<32> SoftFail = Unpredictable; + let Namespace = "AArch64"; + Format F = f; + bits<2> Form = F.Value; + let Pattern = []; + let Constraints = cstr; +} + +// Pseudo instructions (don't have encoding information) +class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = ""> + : AArch64Inst<PseudoFrm, cstr> { + dag OutOperandList = oops; + dag InOperandList = iops; + let Pattern = pattern; + let isCodeGenOnly = 1; +} + +// Real instructions (have encoding information) +class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> { + let Pattern = pattern; + let Size = 4; +} + +// Normal instructions +class I<dag oops, dag iops, string asm, string operands, string cstr, + list<dag> pattern> + : EncodedI<cstr, pattern> { + dag OutOperandList = oops; + dag InOperandList = iops; + let AsmString = !strconcat(asm, operands); +} + +class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; +class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag<dag res> : PatFrag<(ops node:$LHS), res>; + +// Helper fragment for an extract of the high portion of a 128-bit vector. +def extract_high_v16i8 : + UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; +def extract_high_v8i16 : + UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; +def extract_high_v4i32 : + UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; +def extract_high_v2i64 : + UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; + +//===----------------------------------------------------------------------===// +// Asm Operand Classes. +// + +// Shifter operand for arithmetic shifted encodings. +def ShifterOperand : AsmOperandClass { + let Name = "Shifter"; +} + +// Shifter operand for mov immediate encodings. +def MovImm32ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm32Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm32Shift"; +} +def MovImm64ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm64Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm64Shift"; +} + +// Shifter operand for arithmetic register shifted encodings. +class ArithmeticShifterOperand<int width> : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "ArithmeticShifter" # width; + let PredicateMethod = "isArithmeticShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; +} + +def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; +def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; + +// Shifter operand for logical register shifted encodings. +class LogicalShifterOperand<int width> : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalShifter" # width; + let PredicateMethod = "isLogicalShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; +} + +def LogicalShifterOperand32 : LogicalShifterOperand<32>; +def LogicalShifterOperand64 : LogicalShifterOperand<64>; + +// Shifter operand for logical vector 128/64-bit shifted encodings. 
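+// This is the optional "LSL #0/#8/#16/#24" on vector modified-immediate
+// instructions, e.g. (illustrative):  movi v0.4s, #0x2a, lsl #8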
+def LogicalVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalVecShifter"; + let RenderMethod = "addShifterOperands"; +} +def LogicalVecHalfWordShifterOperand : AsmOperandClass { + let SuperClasses = [LogicalVecShifterOperand]; + let Name = "LogicalVecHalfWordShifter"; + let RenderMethod = "addShifterOperands"; +} + +// The "MSL" shifter on the vector MOVI instruction. +def MoveVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MoveVecShifter"; + let RenderMethod = "addShifterOperands"; +} + +// Extend operand for arithmetic encodings. +def ExtendOperand : AsmOperandClass { + let Name = "Extend"; + let DiagnosticType = "AddSubRegExtendLarge"; +} +def ExtendOperand64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "Extend64"; + let DiagnosticType = "AddSubRegExtendSmall"; +} +// 'extend' that's a lsl of a 64-bit register. +def ExtendOperandLSL64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "ExtendLSL64"; + let RenderMethod = "addExtend64Operands"; + let DiagnosticType = "AddSubRegExtendLarge"; +} + +// 8-bit floating-point immediate encodings. +def FPImmOperand : AsmOperandClass { + let Name = "FPImm"; + let ParserMethod = "tryParseFPImm"; + let DiagnosticType = "InvalidFPImm"; +} + +def CondCode : AsmOperandClass { + let Name = "CondCode"; + let DiagnosticType = "InvalidCondCode"; +} + +// A 32-bit register pasrsed as 64-bit +def GPR32as64Operand : AsmOperandClass { + let Name = "GPR32as64"; +} +def GPR32as64 : RegisterOperand<GPR32> { + let ParserMatchClass = GPR32as64Operand; +} + +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. +def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } + + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// ADR[P] instruction labels. +def AdrpOperand : AsmOperandClass { + let Name = "AdrpLabel"; + let ParserMethod = "tryParseAdrpLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrplabel : Operand<i64> { + let EncoderMethod = "getAdrLabelOpValue"; + let PrintMethod = "printAdrpLabel"; + let ParserMatchClass = AdrpOperand; +} + +def AdrOperand : AsmOperandClass { + let Name = "AdrLabel"; + let ParserMethod = "tryParseAdrLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrlabel : Operand<i64> { + let EncoderMethod = "getAdrLabelOpValue"; + let ParserMatchClass = AdrOperand; +} + +// simm9 predicate - True if the immediate is in the range [-256, 255]. +def SImm9Operand : AsmOperandClass { + let Name = "SImm9"; + let DiagnosticType = "InvalidMemoryIndexedSImm9"; +} +def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> { + let ParserMatchClass = SImm9Operand; +} + +// simm7sN predicate - True if the immediate is a multiple of N in the range +// [-64 * N, 63 * N]. 
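+// These scaled 7-bit offsets are used by the load/store-pair instructions.
+// For example (illustrative), with Scale = 8 the legal offsets are -512..504
+// in steps of 8, so "ldp x0, x1, [sp, #16]" is encodable while
+// "ldp x0, x1, [sp, #4]" is not.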
+class SImm7Scaled<int Scale> : AsmOperandClass { + let Name = "SImm7s" # Scale; + let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7"; +} + +def SImm7s4Operand : SImm7Scaled<4>; +def SImm7s8Operand : SImm7Scaled<8>; +def SImm7s16Operand : SImm7Scaled<16>; + +def simm7s4 : Operand<i32> { + let ParserMatchClass = SImm7s4Operand; + let PrintMethod = "printImmScale<4>"; +} + +def simm7s8 : Operand<i32> { + let ParserMatchClass = SImm7s8Operand; + let PrintMethod = "printImmScale<8>"; +} + +def simm7s16 : Operand<i32> { + let ParserMatchClass = SImm7s16Operand; + let PrintMethod = "printImmScale<16>"; +} + +def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>; +def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>; +def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>; +def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>; +def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; + +class AsmImmRange<int Low, int High> : AsmOperandClass { + let Name = "Imm" # Low # "_" # High; + let DiagnosticType = "InvalidImm" # Low # "_" # High; +} + +def Imm1_8Operand : AsmImmRange<1, 8>; +def Imm1_16Operand : AsmImmRange<1, 16>; +def Imm1_32Operand : AsmImmRange<1, 32>; +def Imm1_64Operand : AsmImmRange<1, 64>; + +def MovZSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG3"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g3 : Operand<i32> { + let ParserMatchClass = MovZSymbolG3AsmOperand; +} + +def MovZSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG2"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g2 : Operand<i32> { + let ParserMatchClass = MovZSymbolG2AsmOperand; +} + +def MovZSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG1"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g1 : Operand<i32> { + let ParserMatchClass = MovZSymbolG1AsmOperand; +} + +def MovZSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG0"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g0 : Operand<i32> { + let ParserMatchClass = MovZSymbolG0AsmOperand; +} + +def MovKSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG3"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g3 : Operand<i32> { + let ParserMatchClass = MovKSymbolG3AsmOperand; +} + +def MovKSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG2"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g2 : Operand<i32> { + let ParserMatchClass = MovKSymbolG2AsmOperand; +} + +def MovKSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG1"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g1 : Operand<i32> { + let ParserMatchClass = MovKSymbolG1AsmOperand; +} + +def MovKSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG0"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g0 : Operand<i32> { + let ParserMatchClass = MovKSymbolG0AsmOperand; +} + +class fixedpoint_i32<ValueType FloatVT> + : Operand<FloatVT>, + ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm32"; + let ParserMatchClass = Imm1_32Operand; +} + +class fixedpoint_i64<ValueType FloatVT> + : Operand<FloatVT>, + ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = 
"DecodeFixedPointScaleImm64"; + let ParserMatchClass = Imm1_64Operand; +} + +def fixedpoint_f16_i32 : fixedpoint_i32<f16>; +def fixedpoint_f32_i32 : fixedpoint_i32<f32>; +def fixedpoint_f64_i32 : fixedpoint_i32<f64>; + +def fixedpoint_f16_i64 : fixedpoint_i64<f16>; +def fixedpoint_f32_i64 : fixedpoint_i64<f32>; +def fixedpoint_f64_i64 : fixedpoint_i64<f64>; + +def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR8OpValue"; + let DecoderMethod = "DecodeVecShiftR8Imm"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16Imm"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32Imm"; + let ParserMatchClass = Imm1_32Operand; +} +def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64Imm"; + let ParserMatchClass = Imm1_64Operand; +} +def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; + let ParserMatchClass = Imm1_32Operand; +} + +def Imm0_1Operand : AsmImmRange<0, 1>; +def Imm0_7Operand : AsmImmRange<0, 7>; +def Imm0_15Operand : AsmImmRange<0, 15>; +def Imm0_31Operand : AsmImmRange<0, 31>; +def Imm0_63Operand : AsmImmRange<0, 63>; + +def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) < 8); +}]> { + let EncoderMethod = "getVecShiftL8OpValue"; + let DecoderMethod = "DecodeVecShiftL8Imm"; + let ParserMatchClass = Imm0_7Operand; +} +def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) < 16); +}]> { + let EncoderMethod = "getVecShiftL16OpValue"; + let DecoderMethod = "DecodeVecShiftL16Imm"; + let ParserMatchClass = Imm0_15Operand; +} +def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) < 32); +}]> { + let EncoderMethod = "getVecShiftL32OpValue"; + let DecoderMethod = "DecodeVecShiftL32Imm"; + let ParserMatchClass = Imm0_31Operand; +} +def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{ + return (((uint32_t)Imm) < 64); +}]> { + let EncoderMethod = "getVecShiftL64OpValue"; + let DecoderMethod = "DecodeVecShiftL64Imm"; + let ParserMatchClass = Imm0_63Operand; +} + + +// Crazy immediate formats used by 32-bit and 64-bit logical immediate +// instructions for splatting repeating bit patterns across the immediate. 
+def logical_imm32_XFORM : SDNodeXForm<imm, [{ + uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); +}]>; +def logical_imm64_XFORM : SDNodeXForm<imm, [{ + uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); +}]>; + +let DiagnosticType = "LogicalSecondSource" in { + def LogicalImm32Operand : AsmOperandClass { + let Name = "LogicalImm32"; + } + def LogicalImm64Operand : AsmOperandClass { + let Name = "LogicalImm64"; + } + def LogicalImm32NotOperand : AsmOperandClass { + let Name = "LogicalImm32Not"; + } + def LogicalImm64NotOperand : AsmOperandClass { + let Name = "LogicalImm64Not"; + } +} +def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32); +}], logical_imm32_XFORM> { + let PrintMethod = "printLogicalImm32"; + let ParserMatchClass = LogicalImm32Operand; +} +def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64); +}], logical_imm64_XFORM> { + let PrintMethod = "printLogicalImm64"; + let ParserMatchClass = LogicalImm64Operand; +} +def logical_imm32_not : Operand<i32> { + let ParserMatchClass = LogicalImm32NotOperand; +} +def logical_imm64_not : Operand<i64> { + let ParserMatchClass = LogicalImm64NotOperand; +} + +// imm0_65535 predicate - True if the immediate is in the range [0,65535]. +def Imm0_65535Operand : AsmImmRange<0, 65535>; +def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint32_t)Imm) < 65536; +}]> { + let ParserMatchClass = Imm0_65535Operand; + let PrintMethod = "printHexImm"; +} + +// imm0_255 predicate - True if the immediate is in the range [0,255]. +def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } +def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint32_t)Imm) < 256; +}]> { + let ParserMatchClass = Imm0_255Operand; + let PrintMethod = "printHexImm"; +} + +// imm0_127 predicate - True if the immediate is in the range [0,127] +def Imm0_127Operand : AsmImmRange<0, 127>; +def imm0_127 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint32_t)Imm) < 128; +}]> { + let ParserMatchClass = Imm0_127Operand; + let PrintMethod = "printHexImm"; +} + +// NOTE: These imm0_N operands have to be of type i64 because i64 is the size +// for all shift-amounts. 
+ +// imm0_63 predicate - True if the immediate is in the range [0,63] +def imm0_63 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 64; +}]> { + let ParserMatchClass = Imm0_63Operand; +} + +// imm0_31 predicate - True if the immediate is in the range [0,31] +def imm0_31 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 32; +}]> { + let ParserMatchClass = Imm0_31Operand; +} + +// True if the 32-bit immediate is in the range [0,31] +def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint64_t)Imm) < 32; +}]> { + let ParserMatchClass = Imm0_31Operand; +} + +// imm0_1 predicate - True if the immediate is in the range [0,1] +def imm0_1 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 2; +}]> { + let ParserMatchClass = Imm0_1Operand; +} + +// imm0_15 predicate - True if the immediate is in the range [0,15] +def imm0_15 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 16; +}]> { + let ParserMatchClass = Imm0_15Operand; +} + +// imm0_7 predicate - True if the immediate is in the range [0,7] +def imm0_7 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 8; +}]> { + let ParserMatchClass = Imm0_7Operand; +} + +// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] +def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint32_t)Imm) < 16; +}]> { + let ParserMatchClass = Imm0_15Operand; +} + +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr +// {5-0} - imm6 +class arith_shift<ValueType Ty, int width> : Operand<Ty> { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast<AsmOperandClass>( + "ArithmeticShifterOperand" # width); +} + +def arith_shift32 : arith_shift<i32, 32>; +def arith_shift64 : arith_shift<i64, 64>; + +class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width> + : Operand<Ty>, + ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, !cast<Operand>("arith_shift" # width)); +} + +def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>; +def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>; + +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror +// {5-0} - imm6 +class logical_shift<int width> : Operand<i32> { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast<AsmOperandClass>( + "LogicalShifterOperand" # width); +} + +def logical_shift32 : logical_shift<32>; +def logical_shift64 : logical_shift<64>; + +class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop> + : Operand<Ty>, + ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, shiftop); +} + +def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>; +def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>; + +// A logical vector shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0, #8, #16, or #24 +def logical_vec_shift : Operand<i32> { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecShifterOperand; +} + +// A logical vector half-word shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #8 +def logical_vec_hw_shift : Operand<i32> { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecHalfWordShifterOperand; +} + +// A 
vector move shifter operand: +// {0} - imm1: #8 or #16 +def move_vec_shift : Operand<i32> { + let PrintMethod = "printShifter"; + let EncoderMethod = "getMoveVecShifterOpValue"; + let ParserMatchClass = MoveVecShifterOperand; +} + +let DiagnosticType = "AddSubSecondSource" in { + def AddSubImmOperand : AsmOperandClass { + let Name = "AddSubImm"; + let ParserMethod = "tryParseAddSubImm"; + } + def AddSubImmNegOperand : AsmOperandClass { + let Name = "AddSubImmNeg"; + let ParserMethod = "tryParseAddSubImm"; + } +} +// An ADD/SUB immediate shifter operand: +// second operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #12 +class addsub_shifted_imm<ValueType Ty> + : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} + +class addsub_shifted_imm_neg<ValueType Ty> + : Operand<Ty> { + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmNegOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} + +def addsub_shifted_imm32 : addsub_shifted_imm<i32>; +def addsub_shifted_imm64 : addsub_shifted_imm<i64>; +def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>; +def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>; + +class neg_addsub_shifted_imm<ValueType Ty> + : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} + +def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>; +def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>; + +// An extend operand: +// {5-3} - extend type +// {2-0} - imm3 +def arith_extend : Operand<i32> { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand; +} +def arith_extend64 : Operand<i32> { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand64; +} + +// 'extend' that's a lsl of a 64-bit register. +def arith_extendlsl64 : Operand<i32> { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperandLSL64; +} + +class arith_extended_reg32<ValueType Ty> : Operand<Ty>, + ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend); +} + +class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>, + ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend64); +} + +// Floating-point immediate. 
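+// FMOV (immediate) can only encode values of the form +/- n/16 * 2^r with
+// n in [16,31] and r in [-3,4]; the getFP*Imm helpers used below return -1
+// for anything else. For example (illustrative), "fmov d0, #1.0" and
+// "fmov s1, #-0.5" are encodable, but 0.1 is not and must be materialized
+// differently (e.g. via a literal load).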
+def fpimm16 : Operand<f16>, + PatLeaf<(f16 fpimm), [{ + return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::getFP16Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} +def fpimm32 : Operand<f32>, + PatLeaf<(f32 fpimm), [{ + return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::getFP32Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} +def fpimm64 : Operand<f64>, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::getFP64Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} + +def fpimm8 : Operand<i32> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} + +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +// Vector lane operands +class AsmVectorIndex<string Suffix> : AsmOperandClass { + let Name = "VectorIndex" # Suffix; + let DiagnosticType = "InvalidIndex" # Suffix; +} +def VectorIndex1Operand : AsmVectorIndex<"1">; +def VectorIndexBOperand : AsmVectorIndex<"B">; +def VectorIndexHOperand : AsmVectorIndex<"H">; +def VectorIndexSOperand : AsmVectorIndex<"S">; +def VectorIndexDOperand : AsmVectorIndex<"D">; + +def VectorIndex1 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) == 1; +}]> { + let ParserMatchClass = VectorIndex1Operand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 16; +}]> { + let ParserMatchClass = VectorIndexBOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 8; +}]> { + let ParserMatchClass = VectorIndexHOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 4; +}]> { + let ParserMatchClass = VectorIndexSOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 2; +}]> { + let ParserMatchClass = VectorIndexDOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} + +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. 
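+// For example, 0xFF00FF00FF00FF00 (every byte either all-ones or all-zero)
+// is a valid type-10 immediate and encodes as abcdefgh = 0b10101010 = 0xAA,
+// whereas a value with any other byte pattern is rejected by
+// isAdvSIMDModImmType10.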
+def simdimmtype10 : Operand<i32>, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let ParserMatchClass = SIMDImmType10Operand; + let PrintMethod = "printSIMDType10Operand"; +} + + +//--- +// System management +//--- + +// Base encoding for system instruction operands. +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands, + list<dag> pattern = []> + : I<oops, iops, asm, operands, "", pattern> { + let Inst{31-22} = 0b1101010100; + let Inst{21} = L; +} + +// System instructions which do not have an Rt register. +class SimpleSystemI<bit L, dag iops, string asm, string operands, + list<dag> pattern = []> + : BaseSystemI<L, (outs), iops, asm, operands, pattern> { + let Inst{4-0} = 0b11111; +} + +// System instructions which have an Rt register. +class RtSystemI<bit L, dag oops, dag iops, string asm, string operands> + : BaseSystemI<L, oops, iops, asm, operands>, + Sched<[WriteSys]> { + bits<5> Rt; + let Inst{4-0} = Rt; +} + +// Hint instructions that take both a CRm and a 3-bit immediate. +// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot +// model patterns with sufficiently fine granularity +let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in + class HintI<string mnemonic> + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "", + [(int_aarch64_hint imm0_127:$imm)]>, + Sched<[WriteHint]> { + bits <7> imm; + let Inst{20-12} = 0b000110010; + let Inst{11-5} = imm; + } + +// System instructions taking a single literal operand which encodes into +// CRm. op2 differentiates the opcodes. +def BarrierAsmOperand : AsmOperandClass { + let Name = "Barrier"; + let ParserMethod = "tryParseBarrierOperand"; +} +def barrier_op : Operand<i32> { + let PrintMethod = "printBarrierOption"; + let ParserMatchClass = BarrierAsmOperand; +} +class CRmSystemI<Operand crmtype, bits<3> opc, string asm, + list<dag> pattern = []> + : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>, + Sched<[WriteBarrier]> { + bits<4> CRm; + let Inst{20-12} = 0b000110011; + let Inst{11-8} = CRm; + let Inst{7-5} = opc; +} + +// MRS/MSR system instructions. These have different operand classes because +// a different subset of registers can be accessed through each instruction. +def MRSSystemRegisterOperand : AsmOperandClass { + let Name = "MRSSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MRS"; +} +// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate. 
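+// For example, NZCV (S3_3_C4_C2_0) has op0=0b11, op1=0b011, CRn=0b0100,
+// CRm=0b0010, op2=0b000, so "mrs x0, NZCV" carries
+// 0b11_011_0100_0010_000 = 0xDA10 in this operand.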
+def mrs_sysreg_op : Operand<i32> { + let ParserMatchClass = MRSSystemRegisterOperand; + let DecoderMethod = "DecodeMRSSystemRegister"; + let PrintMethod = "printMRSSystemRegister"; +} + +def MSRSystemRegisterOperand : AsmOperandClass { + let Name = "MSRSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MSR"; +} +def msr_sysreg_op : Operand<i32> { + let ParserMatchClass = MSRSystemRegisterOperand; + let DecoderMethod = "DecodeMSRSystemRegister"; + let PrintMethod = "printMSRSystemRegister"; +} + +def PSBHintOperand : AsmOperandClass { + let Name = "PSBHint"; + let ParserMethod = "tryParsePSBHint"; +} +def psbhint_op : Operand<i32> { + let ParserMatchClass = PSBHintOperand; + let PrintMethod = "printPSBHintOp"; + let MCOperandPredicate = [{ + // Check, if operand is valid, to fix exhaustive aliasing in disassembly. + // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. + if (!MCOp.isImm()) + return false; + bool ValidNamed; + (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), + STI.getFeatureBits(), ValidNamed); + return ValidNamed; + }]; +} + +class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), + "mrs", "\t$Rt, $systemreg"> { + bits<16> systemreg; + let Inst{20-5} = systemreg; +} + +// FIXME: Some of these def NZCV, others don't. Best way to model that? +// Explicitly modeling each of the system register as a register class +// would do it, but feels like overkill at this point. +class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), + "msr", "\t$systemreg, $Rt"> { + bits<16> systemreg; + let Inst{20-5} = systemreg; +} + +def SystemPStateFieldWithImm0_15Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_15"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield4_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_15Operand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateImm0_15 + : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm), + "msr", "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bits<4> imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-12} = 0b0100; + let Inst{11-8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. + let hasCompleteDecoder = 0; +} + +def SystemPStateFieldWithImm0_1Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_1"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield1_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_1Operand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateImm0_1 + : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm), + "msr", "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bit imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-9} = 0b0100000; + let Inst{8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. + let hasCompleteDecoder = 0; +} + +// SYS and SYSL generic system instructions. 
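+// These expose the raw op1/Cn/Cm/op2 (and optional Xt) fields directly; for
+// example "sys #0, c7, c5, #0" is the generic spelling of what the
+// architecture also names "ic iallu". The named cache/TLB maintenance aliases
+// themselves are handled outside these formats.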
+def SysCRAsmOperand : AsmOperandClass { + let Name = "SysCR"; + let ParserMethod = "tryParseSysCROperand"; +} + +def sys_cr_op : Operand<i32> { + let PrintMethod = "printSysCROperand"; + let ParserMatchClass = SysCRAsmOperand; +} + +class SystemXtI<bit L, string asm> + : RtSystemI<L, (outs), + (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt), + asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; +} + +class SystemLXtI<bit L, string asm> + : RtSystemI<L, (outs), + (ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2), + asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; +} + + +// Branch (register) instructions: +// +// case opc of +// 0001 blr +// 0000 br +// 0101 dret +// 0100 eret +// 0010 ret +// otherwise UNDEFINED +class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm, + string operands, list<dag> pattern> + : I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> { + let Inst{31-25} = 0b1101011; + let Inst{24-21} = opc; + let Inst{20-16} = 0b11111; + let Inst{15-10} = 0b000000; + let Inst{4-0} = 0b00000; +} + +class BranchReg<bits<4> opc, string asm, list<dag> pattern> + : BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> { + bits<5> Rn; + let Inst{9-5} = Rn; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in +class SpecialReturn<bits<4> opc, string asm> + : BaseBranchReg<opc, (outs), (ins), asm, "", []> { + let Inst{9-5} = 0b11111; +} + +//--- +// Conditional branch instruction. +//--- + +// Condition code. +// 4-bit immediate. Pretty-printed as <cc> +def ccode : Operand<i32> { + let PrintMethod = "printCondCode"; + let ParserMatchClass = CondCode; +} +def inv_ccode : Operand<i32> { + // AL and NV are invalid in the aliases which use inv_ccode + let PrintMethod = "printInverseCondCode"; + let ParserMatchClass = CondCode; + let MCOperandPredicate = [{ + return MCOp.isImm() && + MCOp.getImm() != AArch64CC::AL && + MCOp.getImm() != AArch64CC::NV; + }]; +} + +// Conditional branch target. 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. +def PCRelLabel19Operand : AsmOperandClass { + let Name = "PCRelLabel19"; + let DiagnosticType = "InvalidLabel"; +} +def am_brcond : Operand<OtherVT> { + let EncoderMethod = "getCondBranchTargetOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; +} + +class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), + "b", ".$cond\t$target", "", + [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + let Uses = [NZCV]; + + bits<4> cond; + bits<19> target; + let Inst{31-24} = 0b01010100; + let Inst{23-5} = target; + let Inst{4} = 0; + let Inst{3-0} = cond; +} + +//--- +// Compare-and-branch instructions. 
+//--- +class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node> + : I<(outs), (ins regtype:$Rt, am_brcond:$target), + asm, "\t$Rt, $target", "", + [(node regtype:$Rt, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<19> target; + let Inst{30-25} = 0b011010; + let Inst{24} = op; + let Inst{23-5} = target; + let Inst{4-0} = Rt; +} + +multiclass CmpBranch<bit op, string asm, SDNode node> { + def W : BaseCmpBranch<GPR32, op, asm, node> { + let Inst{31} = 0; + } + def X : BaseCmpBranch<GPR64, op, asm, node> { + let Inst{31} = 1; + } +} + +//--- +// Test-bit-and-branch instructions. +//--- +// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of +// the target offset are implied zero and so are not part of the immediate. +def BranchTarget14Operand : AsmOperandClass { + let Name = "BranchTarget14"; +} +def am_tbrcond : Operand<OtherVT> { + let EncoderMethod = "getTestBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget14Operand; +} + +// AsmOperand classes to emit (or not) special diagnostics +def TBZImm0_31Operand : AsmOperandClass { + let Name = "TBZImm0_31"; + let PredicateMethod = "isImm0_31"; + let RenderMethod = "addImm0_31Operands"; +} +def TBZImm32_63Operand : AsmOperandClass { + let Name = "Imm32_63"; + let DiagnosticType = "InvalidImm0_63"; +} + +class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{ + return (((uint32_t)Imm) < 32); +}]> { + let ParserMatchClass = matcher; +} + +def tbz_imm0_31_diag : tbz_imm0_31<Imm0_31Operand>; +def tbz_imm0_31_nodiag : tbz_imm0_31<TBZImm0_31Operand>; + +def tbz_imm32_63 : Operand<i64>, ImmLeaf<i64, [{ + return (((uint32_t)Imm) > 31) && (((uint32_t)Imm) < 64); +}]> { + let ParserMatchClass = TBZImm32_63Operand; +} + +class BaseTestBranch<RegisterClass regtype, Operand immtype, + bit op, string asm, SDNode node> + : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), + asm, "\t$Rt, $bit_off, $target", "", + [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<6> bit_off; + bits<14> target; + + let Inst{30-25} = 0b011011; + let Inst{24} = op; + let Inst{23-19} = bit_off{4-0}; + let Inst{18-5} = target; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeTestAndBranch"; +} + +multiclass TestBranch<bit op, string asm, SDNode node> { + def W : BaseTestBranch<GPR32, tbz_imm0_31_diag, op, asm, node> { + let Inst{31} = 0; + } + + def X : BaseTestBranch<GPR64, tbz_imm32_63, op, asm, node> { + let Inst{31} = 1; + } + + // Alias X-reg with 0-31 imm to W-Reg. + def : InstAlias<asm # "\t$Rd, $imm, $target", + (!cast<Instruction>(NAME#"W") GPR32as64:$Rd, + tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; + def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), + (!cast<Instruction>(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), + tbz_imm0_31_diag:$imm, bb:$target)>; +} + +//--- +// Unconditional branch (immediate) instructions. 
+//--- +def BranchTarget26Operand : AsmOperandClass { + let Name = "BranchTarget26"; + let DiagnosticType = "InvalidLabel"; +} +def am_b_target : Operand<OtherVT> { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} +def am_bl_target : Operand<i64> { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} + +class BImm<bit op, dag iops, string asm, list<dag> pattern> + : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { + bits<26> addr; + let Inst{31} = op; + let Inst{30-26} = 0b00101; + let Inst{25-0} = addr; + + let DecoderMethod = "DecodeUnconditionalBranch"; +} + +class BranchImm<bit op, string asm, list<dag> pattern> + : BImm<op, (ins am_b_target:$addr), asm, pattern>; +class CallImm<bit op, string asm, list<dag> pattern> + : BImm<op, (ins am_bl_target:$addr), asm, pattern>; + +//--- +// Basic one-operand data processing instructions. +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm, + SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set regtype:$Rd, (node regtype:$Rn))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + + let Inst{30-13} = 0b101101011000000000; + let Inst{12-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass OneOperandData<bits<3> opc, string asm, + SDPatternOperator node = null_frag> { + def Wr : BaseOneOperandData<opc, GPR32, asm, node> { + let Inst{31} = 0; + } + + def Xr : BaseOneOperandData<opc, GPR64, asm, node> { + let Inst{31} = 1; + } +} + +class OneWRegData<bits<3> opc, string asm, SDPatternOperator node> + : BaseOneOperandData<opc, GPR32, asm, node> { + let Inst{31} = 0; +} + +class OneXRegData<bits<3> opc, string asm, SDPatternOperator node> + : BaseOneOperandData<opc, GPR64, asm, node> { + let Inst{31} = 1; +} + +//--- +// Basic two-operand data processing instructions. +//--- +class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm, + list<dag> pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30} = isSub; + let Inst{28-21} = 0b11010000; + let Inst{20-16} = Rm; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm, + SDNode OpNode> + : BaseBaseAddSubCarry<isSub, regtype, asm, + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]>; + +class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm, + SDNode OpNode> + : BaseBaseAddSubCarry<isSub, regtype, asm, + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV)), + (implicit NZCV)]> { + let Defs = [NZCV]; +} + +multiclass AddSubCarry<bit isSub, string asm, string asm_setflags, + SDNode OpNode, SDNode OpNode_setflags> { + def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> { + let Inst{31} = 0; + let Inst{29} = 0; + } + def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> { + let Inst{31} = 1; + let Inst{29} = 0; + } + + // Sets flags. 
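+  // e.g. ADCS/SBCS: the same add/subtract-with-carry computation as the non-S
+  // defs above, but additionally writing NZCV, hence the separate
+  // BaseAddSubCarrySetFlags class with Defs = [NZCV].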
+ def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags, + OpNode_setflags> { + let Inst{31} = 0; + let Inst{29} = 1; + } + def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags, + OpNode_setflags> { + let Inst{31} = 1; + let Inst{29} = 1; + } +} + +class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm, + SDPatternOperator OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-14} = 0b00; + let Inst{13-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseDiv<bit isSigned, RegisterClass regtype, string asm, + SDPatternOperator OpNode> + : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { + let Inst{10} = isSigned; +} + +multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> { + def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>, + Sched<[WriteID32, ReadID, ReadID]> { + let Inst{31} = 0; + } + def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>, + Sched<[WriteID64, ReadID, ReadID]> { + let Inst{31} = 1; + } +} + +class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm, + SDPatternOperator OpNode = null_frag> + : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, + Sched<[WriteIS, ReadI]> { + let Inst{11-10} = shift_type; +} + +multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> { + def Wr : BaseShift<shift_type, GPR32, asm> { + let Inst{31} = 0; + } + + def Xr : BaseShift<shift_type, GPR64, asm, OpNode> { + let Inst{31} = 1; + } + + def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), + (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, + (EXTRACT_SUBREG i64:$Rm, sub_32))>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), + (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), + (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), + (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; +} + +class ShiftAlias<string asm, Instruction inst, RegisterClass regtype> + : InstAlias<asm#"\t$dst, $src1, $src2", + (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>; + +class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype, + RegisterClass addtype, string asm, + list<dag> pattern> + : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{30-24} = 0b0011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass MulAccum<bit isSub, string asm, SDNode AccNode> { + // MADD/MSUB generation is decided by MachineCombiner.cpp + def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm, + [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>, + Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { + let Inst{31} = 0; + } + + def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm, + [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>, + Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> { + let Inst{31} = 1; + } +} + +class WideMulAccum<bit isSub, bits<3> opc, string asm, + SDNode AccNode, SDNode ExtNode> + : BaseMulAccum<isSub, opc, GPR32, GPR64, asm, + [(set GPR64:$Rd, (AccNode GPR64:$Ra, + (mul (ExtNode GPR32:$Rn), (ExtNode 
GPR32:$Rm))))]>, + Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { + let Inst{31} = 1; +} + +class MulHi<bits<3> opc, string asm, SDNode OpNode> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, + Sched<[WriteIM64, ReadIM, ReadIM]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-24} = 0b10011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + let PostEncoderMethod = "fixMulHigh"; +} + +class MulAccumWAlias<string asm, Instruction inst> + : InstAlias<asm#"\t$dst, $src1, $src2", + (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>; +class MulAccumXAlias<string asm, Instruction inst> + : InstAlias<asm#"\t$dst, $src1, $src2", + (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>; +class WideMulAccumAlias<string asm, Instruction inst> + : InstAlias<asm#"\t$dst, $src1, $src2", + (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>; + +class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg, + SDPatternOperator OpNode, string asm> + : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = sf; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-13} = 0b010; + let Inst{12} = C; + let Inst{11-10} = sz; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let Predicates = [HasCRC]; +} + +//--- +// Address generation. +//--- + +class ADRI<bit page, string asm, Operand adr, list<dag> pattern> + : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", + pattern>, + Sched<[WriteI]> { + bits<5> Xd; + bits<21> label; + let Inst{31} = page; + let Inst{30-29} = label{1-0}; + let Inst{28-24} = 0b10000; + let Inst{23-5} = label{20-2}; + let Inst{4-0} = Xd; + + let DecoderMethod = "DecodeAdrInstruction"; +} + +//--- +// Move immediate. 
+//--- + +def movimm32_imm : Operand<i32> { + let ParserMatchClass = Imm0_65535Operand; + let EncoderMethod = "getMoveWideImmOpValue"; + let PrintMethod = "printHexImm"; +} +def movimm32_shift : Operand<i32> { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm32ShifterOperand; +} +def movimm64_shift : Operand<i32> { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm64ShifterOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "", []>, + Sched<[WriteImm]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass MoveImmediate<bits<2> opc, string asm> { + def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> { + let Inst{31} = 0; + } + + def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), + (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass InsertImmediate<bits<2> opc, string asm> { + def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> { + let Inst{31} = 0; + } + + def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> { + let Inst{31} = 1; + } +} + +//--- +// Add/Subtract +//--- + +class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype, + RegisterClass srcRegtype, addsub_shifted_imm immtype, + string asm, SDPatternOperator OpNode> + : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", + [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<14> imm; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b10001; + let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 + let Inst{21-10} = imm{11-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let DecoderMethod = "DecodeBaseAddSubImm"; +} + +class BaseAddSubRegPseudo<RegisterClass regtype, + SDPatternOperator OpNode> + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype, + arith_shifted_reg shifted_regtype, string asm, + SDPatternOperator OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. 
+ bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-22} = shift{7-6}; + let Inst{21} = 0; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype, + RegisterClass src1Regtype, Operand src2Regtype, + string asm, SDPatternOperator OpNode> + : I<(outs dstRegtype:$R1), + (ins src1Regtype:$R2, src2Regtype:$R3), + asm, "\t$R1, $R2, $R3", "", + [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15-13} = ext{5-3}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype, + RegisterClass src1Regtype, RegisterClass src2Regtype, + Operand ext_op, string asm> + : I<(outs dstRegtype:$Rd), + (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), + asm, "\t$Rd, $Rn, $Rm$ext", "", []>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15} = ext{5}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +// Aliases for register+register add/subtract. +class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype, + RegisterClass src1Regtype, RegisterClass src2Regtype, + int shiftExt> + : InstAlias<asm#"\t$dst, $src1, $src2", + (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2, + shiftExt)>; + +multiclass AddSub<bit isSub, string mnemonic, string alias, + SDPatternOperator OpNode = null_frag> { + let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in { + // Add/Subtract immediate + // Increase the weight of the immediate variant to try to match it before + // the extended register variant. + // We used to match the register variant before the immediate when the + // register argument could be implicitly zero-extended. 
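+  // In other words, when a node could be selected either way, the
+  // AddedComplexity = 6 below biases instruction selection toward the
+  // immediate encoding.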
+ let AddedComplexity = 6 in + def Wri : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32, + mnemonic, OpNode> { + let Inst{31} = 0; + } + let AddedComplexity = 6 in + def Xri : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64, + mnemonic, OpNode> { + let Inst{31} = 1; + } + + // Add/Subtract register - Only used for CodeGen + def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>; + def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic, + OpNode> { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic, + OpNode> { + let Inst{31} = 1; + } + } + + // Add/Subtract extended register + let AddedComplexity = 1, hasSideEffects = 0 in { + def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp, + arith_extended_reg32<i32>, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp, + arith_extended_reg32to64<i64>, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64, + arith_extendlsl64, mnemonic> { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + + // add Rd, Rb, -imm -> sub Rd, Rn, imm + def : InstAlias<alias#"\t$Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn, + addsub_shifted_imm32_neg:$imm), 0>; + def : InstAlias<alias#"\t$Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn, + addsub_shifted_imm64_neg:$imm), 0>; + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when either the destination or + // first source register is SP. 
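+  // e.g. "add sp, x0, x1" or "add x0, sp, x1" assembles to the
+  // extended-register form; the literals 16 and 24 below encode the implicit
+  // UXTW #0 and UXTX #0 respectively.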
+ def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"), + GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"), + GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias<mnemonic, + !cast<Instruction>(NAME#"Xrx64"), + GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 + def : AddSubRegAlias<mnemonic, + !cast<Instruction>(NAME#"Xrx64"), + GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, + string alias, string cmpAlias> { + let isCompare = 1, Defs = [NZCV] in { + // Add/Subtract immediate + def Wri : BaseAddSubImm<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32, + mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64, + mnemonic, OpNode> { + let Inst{31} = 1; + } + + // Add/Subtract register + def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>; + def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic, + OpNode> { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic, + OpNode> { + let Inst{31} = 1; + } + + // Add/Subtract extended register + let AddedComplexity = 1 in { + def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp, + arith_extended_reg32<i32>, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp, + arith_extended_reg32<i64>, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64, + arith_extendlsl64, mnemonic> { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + } // Defs = [NZCV] + + // Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm + def : InstAlias<alias#"\t$Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn, + addsub_shifted_imm32_neg:$imm), 0>; + def : InstAlias<alias#"\t$Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn, + addsub_shifted_imm64_neg:$imm), 0>; + + // Compare aliases + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") + WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") + XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") + WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") + XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") + XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; + + // Support negative immediates, e.g. 
cmp Rn, -imm -> cmn Rn, imm + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") + WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>; + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") + XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>; + + // Compare shorthands + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, 0), 5>; + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, 0), 5>; + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx") + WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64") + XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when the first source register + // is SP. + def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"), + GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias<mnemonic, + !cast<Instruction>(NAME#"Xrx64"), + GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +//--- +// Extract +//--- +def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisPtrTy<3>]>; +def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; + +class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm, + list<dag> patterns> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), + asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, + Sched<[WriteExtr, ReadExtrHi]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> imm; + + let Inst{30-23} = 0b00100111; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15-10} = imm; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ExtractImm<string asm> { + def Wrri : BaseExtractImm<GPR32, imm0_31, asm, + [(set GPR32:$Rd, + (AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> { + let Inst{31} = 0; + let Inst{22} = 0; + // imm<5> must be zero. + let imm{5} = 0; + } + def Xrri : BaseExtractImm<GPR64, imm0_63, asm, + [(set GPR64:$Rd, + (AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> { + + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Bitfield +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImm<bits<2> opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImm<bits<2> opc, string asm> { + def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). 
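+    // (Both fields are effectively 5 bits in the 32-bit form; e.g.
+    // "sbfm w0, w1, #0, #7", the expansion of "sxtb w0, w1", uses
+    // immr=0, imms=7.)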
+ let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImmWith2RegArgs<bits<2> opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, + imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> { + def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Logical +//--- + +// Logical (immediate) +class BaseLogicalImm<bits<2> opc, RegisterClass dregtype, + RegisterClass sregtype, Operand imm_type, string asm, + list<dag> pattern> + : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<13> imm; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100100; + let Inst{22} = imm{12}; + let Inst{21-16} = imm{11-6}; + let Inst{15-10} = imm{5-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeLogicalImmInstruction"; +} + +// Logical (shifted register) +class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype, + logical_shifted_reg shifted_regtype, string asm, + list<dag> pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30-29} = opc; + let Inst{28-24} = 0b01010; + let Inst{23-22} = shift{7-6}; + let Inst{21} = N; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +// Aliases for register+register logical instructions. +class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype> + : InstAlias<asm#"\t$dst, $src1, $src2", + (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>; + +multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, + string Alias> { + let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in + def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic, + [(set GPR32sp:$Rd, (OpNode GPR32:$Rn, + logical_imm32:$imm))]> { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. 
+ } + let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in + def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic, + [(set GPR64sp:$Rd, (OpNode GPR64:$Rn, + logical_imm64:$imm))]> { + let Inst{31} = 1; + } + + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, + logical_imm32_not:$imm), 0>; + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, + logical_imm64_not:$imm), 0>; +} + +multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode, + string Alias> { + let isCompare = 1, Defs = [NZCV] in { + def Wri : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic, + [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic, + [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> { + let Inst{31} = 1; + } + } // end Defs = [NZCV] + + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, + logical_imm32_not:$imm), 0>; + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", + (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, + logical_imm64_not:$imm), 0>; +} + +class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode> + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +// Split from LogicalImm as not all instructions have both. +multiclass LogicalReg<bits<2> opc, bit N, string mnemonic, + SDPatternOperator OpNode> { + let isReMaterializable = 1, isAsCheapAsAMove = 1 in { + def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>; + def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>; + } + + def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic, + [(set GPR32:$Rd, (OpNode GPR32:$Rn, + logical_shifted_reg32:$Rm))]> { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic, + [(set GPR64:$Rd, (OpNode GPR64:$Rn, + logical_shifted_reg64:$Rm))]> { + let Inst{31} = 1; + } + + def : LogicalRegAlias<mnemonic, + !cast<Instruction>(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias<mnemonic, + !cast<Instruction>(NAME#"Xrs"), GPR64>; +} + +// Split from LogicalReg to allow setting NZCV Defs +multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic, + SDPatternOperator OpNode = null_frag> { + let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>; + def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>; + + def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic, + [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_shifted_reg32:$Rm))]> { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic, + [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_shifted_reg64:$Rm))]> { + let Inst{31} = 1; + } + } // Defs = [NZCV] + + def : LogicalRegAlias<mnemonic, + !cast<Instruction>(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias<mnemonic, + !cast<Instruction>(NAME#"Xrs"), GPR64>; +} + +//--- +// Conditionally set flags +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype, + string mnemonic, SDNode OpNode> + : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $imm, 
$nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> imm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = imm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic, + SDNode OpNode> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> { + // immediate operand variants + def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> { + let Inst{31} = 1; + } + // register operand variants + def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> { + let Inst{31} = 1; + } +} + +//--- +// Conditional select +//--- + +class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b011010100; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = op2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass CondSelect<bit op, bits<2> op2, string asm> { + def Wr : BaseCondSelect<op, op2, GPR32, asm> { + let Inst{31} = 0; + } + def Xr : BaseCondSelect<op, op2, GPR64, asm> { + let Inst{31} = 1; + } +} + +class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm, + PatFrag frag> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, (frag regtype:$Rm), + (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b011010100; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = op2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +def inv_cond_XFORM : SDNodeXForm<imm, [{ + AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue()); + return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), SDLoc(N), + MVT::i32); +}]>; + +multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> { + def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> { + let Inst{31} = 0; + } + def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> { + let Inst{31} = 1; + } + + def : 
Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV), + (!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm, + (inv_cond_XFORM imm:$cond))>; + + def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV), + (!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm, + (inv_cond_XFORM imm:$cond))>; +} + +//--- +// Special Mask Value +//--- +def maski8_or_more : Operand<i32>, + ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> { +} +def maski16_or_more : Operand<i32>, + ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> { +} + + +//--- +// Load/store +//--- + +// (unsigned immediate) +// Indexed for 8-bit registers. offset is in range [0,4095]. +def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>; +def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>; +def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>; +def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>; +def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>; + +class UImm12OffsetOperand<int Scale> : AsmOperandClass { + let Name = "UImm12Offset" # Scale; + let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; + let PredicateMethod = "isUImm12Offset<" # Scale # ">"; + let DiagnosticType = "InvalidMemoryIndexed" # Scale; +} + +def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>; +def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>; +def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>; +def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>; +def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>; + +class uimm12_scaled<int Scale> : Operand<i64> { + let ParserMatchClass + = !cast<AsmOperandClass>("UImm12OffsetScale" # Scale # "Operand"); + let EncoderMethod + = "getLdStUImm12OpValue<AArch64::fixup_aarch64_ldst_imm12_scale" # Scale # ">"; + let PrintMethod = "printUImm12Offset<" # Scale # ">"; +} + +def uimm12s1 : uimm12_scaled<1>; +def uimm12s2 : uimm12_scaled<2>; +def uimm12s4 : uimm12_scaled<4>; +def uimm12s8 : uimm12_scaled<8>; +def uimm12s16 : uimm12_scaled<16>; + +class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list<dag> pattern> + : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> { + bits<5> Rt; + + bits<5> Rn; + bits<12> offset; + + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b01; + let Inst{23-22} = opc; + let Inst{21-10} = offset; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeUnsignedLdStInstruction"; +} + +multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list<dag> pattern> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt), + (ins GPR64sp:$Rn, indextype:$offset), + asm, pattern>, + Sched<[WriteLD]>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list<dag> pattern> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def ui : BaseLoadStoreUI<sz, V, opc, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset), + asm, pattern>, + Sched<[WriteST]>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +def PrefetchOperand : AsmOperandClass { + let Name = "Prefetch"; + 
let ParserMethod = "tryParsePrefetch"; +} +def prfop : Operand<i32> { + let PrintMethod = "printPrefetchOp"; + let ParserMatchClass = PrefetchOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat> + : BaseLoadStoreUI<sz, V, opc, + (outs), (ins prfop:$Rt, GPR64sp:$Rn, uimm12s8:$offset), + asm, pat>, + Sched<[WriteLD]>; + +//--- +// Load literal +//--- + +// Load literal address: 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. +def am_ldrlit : Operand<OtherVT> { + let EncoderMethod = "getLoadLiteralOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; +} + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm> + : I<(outs regtype:$Rt), (ins am_ldrlit:$label), + asm, "\t$Rt, $label", "", []>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat> + : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), + asm, "\t$Rt, $label", "", pat>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +//--- +// Load/store register offset +//--- + +def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>; +def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>; +def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>; +def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>; +def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>; + +def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>; +def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>; +def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>; +def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>; +def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>; + +class MemExtendOperand<string Reg, int Width> : AsmOperandClass { + let Name = "Mem" # Reg # "Extend" # Width; + let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; + let RenderMethod = "addMemExtendOperands"; + let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width; +} + +def MemWExtend8Operand : MemExtendOperand<"W", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemWExtend16Operand : MemExtendOperand<"W", 16>; +def MemWExtend32Operand : MemExtendOperand<"W", 32>; +def MemWExtend64Operand : MemExtendOperand<"W", 64>; +def MemWExtend128Operand : MemExtendOperand<"W", 128>; + +def MemXExtend8Operand : MemExtendOperand<"X", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. 
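+  // e.g. "ldrb w0, [x0, x1]" and "ldrb w0, [x0, x1, lsl #0]" address the same
+  // byte but select different operand variants (the S bit in the encoding),
+  // which is why this class needs its own render method.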
+ let RenderMethod = "addMemExtend8Operands"; +} +def MemXExtend16Operand : MemExtendOperand<"X", 16>; +def MemXExtend32Operand : MemExtendOperand<"X", 32>; +def MemXExtend64Operand : MemExtendOperand<"X", 64>; +def MemXExtend128Operand : MemExtendOperand<"X", 128>; + +class ro_extend<AsmOperandClass ParserClass, string Reg, int Width> + : Operand<i32> { + let ParserMatchClass = ParserClass; + let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">"; + let DecoderMethod = "DecodeMemExtend"; + let EncoderMethod = "getMemExtendOpValue"; + let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift); +} + +def ro_Wextend8 : ro_extend<MemWExtend8Operand, "w", 8>; +def ro_Wextend16 : ro_extend<MemWExtend16Operand, "w", 16>; +def ro_Wextend32 : ro_extend<MemWExtend32Operand, "w", 32>; +def ro_Wextend64 : ro_extend<MemWExtend64Operand, "w", 64>; +def ro_Wextend128 : ro_extend<MemWExtend128Operand, "w", 128>; + +def ro_Xextend8 : ro_extend<MemXExtend8Operand, "x", 8>; +def ro_Xextend16 : ro_extend<MemXExtend16Operand, "x", 16>; +def ro_Xextend32 : ro_extend<MemXExtend32Operand, "x", 32>; +def ro_Xextend64 : ro_extend<MemXExtend64Operand, "x", 64>; +def ro_Xextend128 : ro_extend<MemXExtend128Operand, "x", 128>; + +class ROAddrMode<ComplexPattern windex, ComplexPattern xindex, + Operand wextend, Operand xextend> { + // CodeGen-level pattern covering the entire addressing mode. + ComplexPattern Wpat = windex; + ComplexPattern Xpat = xindex; + + // Asm-level Operand covering the valid "uxtw #3" style syntax. + Operand Wext = wextend; + Operand Xext = xextend; +} + +def ro8 : ROAddrMode<ro_Windexed8, ro_Xindexed8, ro_Wextend8, ro_Xextend8>; +def ro16 : ROAddrMode<ro_Windexed16, ro_Xindexed16, ro_Wextend16, ro_Xextend16>; +def ro32 : ROAddrMode<ro_Windexed32, ro_Xindexed32, ro_Wextend32, ro_Xextend32>; +def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>; +def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128, + ro_Xextend128>; + +class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list<dag> pat> + : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class ROInstAlias<string asm, RegisterClass regtype, Instruction INST> + : InstAlias<asm # "\t$Rt, [$Rn, $Rm]", + (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; + +multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO<sz, V, opc, regtype, asm, + (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend8:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO<sz, V, opc, regtype, asm, + (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend8:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list<dag> pat> + : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list<dag> pat> + : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list<dag> pat> + : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list<dag> pat> + : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend128:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt), + (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend), + [(set (Ty regtype:$Rt), + (loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend128:$extend)))]>, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend128:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend), + [(storeop (Ty regtype:$Rt), + (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend128:$extend))]>, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BasePrefetchRO<bits<2> sz, bit V, bits<2> opc, dag outs, dag ins, + string asm, list<dag> pat> + : I<outs, ins, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> { + def roW : BasePrefetchRO<sz, V, opc, (outs), + (ins prfop:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend), + asm, [(AArch64Prefetch imm:$Rt, + (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend))]> { + let Inst{13} = 0b0; + } + + def roX : BasePrefetchRO<sz, V, opc, (outs), + (ins prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend), + asm, [(AArch64Prefetch imm:$Rt, + (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend))]> { + let Inst{13} = 0b1; + } + + def : InstAlias<"prfm $Rt, [$Rn, $Rm]", + (!cast<Instruction>(NAME # "roX") prfop:$Rt, + GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; +} + +//--- +// Load/store unscaled immediate +//--- + +def am_unscaled8 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>; +def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>; +def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>; +def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>; +def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>; + +class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list<dag> pattern> + : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list<dag> pattern> { + let AddedComplexity = 1 in // try this before LoadUI + def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt), + (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>, + Sched<[WriteLD]>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list<dag> pattern> { + let AddedComplexity = 1 in // try this before StoreUI + def i : BaseLoadStoreUnscale<sz, V, opc, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset), + asm, pattern>, + Sched<[WriteST]>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm, + list<dag> pat> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in + def i : BaseLoadStoreUnscale<sz, V, opc, (outs), + (ins prfop:$Rt, GPR64sp:$Rn, simm9:$offset), + asm, pat>, + Sched<[WriteLD]>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store unscaled immediate, unprivileged +//--- + +class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc, + dag oops, dag iops, string asm> + : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", []> { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnprivileged<bits<2> sz, bit 
V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs regtype:$Rt), + (ins GPR64sp:$Rn, simm9:$offset), asm>, + Sched<[WriteLD]>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs), + (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset), + asm>, + Sched<[WriteST]>; + + def : InstAlias<asm # "\t$Rt, [$Rn]", + (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store pre-indexed +//--- + +class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list<dag> pat> + : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]!", cstr, pat> { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePreIdx<sz, V, opc, + (outs GPR64sp:$wback, regtype:$Rt), + (ins GPR64sp:$Rn, simm9:$offset), asm, + "$Rn = $wback,@earlyclobber $wback", []>, + Sched<[WriteLD, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePreIdx<sz, V, opc, + (outs GPR64sp:$wback), + (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset), + asm, "$Rn = $wback,@earlyclobber $wback", + [(set GPR64sp:$wback, + (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>, + Sched<[WriteAdr, WriteST]>; +} // hasSideEffects = 0 + +//--- +// Load/store post-indexed +//--- + +class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list<dag> pat> + : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePostIdx<sz, V, opc, + (outs GPR64sp:$wback, regtype:$Rt), + (ins GPR64sp:$Rn, simm9:$offset), + asm, "$Rn = $wback,@earlyclobber $wback", []>, + Sched<[WriteLD, WriteI]>; + +let mayStore = 1, mayLoad = 0 in +class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePostIdx<sz, V, opc, + (outs GPR64sp:$wback), + (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset), + asm, "$Rn = $wback,@earlyclobber $wback", + [(set GPR64sp:$wback, + (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>, + Sched<[WriteAdr, WriteST, ReadAdrBase]>; +} // hasSideEffects = 0 + + +//--- +// Load/store pair +//--- + +// 
(indexed, offset) + +class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops, + string asm> + : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b010; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairOffset<opc, V, 1, + (outs regtype:$Rt, regtype:$Rt2), + (ins GPR64sp:$Rn, indextype:$offset), asm>, + Sched<[WriteLD, WriteLDHi]>; + + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", + (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + + +multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in + def i : BaseLoadStorePairOffset<opc, V, 0, (outs), + (ins regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, indextype:$offset), + asm>, + Sched<[WriteSTP]>; + + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", + (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +// (pre-indexed) +class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops, + string asm> + : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b011; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx<opc, V, 1, + (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2), + (ins GPR64sp:$Rn, indextype:$offset), asm>, + Sched<[WriteLD, WriteLDHi, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback), + (ins regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, indextype:$offset), + asm>, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (post-indexed) + +class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops, + string asm> + : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b001; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx<opc, V, 1, + (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2), + (ins GPR64sp:$Rn, idxtype:$offset), asm>, + Sched<[WriteLD, WriteLDHi, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class 
StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback), + (ins regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, idxtype:$offset), + asm>, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (no-allocate) + +class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops, + string asm> + : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b000; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairNoAlloc<opc, V, 1, + (outs regtype:$Rt, regtype:$Rt2), + (ins GPR64sp:$Rn, indextype:$offset), asm>, + Sched<[WriteLD, WriteLDHi]>; + + + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", + (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in + def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs), + (ins regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, indextype:$offset), + asm>, + Sched<[WriteSTP]>; + + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", + (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store exclusive +//--- + +// True exclusive operations write to and/or read from the system's exclusive +// monitors, which as far as a compiler is concerned can be modelled as a +// random shared memory address. Hence LoadExclusive mayStore. +// +// Since these instructions have the undefined register bits set to 1 in +// their canonical form, we need a post encoder method to set those bits +// to 1 when encoding these instructions. We do this using the +// fixLoadStoreExclusive function. This function has template parameters: +// +// fixLoadStoreExclusive<int hasRs, int hasRt2> +// +// hasRs indicates that the instruction uses the Rs field, so we won't set +// it to 1 (and the same for Rt2). We don't need template parameters for +// the other register fields since Rt and Rn are always used. +// +let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in +class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : I<oops, iops, asm, operands, "", []> { + let Inst{31-30} = sz; + let Inst{29-24} = 0b001000; + let Inst{23} = o2; + let Inst{22} = L; + let Inst{21} = o1; + let Inst{15} = o0; + + let DecoderMethod = "DecodeExclusiveLdStInstruction"; +} + +// Neither Rs nor Rt2 operands. 
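// (The simple form below is what the plain acquire/release and exclusive
// loads and stores -- the LDAR*/LDXR*/STLR*-style instructions -- are
// expected to use; the pair and store-exclusive classes further down add
// the Rt2 and Ws status operands back explicitly.)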
+class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> { + bits<5> Rt; + bits<5> Rn; + let Inst{20-16} = 0b11111; + let Unpredictable{20-16} = 0b11111; + let Inst{14-10} = 0b11111; + let Unpredictable{14-10} = 0b11111; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; +} + +// Simple load acquires don't set the exclusive monitor +let mayLoad = 1, mayStore = 0 in +class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt), + (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">, + Sched<[WriteLD]>; + +class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt), + (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">, + Sched<[WriteLD]>; + +class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive<sz, o2, L, o1, o0, + (outs regtype:$Rt, regtype:$Rt2), + (ins GPR64sp0:$Rn), asm, + "\t$Rt, $Rt2, [$Rn]">, + Sched<[WriteLD, WriteLDHi]> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; +} + +// Simple store release operations do not check the exclusive monitor. +let mayLoad = 0, mayStore = 1 in +class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs), + (ins regtype:$Rt, GPR64sp0:$Rn), + asm, "\t$Rt, [$Rn]">, + Sched<[WriteST]>; + +let mayLoad = 1, mayStore = 1 in +class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws), + (ins regtype:$Rt, GPR64sp0:$Rn), + asm, "\t$Ws, $Rt, [$Rn]">, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; + let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; +} + +class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive<sz, o2, L, o1, o0, + (outs GPR32:$Ws), + (ins regtype:$Rt, regtype:$Rt2, GPR64sp0:$Rn), + asm, "\t$Ws, $Rt, $Rt2, [$Rn]">, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; +} + +//--- +// Exception generation +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm> + : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, + Sched<[WriteSys]> { + bits<16> imm; + let Inst{31-24} = 0b11010100; + let Inst{23-21} = op1; + let Inst{20-5} = imm; + let Inst{4-2} = 0b000; + let Inst{1-0} = ll; +} + +let Predicates = [HasFPARMv8] in { + +//--- +// Floating point to integer conversion +//--- + +class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm, list<dag> pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; 
+ bits<5> Rn; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + Operand immType, string asm, list<dag> pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Unscaled half-precision to 32-bit + def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Predicates = [HasFullFP16]; + } + + // Unscaled half-precision to 64-bit + def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + + // Unscaled single-precision to 32-bit + def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled single-precision to 64-bit + def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Unscaled double-precision to 32-bit + def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, + [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled double-precision to 64-bit + def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, + [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Scaled half-precision to 32-bit + def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, + fixedpoint_f16_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + + // Scaled half-precision to 64-bit + def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, + fixedpoint_f16_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + + // Scaled single-precision to 32-bit + def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, + fixedpoint_f32_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled single-precision to 64-bit + def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, + fixedpoint_f32_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Scaled double-precision to 32-bit + def SWDri : BaseFPToInteger<0b01, 
rmode, opcode, FPR64, GPR32, + fixedpoint_f64_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled double-precision to 64-bit + def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, + fixedpoint_f64_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +//--- +// Integer to floating point conversion +//--- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseIntegerToFP<bit isUnsigned, + RegisterClass srcType, RegisterClass dstType, + Operand immType, string asm, list<dag> pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-24} = 0b0011110; + let Inst{21-17} = 0b00001; + let Inst{16} = isUnsigned; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseIntegerToFPUnscaled<bit isUnsigned, + RegisterClass srcType, RegisterClass dstType, + ValueType dvt, string asm, SDNode node> + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-24} = 0b0011110; + let Inst{21-17} = 0b10001; + let Inst{16} = isUnsigned; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { + // Unscaled + def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag + } + + def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag + } + + def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + // Scaled + def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm, + [(set FPR16:$Rd, + (fdiv (node GPR32:$Rn), + fixedpoint_f16_i32:$scale))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + + def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm, + [(set FPR32:$Rd, + (fdiv (node GPR32:$Rn), + fixedpoint_f32_i32:$scale))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag + let scale{5} = 1; + } + + def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint_f64_i32, asm, + [(set FPR64:$Rd, + (fdiv (node GPR32:$Rn), + fixedpoint_f64_i32:$scale))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + let scale{5} = 1; + } 
+ + def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm, + [(set FPR16:$Rd, + (fdiv (node GPR64:$Rn), + fixedpoint_f16_i64:$scale))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm, + [(set FPR32:$Rd, + (fdiv (node GPR64:$Rn), + fixedpoint_f32_i64:$scale))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag + } + + def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm, + [(set FPR64:$Rd, + (fdiv (node GPR64:$Rn), + fixedpoint_f64_i64:$scale))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } +} + +//--- +// Unscaled integer <-> floating point conversion (i.e. FMOV) +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", + // We use COPY_TO_REGCLASS for these bitconvert operations. + // copyPhysReg() expands the resultant COPY instructions after + // regalloc is done. This gives greater freedom for the allocator + // and related passes (coalescing, copy propagation, et. al.) to + // be more effective. + [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-24} = 0b0011110; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterOperand dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode, + RegisterOperand srcType, RegisterClass dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + + +multiclass UnscaledConversion<string asm> { + def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag + } + + def XDr : 
BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag + } + + def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } + + def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } +} + +//--- +// Floating point conversion +//--- + +class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType, + RegisterClass srcType, string asm, list<dag> pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-24} = 0b00011110; + let Inst{23-22} = type; + let Inst{21-17} = 0b10001; + let Inst{16-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPConversion<string asm> { + // Double-precision to Half-precision + def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, + [(set FPR16:$Rd, (fround FPR64:$Rn))]>; + + // Double-precision to Single-precision + def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, + [(set FPR32:$Rd, (fround FPR64:$Rn))]>; + + // Half-precision to Double-precision + def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, + [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; + + // Half-precision to Single-precision + def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, + [(set FPR32:$Rd, (fextend FPR16:$Rn))]>; + + // Single-precision to Double-precision + def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, + [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; + + // Single-precision to Half-precision + def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, + [(set FPR16:$Rd, (fround FPR32:$Rn))]>; +} + +//--- +// Single operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, + ValueType vt, string asm, SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-24} = 0b00011110; + let Inst{21-19} = 0b100; + let Inst{18-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SingleOperandFPData<bits<4> opcode, string asm, + SDPatternOperator node = null_frag> { + def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + + def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> { + let Inst{23-22} = 0b00; // 32-bit size flag + } + + def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> { + 
let Inst{23-22} = 0b01; // 64-bit size flag + } +} + +//--- +// Two operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, + string asm, list<dag> pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pat>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-24} = 0b00011110; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass TwoOperandFPData<bits<4> opcode, string asm, + SDPatternOperator node = null_frag> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set (f16 FPR16:$Rd), + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, + [(set (f32 FPR32:$Rd), + (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> { + let Inst{23-22} = 0b00; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, + [(set (f64 FPR64:$Rd), + (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> { + let Inst{23-22} = 0b01; // 64-bit size flag + } +} + +multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, + [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> { + let Inst{23-22} = 0b00; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, + [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> { + let Inst{23-22} = 0b01; // 64-bit size flag + } +} + + +//--- +// Three operand floating point data processing +//--- + +class BaseThreeOperandFPData<bit isNegated, bit isSub, + RegisterClass regtype, string asm, list<dag> pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, + Sched<[WriteFMul]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{31-24} = 0b00011111; + let Inst{21} = isNegated; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm, + SDPatternOperator node> { + def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm, + [(set FPR16:$Rd, + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + + def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm, + [(set FPR32:$Rd, + (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> { + let Inst{23-22} = 0b00; // 32-bit size flag + } + + def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm, + [(set FPR64:$Rd, + (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> { + let Inst{23-22} = 0b01; // 64-bit size flag + } +} + +//--- +// Floating point data comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandFPComparison<bit signalAllNans, + RegisterClass regtype, string asm, + list<dag> pat> + : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rn; + let Inst{31-24} = 0b00011110; + let Inst{21} = 1; + + let 
Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b1000; + + // Rm should be 0b00000 canonically, but we need to accept any value. + let PostEncoderMethod = "fixOneOperandFPComparison"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, + string asm, list<dag> pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rm; + bits<5> Rn; + let Inst{31-24} = 0b00011110; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b0000; +} + +multiclass FPComparison<bit signalAllNans, string asm, + SDPatternOperator OpNode = null_frag> { + let Defs = [NZCV] in { + def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm, + [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> { + let Inst{23-22} = 0b00; + } + + def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm, + [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> { + let Inst{23-22} = 0b00; + } + + def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm, + [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> { + let Inst{23-22} = 0b01; + } + + def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm, + [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> { + let Inst{23-22} = 0b01; + } + } // Defs = [NZCV] +} + +//--- +// Floating point conditional comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype, + string mnemonic, list<dag> pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>, + Sched<[WriteFCmp]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{31-24} = 0b00011110; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = nzcv; +} + +multiclass FPCondComparison<bit signalAllNans, string mnemonic, + SDPatternOperator OpNode = null_frag> { + def Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic, + [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b00; + } + + def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic, + [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b01; + } +} + +//--- +// Floating point conditional select +//--- + +class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel (vt regtype:$Rn), regtype:$Rm, + (i32 imm:$cond), NZCV))]>, + Sched<[WriteF]> { + 
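  // Per the AArch64csel pattern above: $Rd receives $Rn when $cond holds
  // under the current NZCV flags, and $Rm otherwise.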
bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{31-24} = 0b00011110; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPCondSelect<string asm> { + let Uses = [NZCV] in { + def Hrrr : BaseFPCondSelect<FPR16, f16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Srrr : BaseFPCondSelect<FPR32, f32, asm> { + let Inst{23-22} = 0b00; + } + + def Drrr : BaseFPCondSelect<FPR64, f64, asm> { + let Inst{23-22} = 0b01; + } + } // Uses = [NZCV] +} + +//--- +// Floating move immediate +//--- + +class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> + : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", + [(set regtype:$Rd, fpimmtype:$imm)]>, + Sched<[WriteFImm]> { + bits<5> Rd; + bits<8> imm; + let Inst{31-24} = 0b00011110; + let Inst{21} = 1; + let Inst{20-13} = imm; + let Inst{12-5} = 0b10000000; + let Inst{4-0} = Rd; +} + +multiclass FPMoveImmediate<string asm> { + def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> { + let Inst{23-22} = 0b00; + } + + def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> { + let Inst{23-22} = 0b01; + } +} +} // end of 'let Predicates = [HasFPARMv8]' + +//---------------------------------------------------------------------------- +// AdvSIMD +//---------------------------------------------------------------------------- + +let Predicates = [HasNEON] in { + +//---------------------------------------------------------------------------- +// AdvSIMD three register vector instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list<dag> pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-21} = size; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list<dag> pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-21} = size; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// All operand sizes distinguished in the encoding. 
+multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; +} + +// As above, but D sized elements unsupported. +multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, + asm, ".8b", + [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, + asm, ".16b", + [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, + asm, ".4h", + [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, + asm, ".8h", + [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, + asm, ".2s", + [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, + asm, ".4s", + [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; +} + +multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; +} + +// As above, but only B sized elements supported. 
+multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), + (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; +} + +// As above, but only floating point elements supported. +multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc, + string asm, SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, + asm, ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, + asm, ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, + asm, ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc, + string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc, + string asm, SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$dst), + (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$dst), + (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64, + asm, ".2s", + [(set (v2f32 V64:$dst), + (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128, + asm, ".4s", + [(set (v4f32 V128:$dst), + (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128, + asm, ".2d", + [(set (v2f64 V128:$dst), + (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + 
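+// For context, a minimal sketch (not part of this patch) of how the
+// three-same-vector multiclasses above get instantiated from the instruction
+// definition file; the defm name, U bit and opcode value shown here are
+// illustrative assumptions only, not taken from this diff:
+//
+//   defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+//
+// Each such defm expands into one instruction record per arrangement
+// (e.g. ADDv8i8, ADDv16i8, ..., ADDv2i64), every record inheriting the fixed
+// encoding bits from BaseSIMDThreeSameVector together with the per-type
+// selection pattern supplied in the multiclass body.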
+// As above, but D and B sized elements unsupported. +multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; +} + +// Logical three vector ops share opcode bits, and only use B sized elements. +multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; + + def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), + (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), + (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), + (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), + (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), + (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), + (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>; +} + +multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, + string asm, SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, + asm, ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128, + asm, ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]>; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast<Instruction>(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast<Instruction>(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast<Instruction>(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast<Instruction>(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast<Instruction>(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast<Instruction>(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; +} + + +//---------------------------------------------------------------------------- +// AdvSIMD two register vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, + bits<2> size2, RegisterOperand regtype, string asm, + string dstkind, string srckind, list<dag> pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, + bits<2> size2, RegisterOperand regtype, + string asm, string dstkind, string srckind, + list<dag> pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Supports B, H, and S element sizes. +multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size, + RegisterOperand regtype, string asm, string dstkind, + string srckind, string amount> + : I<(outs V128:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # + "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-24} = 0b101110; + let Inst{23-22} = size; + let Inst{21-10} = 0b100001001110; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDVectorLShiftLongBySizeBHS { + let hasSideEffects = 0 in { + def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, + "shll", ".8h", ".8b", "8">; + def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, + "shll2", ".8h", ".16b", "8">; + def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, + "shll", ".4s", ".4h", "16">; + def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, + "shll2", ".4s", ".8h", "16">; + def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, 
V64, + "shll", ".2d", ".2s", "32">; + def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, + "shll2", ".2d", ".4s", "32">; + } +} + +// Supports all element sizes. +multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), + (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), + (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), + (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), + (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), + (v4i32 V128:$Rn)))]>; +} + +// Supports all element sizes, except 1xD. 
+multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +// Supports only B element sizes. +multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + +} + +// Supports only B and H element sizes. +multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; +} + +// Supports only S and D element sizes, uses high bit of the size field +// as an extra opcode bit. 
+multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +// Supports only S element size. +multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + + +multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list<dag> pattern> + : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let 
Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list<dag> pattern> + : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, + asm, ".8b", ".8h", + [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, + asm#"2", ".16b", ".8h", []>; + def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, + asm, ".4h", ".4s", + [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, + asm, ".2s", ".2d", + [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; + def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), + (!cast<Instruction>(NAME # "v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), + (!cast<Instruction>(NAME # "v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), + (!cast<Instruction>(NAME # "v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2, + bits<5> opcode, RegisterOperand regtype, string asm, + string kind, string zero, ValueType dty, + ValueType sty, SDNode OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # + "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", + [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Comparisons support all element sizes, except 1xD. 
+multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm, + SDNode OpNode> { + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, + asm, ".8b", "0", + v8i8, v8i8, OpNode>; + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128, + asm, ".16b", "0", + v16i8, v16i8, OpNode>; + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64, + asm, ".4h", "0", + v4i16, v4i16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128, + asm, ".8h", "0", + v8i16, v8i16, OpNode>; + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64, + asm, ".2s", "0", + v2i32, v2i32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128, + asm, ".4s", "0", + v4i32, v4i32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128, + asm, ".2d", "0", + v2i64, v2i64, OpNode>; +} + +// FP Comparisons support only S and D element sizes (and H for v8.2a). +multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc, + string asm, SDNode OpNode> { + + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, + asm, ".4h", "0.0", + v4i16, v4f16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128, + asm, ".8h", "0.0", + v8i16, v8f16, OpNode>; + } // Predicates = [HasNEON, HasFullFP16] + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64, + asm, ".2s", "0.0", + v2i32, v2f32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128, + asm, ".4s", "0.0", + v4i32, v4f32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, + asm, ".2d", "0.0", + v2i64, v2f64, OpNode>; + + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0", + (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0", + (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0", + (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # ".4h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # ".8h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # ".2s\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # ".4s\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias<asm # ".2d\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list<dag> pattern> + : I<(outs outtype:$Rd), (ins intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, 
bits<2> size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list<dag> pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, + asm, ".4s", ".4h", []>; + def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, + asm#"2", ".4s", ".8h", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, + asm, ".2d", ".2s", []>; + def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, + asm#"2", ".2d", ".4s", []>; +} + +multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, + asm, ".4h", ".4s", []>; + def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", []>; + def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; +} + +multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm, + Intrinsic OpNode> { + def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", + [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; + def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), + (!cast<Instruction>(NAME # "v4f32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register different-size vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list<dag> pattern> + : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list<dag> pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// FIXME: TableGen doesn't know how to deal with expanded types that also +// change the element count (in this case, placing the results in +// the high elements of the result register rather than the low +// elements). Until that's fixed, we can't code-gen those. +multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm, + Intrinsic IntOp> { + def v8i16_v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc, + V64, V128, V128, + asm, ".8b", ".8h", ".8h", + [(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc, + V128, V128, V128, + asm#"2", ".16b", ".8h", ".8h", + []>; + def v4i32_v4i16 : BaseSIMDDifferentThreeVector<U, 0b010, opc, + V64, V128, V128, + asm, ".4h", ".4s", ".4s", + [(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; + def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc, + V128, V128, V128, + asm#"2", ".8h", ".4s", ".4s", + []>; + def v2i64_v2i32 : BaseSIMDDifferentThreeVector<U, 0b100, opc, + V64, V128, V128, + asm, ".2s", ".2d", ".2d", + [(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; + def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc, + V128, V128, V128, + asm#"2", ".4s", ".2d", ".2d", + []>; + + + // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in + // a version attached to an instruction. 
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), + (v8i16 V128:$Rm))), + (!cast<Instruction>(NAME # "v8i16_v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), + (v4i32 V128:$Rm))), + (!cast<Instruction>(NAME # "v4i32_v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), + (v2i64 V128:$Rm))), + (!cast<Instruction>(NAME # "v2i64_v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm, + Intrinsic IntOp> { + def v8i8 : BaseSIMDDifferentThreeVector<U, 0b000, opc, + V128, V64, V64, + asm, ".8h", ".8b", ".8b", + [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDDifferentThreeVector<U, 0b001, opc, + V128, V128, V128, + asm#"2", ".8h", ".16b", ".16b", []>; + let Predicates = [HasCrypto] in { + def v1i64 : BaseSIMDDifferentThreeVector<U, 0b110, opc, + V128, V64, V64, + asm, ".1q", ".1d", ".1d", []>; + def v2i64 : BaseSIMDDifferentThreeVector<U, 0b111, opc, + V128, V128, V128, + asm#"2", ".1q", ".2d", ".2d", []>; + } + + def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), + (v8i8 (extract_high_v16i8 V128:$Rm)))), + (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, + V128, V64, V64, + asm, ".4s", ".4h", ".4h", + [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc, + V128, V128, V128, + asm#"2", ".4s", ".8h", ".8h", + [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 V128:$Rm)))]>; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, + V128, V64, V64, + asm, ".2d", ".2s", ".2s", + [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc, + V128, V128, V128, + asm#"2", ".2d", ".4s", ".4s", + [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 V128:$Rm)))]>; +} + +multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, + V128, V64, V64, + asm, ".8h", ".8b", ".8b", + [(set (v8i16 V128:$Rd), + (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc, + V128, V128, V128, + asm#"2", ".8h", ".16b", ".16b", + [(set (v8i16 V128:$Rd), + (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn), + (extract_high_v16i8 V128:$Rm)))))]>; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, + V128, V64, V64, + asm, ".4s", ".4h", ".4h", + [(set (v4i32 V128:$Rd), + (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc, + V128, V128, V128, + asm#"2", ".4s", ".8h", ".8h", + [(set (v4i32 V128:$Rd), + (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 V128:$Rm)))))]>; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, + V128, V64, V64, + asm, ".2d", ".2s", ".2s", + [(set (v2i64 V128:$Rd), + (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>; + def v4i32_v2i64 : 
BaseSIMDDifferentThreeVector<U, 0b101, opc, + V128, V128, V128, + asm#"2", ".2d", ".4s", ".4s", + [(set (v2i64 V128:$Rd), + (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 V128:$Rm)))))]>; +} + +multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc, + V128, V64, V64, + asm, ".8h", ".8b", ".8b", + [(set (v8i16 V128:$dst), + (add (v8i16 V128:$Rd), + (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc, + V128, V128, V128, + asm#"2", ".8h", ".16b", ".16b", + [(set (v8i16 V128:$dst), + (add (v8i16 V128:$Rd), + (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn), + (extract_high_v16i8 V128:$Rm))))))]>; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc, + V128, V64, V64, + asm, ".4s", ".4h", ".4h", + [(set (v4i32 V128:$dst), + (add (v4i32 V128:$Rd), + (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc, + V128, V128, V128, + asm#"2", ".4s", ".8h", ".8h", + [(set (v4i32 V128:$dst), + (add (v4i32 V128:$Rd), + (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 V128:$Rm))))))]>; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc, + V128, V64, V64, + asm, ".2d", ".2s", ".2s", + [(set (v2i64 V128:$dst), + (add (v2i64 V128:$Rd), + (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc, + V128, V128, V128, + asm#"2", ".2d", ".4s", ".4s", + [(set (v2i64 V128:$dst), + (add (v2i64 V128:$Rd), + (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 V128:$Rm))))))]>; +} + +multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, + V128, V64, V64, + asm, ".8h", ".8b", ".8b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc, + V128, V128, V128, + asm#"2", ".8h", ".16b", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn), + (extract_high_v16i8 V128:$Rm)))]>; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, + V128, V64, V64, + asm, ".4s", ".4h", ".4h", + [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc, + V128, V128, V128, + asm#"2", ".4s", ".8h", ".8h", + [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 V128:$Rm)))]>; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, + V128, V64, V64, + asm, ".2d", ".2s", ".2s", + [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc, + V128, V128, V128, + asm#"2", ".2d", ".4s", ".4s", + [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 V128:$Rm)))]>; +} + +multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc, + V128, V64, V64, + asm, ".8h", ".8b", ".8b", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc, + V128, V128, V128, + asm#"2", ".8h", ".16b", 
".16b", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), + (extract_high_v16i8 V128:$Rn), + (extract_high_v16i8 V128:$Rm)))]>; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc, + V128, V64, V64, + asm, ".4s", ".4h", ".4h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc, + V128, V128, V128, + asm#"2", ".4s", ".8h", ".8h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 V128:$Rm)))]>; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc, + V128, V64, V64, + asm, ".2d", ".2s", ".2s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc, + V128, V128, V128, + asm#"2", ".2d", ".4s", ".4s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 V128:$Rm)))]>; +} + +multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm, + SDPatternOperator Accum> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc, + V128, V64, V64, + asm, ".4s", ".4h", ".4h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), + (v4i16 V64:$Rm)))))]>; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc, + V128, V128, V128, + asm#"2", ".4s", ".8h", ".8h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 V128:$Rm)))))]>; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc, + V128, V64, V64, + asm, ".2d", ".2s", ".2s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn), + (v2i32 V64:$Rm)))))]>; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc, + V128, V128, V128, + asm#"2", ".2d", ".4s", ".4s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 V128:$Rm)))))]>; +} + +multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b000, opc, + V128, V128, V64, + asm, ".8h", ".8h", ".8b", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector<U, 0b001, opc, + V128, V128, V128, + asm#"2", ".8h", ".8h", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (extract_high_v16i8 V128:$Rm)))]>; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b010, opc, + V128, V128, V64, + asm, ".4s", ".4s", ".4h", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector<U, 0b011, opc, + V128, V128, V128, + asm#"2", ".4s", ".4s", ".8h", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (extract_high_v8i16 V128:$Rm)))]>; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b100, opc, + V128, V128, V64, + asm, ".2d", ".2d", ".2s", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector<U, 0b101, opc, + V128, V128, V128, + asm#"2", ".2d", ".2d", ".4s", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (extract_high_v4i32 V128:$Rm)))]>; +} + +//---------------------------------------------------------------------------- +// 
AdvSIMD bitwise extract from vector +//---------------------------------------------------------------------------- + +class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty, + string asm, string kind> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # + "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", + [(set (vty regtype:$Rd), + (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> imm; + let Inst{31} = 0; + let Inst{30} = size; + let Inst{29-21} = 0b101110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-11} = imm; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDBitwiseExtract<string asm> { + def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { + let imm{3} = 0; + } + def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; +} + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype, + string asm, string kind, SDNode OpNode, ValueType valty> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "", + [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29-24} = 0b001110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDZipVector<bits<3>opc, string asm, + SDNode OpNode> { + def v8i8 : BaseSIMDZipVector<0b000, opc, V64, + asm, ".8b", OpNode, v8i8>; + def v16i8 : BaseSIMDZipVector<0b001, opc, V128, + asm, ".16b", OpNode, v16i8>; + def v4i16 : BaseSIMDZipVector<0b010, opc, V64, + asm, ".4h", OpNode, v4i16>; + def v8i16 : BaseSIMDZipVector<0b011, opc, V128, + asm, ".8h", OpNode, v8i16>; + def v2i32 : BaseSIMDZipVector<0b100, opc, V64, + asm, ".2s", OpNode, v2i32>; + def v4i32 : BaseSIMDZipVector<0b101, opc, V128, + asm, ".4s", OpNode, v4i32>; + def v2i64 : BaseSIMDZipVector<0b111, opc, V128, + asm, ".2d", OpNode, v2i64>; + + def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)), + (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>; + def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)), + (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>; + def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), + (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>; + def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), + (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>; + def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), + (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register scalar instructions +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode, + RegisterClass regtype, string asm, + list<dag> pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + 
bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-21} = size; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode, + dag oops, dag iops, string asm, + list<dag> pattern> + : I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = R; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, + [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; +} + +multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, + [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; + def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; + def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), + (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, + [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; +} + +multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst), + (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm), + asm, []>; + def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst), + (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm), + asm, []>; +} + +multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, + [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, + [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>; + } // Predicates = [HasNEON, HasFullFP16] + } + + def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, + [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, + [(set 
(i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + []>; + } // Predicates = [HasNEON, HasFullFP16] + } + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode, + dag oops, dag iops, string asm, string cstr, list<dag> pat> + : I<oops, iops, asm, + "\t$Rd, $Rn, $Rm", cstr, pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc, + (outs FPR32:$Rd), + (ins FPR16:$Rn, FPR16:$Rm), asm, "", []>; + def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc, + (outs FPR64:$Rd), + (ins FPR32:$Rn, FPR32:$Rm), asm, "", + [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed<U, 0b01, opc, + (outs FPR32:$dst), + (ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm), + asm, "$Rd = $dst", []>; + def i32 : BaseSIMDThreeScalarMixed<U, 0b10, opc, + (outs FPR64:$dst), + (ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm), + asm, "$Rd = $dst", + [(set (i64 FPR64:$dst), + (OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD two register scalar instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, + RegisterClass regtype, RegisterClass regtype2, + string asm, list<dag> pat> + : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, + "\t$Rd, $Rn", "", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode, + RegisterClass regtype, RegisterClass regtype2, + string asm, list<dag> pat> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, + "\t$Rd, $Rn", "$Rd = $dst", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, + RegisterClass regtype, string asm, string zero> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "\t$Rd, $Rn, #" # zero, "", []>, + Sched<[WriteV]> { + 
bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm> + : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", + [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-17} = 0b011111100110000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">; + + def : Pat<(v1i64 (OpNode FPR64:$Rn)), + (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">; + def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">; + } + + def : InstAlias<asm # "\t$Rd, $Rn, #0", + (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; + def : InstAlias<asm # "\t$Rd, $Rn, #0", + (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Rd, $Rn, #0", + (!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>; + } + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), + (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, + [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), + (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>; +} + +multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> { + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>; + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>; + } +} + +multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm, + [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>; + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm, + [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm, + [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>; + } +} + +multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, + [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>; + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm, + [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, 
opc, FPR8 , FPR8 , asm, []>; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), + (!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm, + Intrinsic OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm, + [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>; + def v1i32 : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm, + [(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>; + def v1i16 : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>; + def v1i8 : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), + (!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; +} + + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm, + [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode, + RegisterOperand regtype, RegisterOperand vectype, + string asm, string kind> + : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> { + def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128, + asm, ".2d">; +} + +multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, + asm, ".2h">; + } + def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64, + asm, ".2s">; + def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128, + asm, ".2d">; +} + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode, + RegisterClass regtype, RegisterOperand vectype, + string asm, string kind, list<dag> pattern> + : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode, + string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, 
U, 0b00, opcode, FPR8, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, + asm, ".4s", []>; +} + +multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm, + Intrinsic intOp> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, + asm, ".4h", + [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, + asm, ".8h", + [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, + asm, ".4s", + [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +// FIXME: There has got to be a better way to factor these. ugh. + +class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm, + string operands, string constraints, list<dag> pattern> + : I<outs, ins, asm, operands, constraints, pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{15} = 0; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype, + RegisterOperand vecreg, RegisterClass regtype> + : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup", + "{\t$Rd" # size # ", $Rn" # + "|" # size # "\t$Rd, $Rn}", "", + [(set (vectype vecreg:$Rd), (AArch64dup regtype:$Rn))]> { + let Inst{20-16} = imm5; + let Inst{14-11} = 0b0001; +} + +class SIMDDupFromElement<bit Q, string dstkind, string srckind, + ValueType vectype, ValueType insreg, + RegisterOperand vecreg, Operand idxtype, + ValueType elttype, SDNode OpNode> + : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup", + "{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" # + "|" # dstkind # "\t$Rd, $Rn$idx}", "", + [(set (vectype vecreg:$Rd), + (OpNode (insreg V128:$Rn), idxtype:$idx))]> { + let Inst{14-11} = 0b0000; +} + +class SIMDDup64FromElement + : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, + VectorIndexD, i64, AArch64duplane64> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; +} + +class SIMDDup32FromElement<bit Q, string size, ValueType vectype, + RegisterOperand vecreg> + : SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg, + VectorIndexS, i64, AArch64duplane32> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; +} + +class SIMDDup16FromElement<bit Q, string size, ValueType vectype, + 
RegisterOperand vecreg> + : SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg, + VectorIndexH, i64, AArch64duplane16> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; +} + +class SIMDDup8FromElement<bit Q, string size, ValueType vectype, + RegisterOperand vecreg> + : SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg, + VectorIndexB, i64, AArch64duplane8> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; +} + +class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype, + Operand idxtype, string asm, list<dag> pattern> + : BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm, + "{\t$Rd, $Rn" # size # "$idx" # + "|" # size # "\t$Rd, $Rn$idx}", "", pattern> { + let Inst{14-11} = imm4; +} + +class SIMDSMov<bit Q, string size, RegisterClass regtype, + Operand idxtype> + : BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>; +class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype, + Operand idxtype> + : BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov", + [(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>; + +class SIMDMovAlias<string asm, string size, Instruction inst, + RegisterClass regtype, Operand idxtype> + : InstAlias<asm#"{\t$dst, $src"#size#"$idx" # + "|" # size # "\t$dst, $src$idx}", + (inst regtype:$dst, V128:$src, idxtype:$idx)>; + +multiclass SMov { + def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } +} + +multiclass UMov { + def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + def : SIMDMovAlias<"mov", ".s", + !cast<Instruction>(NAME#"vi32"), + GPR32, VectorIndexS>; + def : SIMDMovAlias<"mov", ".d", + !cast<Instruction>(NAME#"vi64"), + GPR64, VectorIndexD>; +} + +class SIMDInsFromMain<string size, ValueType vectype, + RegisterClass regtype, Operand idxtype> + : BaseSIMDInsDup<1, 0, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", + "{\t$Rd" # size # "$idx, $Rn" # + "|" # size # "\t$Rd$idx, $Rn}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { + let Inst{14-11} = 0b0011; +} + +class SIMDInsFromElement<string size, ValueType vectype, + ValueType elttype, Operand idxtype> + : BaseSIMDInsDup<1, 1, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", + "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # + "|" # size # "\t$Rd$idx, $Rn$idx2}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert + (vectype V128:$Rd), + (elttype 
(vector_extract (vectype V128:$Rn), idxtype:$idx2)), + idxtype:$idx))]>; + +class SIMDInsMainMovAlias<string size, Instruction inst, + RegisterClass regtype, Operand idxtype> + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # + "|" # size #"\t$dst$idx, $src}", + (inst V128:$dst, idxtype:$idx, regtype:$src)>; +class SIMDInsElementMovAlias<string size, Instruction inst, + Operand idxtype> + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # + # "|" # size #"\t$dst$idx, $src$idx2}", + (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; + + +multiclass SIMDIns { + def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + + def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { + bits<4> idx; + bits<4> idx2; + let Inst{20-17} = idx; + let Inst{16} = 1; + let Inst{14-11} = idx2; + } + def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { + bits<3> idx; + bits<3> idx2; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + let Inst{14-12} = idx2; + let Inst{11} = {?}; + } + def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { + bits<2> idx; + bits<2> idx2; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + let Inst{14-13} = idx2; + let Inst{12-11} = {?,?}; + } + def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { + bits<1> idx; + bits<1> idx2; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + let Inst{14} = idx2; + let Inst{13-11} = {?,?,?}; + } + + // For all forms of the INS instruction, the "mov" mnemonic is the + // preferred alias. Why they didn't just call the instruction "mov" in + // the first place is a very good question indeed... 
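+ // For illustration (assembly examples, not from the original patch): with the
+ // aliases below, "ins v0.s[1], w1" can also be written "mov v0.s[1], w1", and
+ // the element-to-element form "ins v0.b[3], v1.b[7]" as "mov v0.b[3], v1.b[7]".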
+ def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"), + GPR32, VectorIndexB>; + def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"), + GPR32, VectorIndexH>; + def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"), + GPR32, VectorIndexS>; + def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"), + GPR64, VectorIndexD>; + + def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"), + VectorIndexB>; + def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"), + VectorIndexH>; + def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"), + VectorIndexS>; + def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"), + VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype, + RegisterOperand listtype, string asm, string kind> + : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, + "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, + Sched<[WriteV]> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-21} = 0b001110000; + let Inst{20-16} = Vm; + let Inst{15} = 0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + let Inst{9-5} = Vn; + let Inst{4-0} = Vd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype, + RegisterOperand listtype, string asm, string kind> + : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, + "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, + Sched<[WriteV]> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-21} = 0b001110000; + let Inst{20-16} = Vm; + let Inst{15} = 0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + let Inst{9-5} = Vn; + let Inst{4-0} = Vd; +} + +class SIMDTableLookupAlias<string asm, Instruction inst, + RegisterOperand vectype, RegisterOperand listtype> + : InstAlias<!strconcat(asm, "\t$dst, $lst, $index"), + (inst vectype:$dst, listtype:$lst, vectype:$index), 0>; + +multiclass SIMDTableLookup<bit op, string asm> { + def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, + asm, ".8b">; + def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, + asm, ".8b">; + def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, + asm, ".8b">; + def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, + asm, ".8b">; + def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, + asm, ".16b">; + def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, + asm, ".16b">; + def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, + asm, ".16b">; + def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, + asm, ".16b">; + + def : SIMDTableLookupAlias<asm # ".8b", + !cast<Instruction>(NAME#"v8i8One"), + V64, VecListOne128>; + def : SIMDTableLookupAlias<asm # ".8b", + !cast<Instruction>(NAME#"v8i8Two"), + V64, VecListTwo128>; + def : SIMDTableLookupAlias<asm # ".8b", + !cast<Instruction>(NAME#"v8i8Three"), + V64, VecListThree128>; + def : SIMDTableLookupAlias<asm # ".8b", + !cast<Instruction>(NAME#"v8i8Four"), + V64, VecListFour128>; + def : 
SIMDTableLookupAlias<asm # ".16b", + !cast<Instruction>(NAME#"v16i8One"), + V128, VecListOne128>; + def : SIMDTableLookupAlias<asm # ".16b", + !cast<Instruction>(NAME#"v16i8Two"), + V128, VecListTwo128>; + def : SIMDTableLookupAlias<asm # ".16b", + !cast<Instruction>(NAME#"v16i8Three"), + V128, VecListThree128>; + def : SIMDTableLookupAlias<asm # ".16b", + !cast<Instruction>(NAME#"v16i8Four"), + V128, VecListFour128>; +} + +multiclass SIMDTableLookupTied<bit op, string asm> { + def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, + asm, ".8b">; + def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b, + asm, ".8b">; + def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, + asm, ".8b">; + def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, + asm, ".8b">; + def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, + asm, ".16b">; + def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, + asm, ".16b">; + def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, + asm, ".16b">; + def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, + asm, ".16b">; + + def : SIMDTableLookupAlias<asm # ".8b", + !cast<Instruction>(NAME#"v8i8One"), + V64, VecListOne128>; + def : SIMDTableLookupAlias<asm # ".8b", + !cast<Instruction>(NAME#"v8i8Two"), + V64, VecListTwo128>; + def : SIMDTableLookupAlias<asm # ".8b", + !cast<Instruction>(NAME#"v8i8Three"), + V64, VecListThree128>; + def : SIMDTableLookupAlias<asm # ".8b", + !cast<Instruction>(NAME#"v8i8Four"), + V64, VecListFour128>; + def : SIMDTableLookupAlias<asm # ".16b", + !cast<Instruction>(NAME#"v16i8One"), + V128, VecListOne128>; + def : SIMDTableLookupAlias<asm # ".16b", + !cast<Instruction>(NAME#"v16i8Two"), + V128, VecListTwo128>; + def : SIMDTableLookupAlias<asm # ".16b", + !cast<Instruction>(NAME#"v16i8Three"), + V128, VecListThree128>; + def : SIMDTableLookupAlias<asm # ".16b", + !cast<Instruction>(NAME#"v16i8Four"), + V128, VecListFour128>; +} + + +//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY +//---------------------------------------------------------------------------- +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype, + string kind, Operand idxtype> + : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", + "{\t$dst, $src" # kind # "$idx" # + "|\t$dst, $src$idx}", "", []>, + Sched<[WriteV]> { + bits<5> dst; + bits<5> src; + let Inst{31-21} = 0b01011110000; + let Inst{15-10} = 0b000001; + let Inst{9-5} = src; + let Inst{4-0} = dst; +} + +class SIMDScalarCPYAlias<string asm, string size, Instruction inst, + RegisterClass regtype, RegisterOperand vectype, Operand idxtype> + : InstAlias<asm # "{\t$dst, $src" # size # "$index" # + # "|\t$dst, $src$index}", + (inst regtype:$dst, vectype:$src, idxtype:$index), 0>; + + +multiclass SIMDScalarCPY<string asm> { + def i8 : BaseSIMDScalarCPY<FPR8, V128, ".b", VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 
0b1000; + } + + def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src), + VectorIndexD:$idx)))), + (!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>; + + // 'DUP' mnemonic aliases. + def : SIMDScalarCPYAlias<"dup", ".b", + !cast<Instruction>(NAME#"i8"), + FPR8, V128, VectorIndexB>; + def : SIMDScalarCPYAlias<"dup", ".h", + !cast<Instruction>(NAME#"i16"), + FPR16, V128, VectorIndexH>; + def : SIMDScalarCPYAlias<"dup", ".s", + !cast<Instruction>(NAME#"i32"), + FPR32, V128, VectorIndexS>; + def : SIMDScalarCPYAlias<"dup", ".d", + !cast<Instruction>(NAME#"i64"), + FPR64, V128, VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD modified immediate instructions +//---------------------------------------------------------------------------- + +class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops, + string asm, string op_string, + string cstr, list<dag> pattern> + : I<oops, iops, asm, op_string, cstr, pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<8> imm8; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = op; + let Inst{28-19} = 0b0111100000; + let Inst{18-16} = imm8{7-5}; + let Inst{11} = op2; + let Inst{10} = 1; + let Inst{9-5} = imm8{4-0}; + let Inst{4-0} = Rd; +} + +class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype, + Operand immtype, dag opt_shift_iop, + string opt_shift, string asm, string kind, + list<dag> pattern> + : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd), + !con((ins immtype:$imm8), opt_shift_iop), asm, + "{\t$Rd" # kind # ", $imm8" # opt_shift # + "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", + "", pattern> { + let DecoderMethod = "DecodeModImmInstruction"; +} + +class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, + Operand immtype, dag opt_shift_iop, + string opt_shift, string asm, string kind, + list<dag> pattern> + : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst), + !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop), + asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # + "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", + "$Rd = $dst", pattern> { + let DecoderMethod = "DecodeModImmTiedInstruction"; +} + +class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12, + RegisterOperand vectype, string asm, + string kind, list<dag> pattern> + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, + (ins logical_vec_shift:$shift), + "$shift", asm, kind, pattern> { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12, + RegisterOperand vectype, string asm, + string kind, list<dag> pattern> + : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255, + (ins logical_vec_shift:$shift), + "$shift", asm, kind, pattern> { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + + +class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12, + RegisterOperand vectype, string asm, + string kind, list<dag> pattern> + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, + (ins logical_vec_hw_shift:$shift), + "$shift", asm, kind, pattern> { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12, + RegisterOperand vectype, string asm, + string kind, list<dag> pattern> + : 
BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255, + (ins logical_vec_hw_shift:$shift), + "$shift", asm, kind, pattern> { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode, + string asm> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, + asm, ".4h", []>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, + asm, ".8h", []>; + + def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, + asm, ".2s", []>; + def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, + asm, ".4s", []>; +} + +multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode, + bits<2> w_cmode, string asm, + SDNode OpNode> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + + def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; +} + +class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, + RegisterOperand vectype, string asm, + string kind, list<dag> pattern> + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, + (ins move_vec_shift:$shift), + "$shift", asm, kind, pattern> { + bits<1> shift; + let Inst{15-13} = cmode{3-1}; + let Inst{12} = shift; +} + +class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode, + RegisterOperand vectype, + Operand imm_type, string asm, + string kind, list<dag> pattern> + : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "", + asm, kind, pattern> { + let Inst{15-12} = cmode; +} + +class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm, + list<dag> pattern> + : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, + "\t$Rd, $imm8", "", pattern> { + let Inst{15-12} = cmode; + let DecoderMethod = "DecodeModImmInstruction"; +} + +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list<dag> pattern> + : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), + asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; + let Inst{23-22} = size; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. 
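+ // (The *_indexed defs below supply these bits from the lane index; for an .h
+ // lane, e.g.: let Inst{11} = idx{2}; let Inst{21} = idx{1}; let Inst{20} = idx{0};)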
+ let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list<dag> pattern> + : I<(outs dst_reg:$dst), + (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; + let Inst{23-22} = size; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4f16 V64:$Rd), + (OpNode (v4f16 V64:$Rn), + (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8f16 V128:$Rd), + (OpNode (v8f16 V128:$Rn), + (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2f32 V64:$Rd), + (OpNode (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4f32 V128:$Rd), + (OpNode (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", + [(set (v2f64 V128:$Rd), + (OpNode (v2f64 V128:$Rn), + (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", + [(set (f16 FPR16Op:$Rd), + (OpNode (f16 FPR16Op:$Rn), + (f16 (vector_extract (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (f32 FPR32Op:$Rd), + (OpNode (f32 FPR32Op:$Rn), + (f32 (vector_extract (v4f32 
V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", + [(set (f64 FPR64Op:$Rd), + (OpNode (f64 FPR64Op:$Rn), + (f64 (vector_extract (v2f64 V128:$Rm), + VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> { + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast<Instruction>(INST # v2i32_indexed) + V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + + // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast<Instruction>(INST # "v4i32_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 V128:$Rm), + VectorIndexD:$idx))), + (!cast<Instruction>(INST # "v2i64_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 FPR64Op:$Rm)))), + (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), + (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), + (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), + (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexD:$idx)>; +} + +multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + + def v2i32_indexed : 
BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + (OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i32 FPR32Op:$Rd), + (OpNode FPR32Op:$Rn, + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 
V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + (OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + 
let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm, + SDPatternOperator Accum> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an + // intermediate EXTRACT_SUBREG would be untyped. 
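+ // (The pattern below works around this: it widens the i32 accumulator into a
+ // vector register with SUBREG_TO_REG, reuses the v4i16_indexed instruction,
+ // and then pulls the scalar result back out of ssub with EXTRACT_SUBREG.)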
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract (v4i32 + (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))), + (i64 0))))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME # v4i16_indexed) + (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, + V128_lo:$Rm, VectorIndexH:$idx), + ssub)>; + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 + (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i64 FPR64Op:$dst), + (Accum (i64 FPR64Op:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar + (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, 
VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift by immediate +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list<dag> pattern> + : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list<dag> pattern> + : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, + FPR16, FPR16, 
vecshiftR16, asm, []> { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, + FPR32, FPR32, vecshiftR32, asm, []> { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?}, + FPR64, FPR64, vecshiftR64, asm, []> { + let Inst{21-16} = imm{5-0}; + } +} + +multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?}, + FPR64, FPR64, vecshiftR64, asm, + [(set (i64 FPR64:$Rd), + (OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), + (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; +} + +multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?}, + FPR64, FPR64, vecshiftR64, asm, + [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn), + (i32 vecshiftR64:$imm)))]> { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn, + vecshiftR64:$imm)>; +} + +multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?}, + FPR64, FPR64, vecshiftL64, asm, + [(set (v1i64 FPR64:$Rd), + (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> { + def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?}, + FPR64, FPR64, vecshiftL64, asm, []> { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?}, + FPR8, FPR16, vecshiftR8, asm, []> { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, + FPR16, FPR32, vecshiftR16, asm, []> { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, + FPR32, FPR64, vecshiftR32, asm, + [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> { + let Inst{20-16} = imm{4-0}; + } +} + +multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?}, + FPR8, FPR8, vecshiftL8, asm, []> { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, + FPR16, FPR16, vecshiftL16, asm, []> { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, + FPR32, FPR32, vecshiftL32, asm, + [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?}, + FPR64, FPR64, vecshiftL64, asm, + [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), + (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; +} + +multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> { + def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?}, + FPR8, FPR8, 
vecshiftR8, asm, []> { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, + FPR16, FPR16, vecshiftR16, asm, []> { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, + FPR32, FPR32, vecshiftR32, asm, []> { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?}, + FPR64, FPR64, vecshiftR64, asm, []> { + let Inst{21-16} = imm{5-0}; + } +} + +//---------------------------------------------------------------------------- +// AdvSIMD vector x indexed element +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm, + RegisterOperand dst_reg, RegisterOperand src_reg, + Operand immtype, + string asm, string dst_kind, string src_kind, + list<dag> pattern> + : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-23} = 0b011110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm, + RegisterOperand vectype1, RegisterOperand vectype2, + Operand immtype, + string asm, string dst_kind, string src_kind, + list<dag> pattern> + : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-23} = 0b011110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, + Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm, + Intrinsic OpNode> { + let Predicates = [HasNEON, 
HasFullFP16] in {
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V64, vecshiftR16,
+ asm, ".4h", ".4h",
+ [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR16,
+ asm, ".8h", ".8h",
+ [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+ } // Predicates = [HasNEON, HasFullFP16]
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V64, vecshiftR32,
+ asm, ".2s", ".2s",
+ [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR32,
+ asm, ".4s", ".4s",
+ [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+ V128, V128, vecshiftR64,
+ asm, ".2d", ".2d",
+ [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
+ bits<6> imm;
+ let Inst{21-16} = imm;
+ }
+}
+
+multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
+ SDPatternOperator OpNode> {
+ def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+ V64, V128, vecshiftR16Narrow,
+ asm, ".8b", ".8h",
+ [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ }
+
+ def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+ V128, V128, vecshiftR16Narrow,
+ asm#"2", ".16b", ".8h", []> {
+ bits<3> imm;
+ let Inst{18-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+ V64, V128, vecshiftR32Narrow,
+ asm, ".4h", ".4s",
+ [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ }
+
+ def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+ V128, V128, vecshiftR32Narrow,
+ asm#"2", ".8h", ".4s", []> {
+ bits<4> imm;
+ let Inst{19-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+ V64, V128, vecshiftR64Narrow,
+ asm, ".2s", ".2d",
+ [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ }
+
+ def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+ V128, V128, vecshiftR64Narrow,
+ asm#"2", ".4s", ".2d", []> {
+ bits<5> imm;
+ let Inst{20-16} = imm;
+ let hasSideEffects = 0;
+ }
+
+ // TableGen doesn't like patterns w/ INSERT_SUBREG on the instructions
+ // themselves, so put them here instead.
+
+ // Patterns involving what's effectively an insert high and a normal
+ // intrinsic, represented by CONCAT_VECTORS.
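+ // For illustration (not from the original patch): a narrowing shift whose
+ // result is concatenated onto an existing low half, conceptually something
+ // like "shrn2 v0.16b, v1.8h, #3", maps to the v16i8_shift form, with the
+ // existing 64-bit low half first placed into dsub via INSERT_SUBREG.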
+ def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)), + (!cast<Instruction>(NAME # "v16i8_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)), + (!cast<Instruction>(NAME # "v8i16_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)), + (!cast<Instruction>(NAME # "v4i32_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR64Narrow:$imm)>; +} + +multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + 
def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode 
(v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V128, V64, vecshiftL8, asm, ".8h", ".8b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm#"2", ".8h", ".16b", + [(set (v8i16 V128:$Rd), + (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V128, V64, vecshiftL16, asm, ".4s", ".4h", + [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm#"2", ".4s", ".8h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { + + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V128, V64, vecshiftL32, asm, ".2d", ".2s", + [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm#"2", ".2d", ".4s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } +} + + +//--- +// Vector load/store +//--- +// SIMD ldX/stX no-index memory references don't allow the optional +// ", #0" constant and handle post-indexing explicitly, so we use +// a more specialized parse method for them. Otherwise, it's the same as +// the general GPR64sp handling. 
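+// For example, "ld1 { v0.16b }, [x0]" is accepted but "ld1 { v0.16b }, [x0, #0]"
+// is not, while post-indexing is written out explicitly as either
+// "ld1 { v0.16b }, [x0], #16" (immediate equal to the size of the list) or
+// "ld1 { v0.16b }, [x0], x2" (register).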
+ +class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size, + string asm, dag oops, dag iops, list<dag> pattern> + : I<oops, iops, asm, "\t$Vt, [$Rn]", "", pattern> { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-23} = 0b0011000; + let Inst{22} = L; + let Inst{21-16} = 0b000000; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size, + string asm, dag oops, dag iops> + : I<oops, iops, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", []> { + bits<5> Vt; + bits<5> Rn; + bits<5> Xm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-23} = 0b0011001; + let Inst{22} = L; + let Inst{21} = 0; + let Inst{20-16} = Xm; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +// The immediate form of AdvSIMD post-indexed addressing is encoded with +// register post-index addressing from the zero register. +multiclass SIMDLdStAliases<string asm, string layout, string Count, + int Offset, int Size> { + // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" + // "ld1\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset, + (!cast<Instruction>(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. "ld1.8b { v0, v1 }, [x1], #16" + // "ld1.8b\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset, + (!cast<Instruction>(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1.8b { v0, v1 }, [x1]" + // "ld1\t$Vt, [$Rn]" + // may get mapped to + // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn) + def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]", + (!cast<Instruction>(NAME # Count # "v" # layout) + !cast<RegisterOperand>("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. "ld1.8b { v0, v1 }, [x1], x2" + // "ld1\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm) + def : InstAlias<asm # "." 
# layout # "\t$Vt, [$Rn], $Xm", + (!cast<Instruction>(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # Size):$Vt, + !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128, + int Offset64, bits<4> opcode> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, + (outs !cast<RegisterOperand>(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, + (outs !cast<RegisterOperand>(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, + (outs !cast<RegisterOperand>(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, + (outs !cast<RegisterOperand>(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, + (outs !cast<RegisterOperand>(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, + (outs !cast<RegisterOperand>(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, + (outs !cast<RegisterOperand>(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn), []>; + + + def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast<RegisterOperand>(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast<RegisterOperand>(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast<RegisterOperand>(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast<RegisterOperand>(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; + def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast<RegisterOperand>(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast<RegisterOperand>(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast<RegisterOperand>(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>; + defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>; + defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>; + defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>; + defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>; + defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>; + defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>; +} + +// Only ld1/st1 has a v1d version. 
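+// For example, "ld1 { v0.1d }, [x0]" is a valid form, but there is no ".1d"
+// arrangement for ld2/ld3/ld4 or st2/st3/st4, so only the Ld1/St1 multiclasses
+// below add the extra v1d and v1d_POST variants.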
+multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128, + int Offset64, bits<4> opcode> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { + def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), + (ins !cast<RegisterOperand>(veclist # "16b"):$Vt, + GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), + (ins !cast<RegisterOperand>(veclist # "8h"):$Vt, + GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), + (ins !cast<RegisterOperand>(veclist # "4s"):$Vt, + GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), + (ins !cast<RegisterOperand>(veclist # "2d"):$Vt, + GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), + (ins !cast<RegisterOperand>(veclist # "8b"):$Vt, + GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), + (ins !cast<RegisterOperand>(veclist # "4h"):$Vt, + GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), + (ins !cast<RegisterOperand>(veclist # "2s"):$Vt, + GPR64sp:$Rn), []>; + + def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast<RegisterOperand>(veclist # "16b"):$Vt, + GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast<RegisterOperand>(veclist # "8h"):$Vt, + GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast<RegisterOperand>(veclist # "4s"):$Vt, + GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast<RegisterOperand>(veclist # "2d"):$Vt, + GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>; + def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast<RegisterOperand>(veclist # "8b"):$Vt, + GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast<RegisterOperand>(veclist # "4h"):$Vt, + GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast<RegisterOperand>(veclist # "2s"):$Vt, + GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>; + defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>; + defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>; + defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>; + defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>; + defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>; + defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>; +} + +multiclass BaseSIMDLd1<string Count, string asm, string veclist, + int Offset128, int Offset64, bits<4> opcode> + : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> { + + // LD1 instructions have extra "1d" variants. 
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, + (outs !cast<RegisterOperand>(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast<RegisterOperand>(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>; +} + +multiclass BaseSIMDSt1<string Count, string asm, string veclist, + int Offset128, int Offset64, bits<4> opcode> + : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> { + + // ST1 instructions have extra "1d" variants. + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), + (ins !cast<RegisterOperand>(veclist # "1d"):$Vt, + GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast<RegisterOperand>(veclist # "1d"):$Vt, + GPR64sp:$Rn, + !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>; +} + +multiclass SIMDLd1Multiple<string asm> { + defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDSt1Multiple<string asm> { + defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDLd2Multiple<string asm> { + defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDSt2Multiple<string asm> { + defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDLd3Multiple<string asm> { + defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDSt3Multiple<string asm> { + defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDLd4Multiple<string asm> { + defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +multiclass SIMDSt4Multiple<string asm> { + defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +//--- +// AdvSIMD Load/store single-element +//--- + +class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode, + string asm, string operands, string cst, + dag oops, dag iops, list<dag> pattern> + : I<oops, iops, asm, operands, cst, pattern> { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; + let Inst{15-13} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode, + string asm, string operands, string cst, + dag oops, dag iops, list<dag> pattern> + : I<oops, iops, asm, operands, "$Vt = $dst," # cst, pattern> { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; + let Inst{15-13} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm, + Operand listtype> + : BaseSIMDLdStSingle<1, R, opcode, 
asm, "\t$Vt, [$Rn]", "", + (outs listtype:$Vt), (ins GPR64sp:$Rn), + []> { + let Inst{30} = Q; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = S; + let Inst{11-10} = size; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, + string asm, Operand listtype, Operand GPR64pi> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", + "$Rn = $wback", + (outs GPR64sp:$wback, listtype:$Vt), + (ins GPR64sp:$Rn, GPR64pi:$Xm), []> { + bits<5> Xm; + let Inst{30} = Q; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = S; + let Inst{11-10} = size; +} + +multiclass SIMDLdrAliases<string asm, string layout, string Count, + int Offset, int Size> { + // E.g. "ld1r { v0.8b }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset, + (!cast<Instruction>(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. "ld1r.8b { v0 }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset, + (!cast<Instruction>(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1r.8b { v0 }, [x1]" + // "ld1r.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]", + (!cast<Instruction>(NAME # "v" # layout) + !cast<RegisterOperand>("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. "ld1r.8b { v0 }, [x1], x2" + // "ld1r.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias<asm # "." 
# layout # "\t$Vt, [$Rn], $Xm", + (!cast<Instruction>(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # Size):$Vt, + !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count, + int Offset1, int Offset2, int Offset4, int Offset8> { + def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, + !cast<Operand>("VecList" # Count # "8b")>; + def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, + !cast<Operand>("VecList" # Count #"16b")>; + def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, + !cast<Operand>("VecList" # Count #"4h")>; + def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, + !cast<Operand>("VecList" # Count #"8h")>; + def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, + !cast<Operand>("VecList" # Count #"2s")>; + def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, + !cast<Operand>("VecList" # Count #"4s")>; + def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, + !cast<Operand>("VecList" # Count #"1d")>; + def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, + !cast<Operand>("VecList" # Count #"2d")>; + + def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, + !cast<Operand>("VecList" # Count # "8b"), + !cast<Operand>("GPR64pi" # Offset1)>; + def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, + !cast<Operand>("VecList" # Count # "16b"), + !cast<Operand>("GPR64pi" # Offset1)>; + def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, + !cast<Operand>("VecList" # Count # "4h"), + !cast<Operand>("GPR64pi" # Offset2)>; + def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, + !cast<Operand>("VecList" # Count # "8h"), + !cast<Operand>("GPR64pi" # Offset2)>; + def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, + !cast<Operand>("VecList" # Count # "2s"), + !cast<Operand>("GPR64pi" # Offset4)>; + def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, + !cast<Operand>("VecList" # Count # "4s"), + !cast<Operand>("GPR64pi" # Offset4)>; + def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, + !cast<Operand>("VecList" # Count # "1d"), + !cast<Operand>("GPR64pi" # Offset8)>; + def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, + !cast<Operand>("VecList" # Count # "2d"), + !cast<Operand>("GPR64pi" # Offset8)>; + + defm : SIMDLdrAliases<asm, "8b", Count, Offset1, 64>; + defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>; + defm : SIMDLdrAliases<asm, "4h", Count, Offset2, 64>; + defm : SIMDLdrAliases<asm, "8h", Count, Offset2, 128>; + defm : SIMDLdrAliases<asm, "2s", Count, Offset4, 64>; + defm : SIMDLdrAliases<asm, "4s", Count, Offset4, 128>; + defm : SIMDLdrAliases<asm, "1d", Count, Offset8, 64>; + defm : SIMDLdrAliases<asm, "2d", Count, Offset8, 128>; +} + +class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm, + dag oops, dag iops, list<dag> pattern> + : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops, + pattern> { + // idx encoded in Q:S:size fields. + bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm, + dag oops, dag iops, list<dag> pattern> + : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", + oops, iops, pattern> { + // idx encoded in Q:S:size fields. 
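+  // For a byte element the lane number is 0-15 and is split across the
+  // encoding: idx{3} goes to Q (Inst{30}), idx{2} to S (Inst{12}), and
+  // idx{1-0} to the size field (Inst{11-10}). E.g. "ld1 { v0.b }[13], [x0]"
+  // encodes lane 13 = 0b1101 as Q=1, S=1, size=0b01.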
+ bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm", + "$Rn = $wback", oops, iops, []> { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm", + "$Rn = $wback", oops, iops, []> { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} + +class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm, + dag oops, dag iops, list<dag> pattern> + : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops, + pattern> { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm, + dag oops, dag iops, list<dag> pattern> + : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", + oops, iops, pattern> { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} + +class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm", + "$Rn = $wback", oops, iops, []> { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm", + "$Rn = $wback", oops, iops, []> { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm, + dag oops, dag iops, list<dag> pattern> + : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops, + pattern> { + // idx encoded in Q:S fields. + bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm, + dag oops, dag iops, list<dag> pattern> + : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", + oops, iops, pattern> { + // idx encoded in Q:S fields. 
+ bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm", + "$Rn = $wback", oops, iops, []> { + // idx encoded in Q:S fields. + bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm", + "$Rn = $wback", oops, iops, []> { + // idx encoded in Q:S fields. + bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm, + dag oops, dag iops, list<dag> pattern> + : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops, + pattern> { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm, + dag oops, dag iops, list<dag> pattern> + : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", + oops, iops, pattern> { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm", + "$Rn = $wback", oops, iops, []> { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm", + "$Rn = $wback", oops, iops, []> { + // idx encoded in Q field. 
+ bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleB<0, R, opcode, asm, + (outs), (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, 
mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} + +multiclass SIMDLdStSingleAliases<string asm, string layout, string Type, + string Count, int Offset, Operand idxtype> { + // E.g. "ld1 { v0.8b }[0], [x1], #1" + // "ld1\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias<asm # "\t$Vt$idx, [$Rn], #" # Offset, + (!cast<Instruction>(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # layout):$Vt, + idxtype:$idx, XZR), 1>; + + // E.g. "ld1.8b { v0 }[0], [x1], #1" + // "ld1.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], #" # Offset, + (!cast<Instruction>(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # "128"):$Vt, + idxtype:$idx, XZR), 0>; + + // E.g. "ld1.8b { v0 }[0], [x1]" + // "ld1.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn]", + (!cast<Instruction>(NAME # Type) + !cast<RegisterOperand>("VecList" # Count # "128"):$Vt, + idxtype:$idx, GPR64sp:$Rn), 0>; + + // E.g. "ld1.8b { v0 }[0], [x1], x2" + // "ld1.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias<asm # "." 
# layout # "\t$Vt$idx, [$Rn], $Xm", + (!cast<Instruction>(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast<RegisterOperand>("VecList" # Count # "128"):$Vt, + idxtype:$idx, + !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass SIMDLdSt1SingleAliases<string asm> { + defm : SIMDLdStSingleAliases<asm, "b", "i8", "One", 1, VectorIndexB>; + defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>; + defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>; + defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>; +} + +multiclass SIMDLdSt2SingleAliases<string asm> { + defm : SIMDLdStSingleAliases<asm, "b", "i8", "Two", 2, VectorIndexB>; + defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4, VectorIndexH>; + defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8, VectorIndexS>; + defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>; +} + +multiclass SIMDLdSt3SingleAliases<string asm> { + defm : SIMDLdStSingleAliases<asm, "b", "i8", "Three", 3, VectorIndexB>; + defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6, VectorIndexH>; + defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>; + defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>; +} + +multiclass SIMDLdSt4SingleAliases<string asm> { + defm : SIMDLdStSingleAliases<asm, "b", "i8", "Four", 4, VectorIndexB>; + defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8, VectorIndexH>; + defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>; + defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>; +} +} // end of 'let Predicates = [HasNEON]' + +//---------------------------------------------------------------------------- +// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract +//---------------------------------------------------------------------------- + +let Predicates = [HasNEON, HasV8_1a] in { + +class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, + RegisterOperand regtype, string asm, + string kind, list<dag> pattern> + : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind, + pattern> { +} +multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm, + SDPatternOperator Accum> { + def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h", + [(set (v4i16 V64:$dst), + (Accum (v4i16 V64:$Rd), + (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn), + (v4i16 V64:$Rm)))))]>; + def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h", + [(set (v8i16 V128:$dst), + (Accum (v8i16 V128:$Rd), + (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn), + (v8i16 V128:$Rm)))))]>; + def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s", + [(set (v2i32 V64:$dst), + (Accum (v2i32 V64:$Rd), + (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn), + (v2i32 V64:$Rm)))))]>; + def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn), + (v4i32 V128:$Rm)))))]>; +} + +multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm, + SDPatternOperator Accum> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V64, V64, V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$dst), + (Accum (v4i16 V64:$Rd), + (v4i16 (int_aarch64_neon_sqrdmulh + (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> 
idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$dst), + (Accum (v8i16 V128:$Rd), + (v8i16 (int_aarch64_neon_sqrdmulh + (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V64, V64, V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$dst), + (Accum (v2i32 V64:$Rd), + (v2i32 (int_aarch64_neon_sqrdmulh + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but + // an intermediate EXTRACT_SUBREG would be untyped. + // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we + // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..))) + def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract + (v4i32 (insert_subvector + (undef), + (v2i32 (int_aarch64_neon_sqrdmulh + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 + (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (i32 0))), + (i64 0))))), + (EXTRACT_SUBREG + (v2i32 (!cast<Instruction>(NAME # v2i32_indexed) + (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + FPR32Op:$Rd, + ssub)), + V64:$Rn, + V128:$Rm, + VectorIndexS:$idx)), + ssub)>; + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqrdmulh + (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but + // an intermediate EXTRACT_SUBREG would be untyped. 
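+  // Instead, the pattern below stays in vector registers: the scalar
+  // accumulator in $Rd is moved into lane 0 of a 128-bit register with
+  // INSERT_SUBREG/ssub, the v4i32 indexed instruction performs the
+  // multiply-accumulate, and lane 0 of the result is pulled back out with
+  // EXTRACT_SUBREG/ssub.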
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract + (v4i32 (int_aarch64_neon_sqrdmulh + (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 + (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (i64 0))))), + (EXTRACT_SUBREG + (v4i32 (!cast<Instruction>(NAME # v4i32_indexed) + (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + FPR32Op:$Rd, + ssub)), + V128:$Rn, + V128:$Rm, + VectorIndexS:$idx)), + ssub)>; + + def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, + FPR16Op, FPR16Op, V128_lo, + VectorIndexH, asm, ".h", "", "", ".h", + []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i32 FPR32Op:$dst), + (Accum (i32 FPR32Op:$Rd), + (i32 (int_aarch64_neon_sqrdmulh + (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} +} // let Predicates = [HasNeon, HasV8_1a] + +//---------------------------------------------------------------------------- +// Crypto extensions +//---------------------------------------------------------------------------- + +let Predicates = [HasCrypto] in { +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr, + list<dag> pat> + : I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0100111000101000; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class AESInst<bits<4> opc, string asm, Intrinsic OpNode> + : AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + +class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode> + : AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn), + "$Rd = $dst", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind, + dag oops, dag iops, list<dag> pat> + : I<oops, iops, asm, + "{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" # + "|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-21} = 0b01011110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst), + (ins FPR128:$Rd, FPR32:$Rn, V128:$Rm), + [(set (v4i32 FPR128:$dst), + (OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn), + (v4i32 V128:$Rm)))]>; + +class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst), + (ins V128:$Rd, V128:$Rn, V128:$Rm), + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 V128:$Rm)))]>; + +class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst), + (ins FPR128:$Rd, FPR128:$Rn, V128:$Rm), + [(set (v4i32 FPR128:$dst), + (OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn), + (v4i32 V128:$Rm)))]>; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA2OpInst<bits<4> opc, string asm, string kind, + string cstr, dag oops, dag iops, + 
list<dag> pat> + : I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind # + "|" # kind # "\t$Rd, $Rn}", cstr, pat>, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0101111000101000; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode> + : SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst), + (ins V128:$Rd, V128:$Rn), + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; + +class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode> + : SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn), + [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>; +} // end of 'let Predicates = [HasCrypto]' + +//---------------------------------------------------------------------------- +// v8.1 atomic instructions extension: +// * CAS +// * CASP +// * SWP +// * LDOPregister<OP>, and aliases STOPregister<OP> + +// Instruction encodings: +// +// 31 30|29 24|23|22|21|20 16|15|14 10|9 5|4 0 +// CAS SZ |001000|1 |A |1 |Rs |R |11111 |Rn |Rt +// CASP 0|SZ|001000|0 |A |1 |Rs |R |11111 |Rn |Rt +// SWP SZ |111000|A |R |1 |Rs |1 |OPC|00|Rn |Rt +// LD SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |Rt +// ST SZ |111000|A |R |1 |Rs |0 |OPC|00|Rn |11111 + +// Instruction syntax: +// +// CAS{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>] +// CAS{<order>} <Xs>, <Xt>, [<Xn|SP>] +// CASP{<order>} <Ws>, <W(s+1)>, <Wt>, <W(t+1)>, [<Xn|SP>] +// CASP{<order>} <Xs>, <X(s+1)>, <Xt>, <X(t+1)>, [<Xn|SP>] +// SWP{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>] +// SWP{<order>} <Xs>, <Xt>, [<Xn|SP>] +// LD<OP>{<order>}[<size>] <Ws>, <Wt>, [<Xn|SP>] +// LD<OP>{<order>} <Xs>, <Xt>, [<Xn|SP>] +// ST<OP>{<order>}[<size>] <Ws>, [<Xn|SP>] +// ST<OP>{<order>} <Xs>, [<Xn|SP>] + +let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in +class BaseCASEncoding<dag oops, dag iops, string asm, string operands, + string cstr, list<dag> pattern> + : I<oops, iops, asm, operands, cstr, pattern> { + bits<2> Sz; + bit NP; + bit Acq; + bit Rel; + bits<5> Rs; + bits<5> Rn; + bits<5> Rt; + let Inst{31-30} = Sz; + let Inst{29-24} = 0b001000; + let Inst{23} = NP; + let Inst{22} = Acq; + let Inst{21} = 0b1; + let Inst{20-16} = Rs; + let Inst{15} = Rel; + let Inst{14-10} = 0b11111; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class BaseCAS<string order, string size, RegisterClass RC> + : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), + "cas" # order # size, "\t$Rs, $Rt, [$Rn]", + "$out = $Rs",[]> { + let NP = 1; +} + +multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> { + let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseCAS<order, "b", GPR32>; + let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseCAS<order, "h", GPR32>; + let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseCAS<order, "", GPR32>; + let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseCAS<order, "", GPR64>; +} + +class BaseCASP<string order, string size, RegisterOperand RC> + : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), + "casp" # order # size, "\t$Rs, $Rt, [$Rn]", + "$out = $Rs",[]> { + let NP = 0; +} + +multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> { + let Sz = 0b00, Acq = Acq, Rel = Rel in + def s : BaseCASP<order, "", WSeqPairClassOperand>; + let Sz = 0b01, Acq = Acq, Rel = Rel in + def d : BaseCASP<order, "", XSeqPairClassOperand>; +} + +let Predicates = [HasV8_1a] in +class BaseSWP<string order, string size, RegisterClass RC> + : I<(outs 
RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size, + "\t$Rs, $Rt, [$Rn]","",[]> { + bits<2> Sz; + bit Acq; + bit Rel; + bits<5> Rs; + bits<3> opc = 0b000; + bits<5> Rn; + bits<5> Rt; + let Inst{31-30} = Sz; + let Inst{29-24} = 0b111000; + let Inst{23} = Acq; + let Inst{22} = Rel; + let Inst{21} = 0b1; + let Inst{20-16} = Rs; + let Inst{15} = 0b1; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Swap<bits<1> Acq, bits<1> Rel, string order> { + let Sz = 0b00, Acq = Acq, Rel = Rel in def b : BaseSWP<order, "b", GPR32>; + let Sz = 0b01, Acq = Acq, Rel = Rel in def h : BaseSWP<order, "h", GPR32>; + let Sz = 0b10, Acq = Acq, Rel = Rel in def s : BaseSWP<order, "", GPR32>; + let Sz = 0b11, Acq = Acq, Rel = Rel in def d : BaseSWP<order, "", GPR64>; +} + +let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in +class BaseLDOPregister<string op, string order, string size, RegisterClass RC> + : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size, + "\t$Rs, $Rt, [$Rn]","",[]> { + bits<2> Sz; + bit Acq; + bit Rel; + bits<5> Rs; + bits<3> opc; + bits<5> Rn; + bits<5> Rt; + let Inst{31-30} = Sz; + let Inst{29-24} = 0b111000; + let Inst{23} = Acq; + let Inst{22} = Rel; + let Inst{21} = 0b1; + let Inst{20-16} = Rs; + let Inst{15} = 0b0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass LDOPregister<bits<3> opc, string op, bits<1> Acq, bits<1> Rel, + string order> { + let Sz = 0b00, Acq = Acq, Rel = Rel, opc = opc in + def b : BaseLDOPregister<op, order, "b", GPR32>; + let Sz = 0b01, Acq = Acq, Rel = Rel, opc = opc in + def h : BaseLDOPregister<op, order, "h", GPR32>; + let Sz = 0b10, Acq = Acq, Rel = Rel, opc = opc in + def s : BaseLDOPregister<op, order, "", GPR32>; + let Sz = 0b11, Acq = Acq, Rel = Rel, opc = opc in + def d : BaseLDOPregister<op, order, "", GPR64>; +} + +let Predicates = [HasV8_1a] in +class BaseSTOPregister<string asm, RegisterClass OP, Register Reg, + Instruction inst> : + InstAlias<asm # "\t$Rs, [$Rn]", (inst Reg, OP:$Rs, GPR64sp:$Rn)>; + +multiclass STOPregister<string asm, string instr> { + def : BaseSTOPregister<asm # "lb", GPR32, WZR, + !cast<Instruction>(instr # "Lb")>; + def : BaseSTOPregister<asm # "lh", GPR32, WZR, + !cast<Instruction>(instr # "Lh")>; + def : BaseSTOPregister<asm # "l", GPR32, WZR, + !cast<Instruction>(instr # "Ls")>; + def : BaseSTOPregister<asm # "l", GPR64, XZR, + !cast<Instruction>(instr # "Ld")>; + def : BaseSTOPregister<asm # "b", GPR32, WZR, + !cast<Instruction>(instr # "b")>; + def : BaseSTOPregister<asm # "h", GPR32, WZR, + !cast<Instruction>(instr # "h")>; + def : BaseSTOPregister<asm, GPR32, WZR, + !cast<Instruction>(instr # "s")>; + def : BaseSTOPregister<asm, GPR64, XZR, + !cast<Instruction>(instr # "d")>; +} + +//---------------------------------------------------------------------------- +// Allow the size specifier tokens to be upper case, not just lower. 
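+// For example, "fadd v0.4S, v1.4S, v2.4S" is parsed exactly as
+// "fadd v0.4s, v1.4s, v2.4s"; each alias below simply maps the upper-case
+// spelling of a size token onto its canonical lower-case form.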
+def : TokenAlias<".8B", ".8b">; +def : TokenAlias<".4H", ".4h">; +def : TokenAlias<".2S", ".2s">; +def : TokenAlias<".1D", ".1d">; +def : TokenAlias<".16B", ".16b">; +def : TokenAlias<".8H", ".8h">; +def : TokenAlias<".4S", ".4s">; +def : TokenAlias<".2D", ".2d">; +def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".2H", ".2h">; +def : TokenAlias<".B", ".b">; +def : TokenAlias<".H", ".h">; +def : TokenAlias<".S", ".s">; +def : TokenAlias<".D", ".d">; +def : TokenAlias<".Q", ".q">; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp new file mode 100644 index 0000000..3ef3c8b --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -0,0 +1,3015 @@ +//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#include "AArch64GenInstrInfo.inc" + +AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) + : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), + RI(STI.getTargetTriple()), Subtarget(STI) {} + +/// GetInstSize - Return the number of bytes of code the specified +/// instruction may be. This returns the maximum number of bytes. +unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + if (MI->getOpcode() == AArch64::INLINEASM) + return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + + const MCInstrDesc &Desc = MI->getDesc(); + switch (Desc.getOpcode()) { + default: + // Anything not explicitly designated otherwise is a nomal 4-byte insn. + return 4; + case TargetOpcode::DBG_VALUE: + case TargetOpcode::EH_LABEL: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + return 0; + } + + llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size"); +} + +static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, + SmallVectorImpl<MachineOperand> &Cond) { + // Block ends with fall-through condbranch. 
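+  // The condition is encoded into Cond as follows: for a plain Bcc only the
+  // condition-code operand is pushed; for CBZ/CBNZ and TBZ/TBNZ a leading -1
+  // immediate marks a folded compare-and-branch, followed by the opcode, the
+  // register and, for TBZ/TBNZ, the bit number. ReverseBranchCondition and
+  // instantiateCondBranch below rely on this layout.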
+ switch (LastInst->getOpcode()) { + default: + llvm_unreachable("Unknown branch instruction?"); + case AArch64::Bcc: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + Target = LastInst->getOperand(2).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + Cond.push_back(LastInst->getOperand(1)); + } +} + +// Branch analysis. +bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return false; + + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); + } + } + } + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with a B and a Bcc, handle it. + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two unconditional branches, handle it. The second + // one is not executed, so remove it. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. 
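+  // The indirect branch itself cannot be analyzed, so we still return true,
+  // but with AllowModify the trailing unconditional branch is unreachable and
+  // may be deleted.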
+ if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + + // Otherwise, can't handle this. + return true; +} + +bool AArch64InstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); + } else { + // Folded compare-and-branch + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown conditional branch!"); + case AArch64::CBZW: + Cond[1].setImm(AArch64::CBNZW); + break; + case AArch64::CBNZW: + Cond[1].setImm(AArch64::CBZW); + break; + case AArch64::CBZX: + Cond[1].setImm(AArch64::CBNZX); + break; + case AArch64::CBNZX: + Cond[1].setImm(AArch64::CBZX); + break; + case AArch64::TBZW: + Cond[1].setImm(AArch64::TBNZW); + break; + case AArch64::TBNZW: + Cond[1].setImm(AArch64::TBZW); + break; + case AArch64::TBZX: + Cond[1].setImm(AArch64::TBNZX); + break; + case AArch64::TBNZX: + Cond[1].setImm(AArch64::TBZX); + break; + } + } + + return false; +} + +unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return 0; + + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) + return 1; + --I; + if (!isCondBranchOpcode(I->getOpcode())) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +void AArch64InstrInfo::instantiateCondBranch( + MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + ArrayRef<MachineOperand> Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); + } else { + // Folded compare-and-branch + // Note that we use addOperand instead of addReg to keep the flags. + const MachineInstrBuilder MIB = + BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]); + if (Cond.size() > 3) + MIB.addImm(Cond[3].getImm()); + MIB.addMBB(TBB); + } +} + +unsigned AArch64InstrInfo::InsertBranch( + MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) // Unconditional branch? + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); + else + instantiateCondBranch(MBB, DL, TBB, Cond); + return 1; + } + + // Two-way conditional branch. + instantiateCondBranch(MBB, DL, TBB, Cond); + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); + return 2; +} + +// Find the original register that VReg is copied from. +static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { + while (TargetRegisterInfo::isVirtualRegister(VReg)) { + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (!DefMI->isFullCopy()) + return VReg; + VReg = DefMI->getOperand(1).getReg(); + } + return VReg; +} + +// Determine if VReg is defined by an instruction that can be folded into a +// csel instruction. If so, return the folded opcode, and the replacement +// register. 
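+// For example (illustrative): if the register is defined by
+//   %vreg1 = ADDWri %vreg0, 1, 0        (an "add" of the constant 1)
+// this returns AArch64::CSINCWr and reports %vreg0 as the replacement
+// register, so the select can be emitted as a single csinc instead of an
+// add followed by a csel.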
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, + unsigned *NewVReg = nullptr) { + VReg = removeCopies(MRI, VReg); + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + return 0; + + bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned Opc = 0; + unsigned SrcOpNum = 0; + switch (DefMI->getOpcode()) { + case AArch64::ADDSXri: + case AArch64::ADDSWri: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to ADDXri and ADDWri. + case AArch64::ADDXri: + case AArch64::ADDWri: + // add x, 1 -> csinc. + if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || + DefMI->getOperand(3).getImm() != 0) + return 0; + SrcOpNum = 1; + Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; + break; + + case AArch64::ORNXrr: + case AArch64::ORNWrr: { + // not x -> csinv, represented as orn dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; + break; + } + + case AArch64::SUBSXrr: + case AArch64::SUBSWrr: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to SUBXrr and SUBWrr. + case AArch64::SUBXrr: + case AArch64::SUBWrr: { + // neg x -> csneg, represented as sub dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; + break; + } + default: + return 0; + } + assert(Opc && SrcOpNum && "Missing parameters"); + + if (NewVReg) + *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); + return Opc; +} + +bool AArch64InstrInfo::canInsertSelect( + const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, + int &FalseCycles) const { + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) + return false; + + // Expanding cbz/tbz requires an extra cycle of latency on the condition. + unsigned ExtraCondLat = Cond.size() != 1; + + // GPRs are handled by csel. + // FIXME: Fold in x+1, -x, and ~x when applicable. + if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || + AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + // Single-cycle csel, csinc, csinv, and csneg. + CondCycles = 1 + ExtraCondLat; + TrueCycles = FalseCycles = 1; + if (canFoldIntoCSel(MRI, TrueReg)) + TrueCycles = 0; + else if (canFoldIntoCSel(MRI, FalseReg)) + FalseCycles = 0; + return true; + } + + // Scalar floating point is handled by fcsel. + // FIXME: Form fabs, fmin, and fmax when applicable. + if (AArch64::FPR64RegClass.hasSubClassEq(RC) || + AArch64::FPR32RegClass.hasSubClassEq(RC)) { + CondCycles = 5 + ExtraCondLat; + TrueCycles = FalseCycles = 2; + return true; + } + + // Can't do vectors. 
+ return false; +} + +void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + // Parse the condition code, see parseCondBranch() above. + AArch64CC::CondCode CC; + switch (Cond.size()) { + default: + llvm_unreachable("Unknown condition opcode in Cond"); + case 1: // b.cc + CC = AArch64CC::CondCode(Cond[0].getImm()); + break; + case 3: { // cbz/cbnz + // We must insert a compare against 0. + bool Is64Bit; + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::CBZW: + Is64Bit = 0; + CC = AArch64CC::EQ; + break; + case AArch64::CBZX: + Is64Bit = 1; + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + Is64Bit = 0; + CC = AArch64CC::NE; + break; + case AArch64::CBNZX: + Is64Bit = 1; + CC = AArch64CC::NE; + break; + } + unsigned SrcReg = Cond[2].getReg(); + if (Is64Bit) { + // cmp reg, #0 is actually subs xzr, reg, #0. + MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } else { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } + break; + } + case 4: { // tbz/tbnz + // We must insert a tst instruction. + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::EQ; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::NE; + break; + } + // cmp reg, #foo is actually ands xzr, reg, #1<<foo. + if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW) + BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR) + .addReg(Cond[2].getReg()) + .addImm( + AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32)); + else + BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR) + .addReg(Cond[2].getReg()) + .addImm( + AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64)); + break; + } + } + + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + bool TryFold = false; + if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) { + RC = &AArch64::GPR64RegClass; + Opc = AArch64::CSELXr; + TryFold = true; + } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) { + RC = &AArch64::GPR32RegClass; + Opc = AArch64::CSELWr; + TryFold = true; + } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) { + RC = &AArch64::FPR64RegClass; + Opc = AArch64::FCSELDrrr; + } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) { + RC = &AArch64::FPR32RegClass; + Opc = AArch64::FCSELSrrr; + } + assert(RC && "Unsupported regclass"); + + // Try folding simple instructions into the csel. + if (TryFold) { + unsigned NewVReg = 0; + unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg); + if (FoldedOpc) { + // The folded opcodes csinc, csinc and csneg apply the operation to + // FalseReg, so we need to invert the condition. + CC = AArch64CC::getInvertedCondCode(CC); + TrueReg = FalseReg; + } else + FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg); + + // Fold the operation. Leave any dead instructions for DCE to clean up. + if (FoldedOpc) { + FalseReg = NewVReg; + Opc = FoldedOpc; + // The extends the live range of NewVReg. 
+ MRI.clearKillFlags(NewVReg); + } + } + + // Pull all virtual register into the appropriate class. + MRI.constrainRegClass(TrueReg, RC); + MRI.constrainRegClass(FalseReg, RC); + + // Insert the csel. + BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm( + CC); +} + +/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. +static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { + uint64_t Imm = MI->getOperand(1).getImm(); + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); +} + +// FIXME: this implementation should be micro-architecture dependent, so a +// micro-architecture target hook should be introduced here in future. +bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { + if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53()) + return MI->isAsCheapAsAMove(); + + switch (MI->getOpcode()) { + default: + return false; + + // add/sub on register without shift + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return (MI->getOperand(3).getImm() == 0); + + // logical ops on immediate + case AArch64::ANDWri: + case AArch64::ANDXri: + case AArch64::EORWri: + case AArch64::EORXri: + case AArch64::ORRWri: + case AArch64::ORRXri: + return true; + + // logical ops on register without shift + case AArch64::ANDWrr: + case AArch64::ANDXrr: + case AArch64::BICWrr: + case AArch64::BICXrr: + case AArch64::EONWrr: + case AArch64::EONXrr: + case AArch64::EORWrr: + case AArch64::EORXrr: + case AArch64::ORNWrr: + case AArch64::ORNXrr: + case AArch64::ORRWrr: + case AArch64::ORRXrr: + return true; + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or + // ORRXri, it is as cheap as MOV + case AArch64::MOVi32imm: + return canBeExpandedToORR(MI, 32); + case AArch64::MOVi64imm: + return canBeExpandedToORR(MI, 64); + } + + llvm_unreachable("Unknown opcode to check as cheap as a move!"); +} + +bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::SBFMXri: // aka sxtw + case AArch64::UBFMXri: // aka uxtw + // Check for the 32 -> 64 bit extension case, these instructions can do + // much more. + if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) + return false; + // This is a signed or unsigned 32 -> 64 bit extension. + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = AArch64::sub_32; + return true; + } +} + +bool +AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, + MachineInstr *MIb, + AliasAnalysis *AA) const { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + unsigned BaseRegA = 0, BaseRegB = 0; + int OffsetA = 0, OffsetB = 0; + int WidthA = 0, WidthB = 0; + + assert(MIa && MIa->mayLoadOrStore() && "MIa must be a load or store."); + assert(MIb && MIb->mayLoadOrStore() && "MIb must be a load or store."); + + if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() || + MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + return false; + + // Retrieve the base register, offset from the base register and width. Width + // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). 
If + // base registers are identical, and the offset of a lower memory access + + // the width doesn't overlap the offset of a higher memory access, + // then the memory accesses are different. + if (getMemOpBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) && + getMemOpBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) { + if (BaseRegA == BaseRegB) { + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; + if (LowOffset + LowWidth <= HighOffset) + return true; + } + } + return false; +} + +/// analyzeCompare - For a comparison instruction, return the source registers +/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. +/// Return true if the comparison instruction can be analyzed. +bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + case AArch64::SUBSXrr: + case AArch64::SUBSXrs: + case AArch64::SUBSXrx: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ADDSXrr: + case AArch64::ADDSXrs: + case AArch64::ADDSXrx: + // Replace SUBSWrr with SUBWrr if NZCV is not used. + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = MI->getOperand(2).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + // FIXME: In order to convert CmpValue to 0 or 1 + CmpValue = (MI->getOperand(2).getImm() != 0); + return true; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + // ANDS does not use the same encoding scheme as the others xxxS + // instructions. + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + // FIXME:The return val type of decodeLogicalImmediate is uint64_t, + // while the type of CmpValue is int. When converting uint64_t to int, + // the high 32 bits of uint64_t will be lost. + // In fact it causes a bug in spec2006-483.xalancbmk + // CmpValue is only used to compare with zero in OptimizeCompareInstr + CmpValue = (AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), + MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0); + return true; + } + + return false; +} + +static bool UpdateOperandRegClass(MachineInstr *Instr) { + MachineBasicBlock *MBB = Instr->getParent(); + assert(MBB && "Can't get MachineBasicBlock here"); + MachineFunction *MF = MBB->getParent(); + assert(MF && "Can't get MachineFunction here"); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + ++OpIdx) { + MachineOperand &MO = Instr->getOperand(OpIdx); + const TargetRegisterClass *OpRegCstraints = + Instr->getRegClassConstraint(OpIdx, TII, TRI); + + // If there's no constraint, there's nothing to do. + if (!OpRegCstraints) + continue; + // If the operand is a frame index, there's nothing to do here. + // A frame index operand will resolve correctly during PEI. 
+ if (MO.isFI()) + continue; + + assert(MO.isReg() && + "Operand has register constraints without being a register!"); + + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (!OpRegCstraints->contains(Reg)) + return false; + } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && + !MRI->constrainRegClass(Reg, OpRegCstraints)) + return false; + } + + return true; +} + +/// \brief Return the opcode that does not set flags when possible - otherwise +/// return the original opcode. The caller is responsible to do the actual +/// substitution and legality checking. +static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { + // Don't convert all compare instructions, because for some the zero register + // encoding becomes the sp register. + bool MIDefinesZeroReg = false; + if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR)) + MIDefinesZeroReg = true; + + switch (MI->getOpcode()) { + default: + return MI->getOpcode(); + case AArch64::ADDSWrr: + return AArch64::ADDWrr; + case AArch64::ADDSWri: + return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; + case AArch64::ADDSWrs: + return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; + case AArch64::ADDSWrx: + return AArch64::ADDWrx; + case AArch64::ADDSXrr: + return AArch64::ADDXrr; + case AArch64::ADDSXri: + return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; + case AArch64::ADDSXrs: + return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; + case AArch64::ADDSXrx: + return AArch64::ADDXrx; + case AArch64::SUBSWrr: + return AArch64::SUBWrr; + case AArch64::SUBSWri: + return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; + case AArch64::SUBSWrs: + return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; + case AArch64::SUBSWrx: + return AArch64::SUBWrx; + case AArch64::SUBSXrr: + return AArch64::SUBXrr; + case AArch64::SUBSXri: + return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; + case AArch64::SUBSXrs: + return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; + case AArch64::SUBSXrx: + return AArch64::SUBXrx; + } +} + +/// True when condition code could be modified on the instruction +/// trace starting at from and ending at to. +static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To, + const bool CheckOnlyCCWrites, + const TargetRegisterInfo *TRI) { + // We iterate backward starting \p To until we hit \p From + MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin(); + + // Early exit if To is at the beginning of the BB. + if (I == B) + return true; + + // Check whether the definition of SrcReg is in the same basic block as + // Compare. If not, assume the condition code gets modified on some path. + if (To->getParent() != From->getParent()) + return true; + + // Check that NZCV isn't set on the trace. + for (--I; I != E; --I) { + const MachineInstr &Instr = *I; + + if (Instr.modifiesRegister(AArch64::NZCV, TRI) || + (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI))) + // This instruction modifies or uses NZCV after the one we want to + // change. + return true; + if (I == B) + // We currently don't allow the instruction trace to cross basic + // block boundaries + return true; + } + return false; +} +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register. 
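+/// For example (illustrative): a "cmp w0, #0" whose result register is unused
+/// can be removed when w0 is produced by "add w0, w1, #4" in the same block;
+/// the add is rewritten to "adds w0, w1, #4" and the compare is erased,
+/// provided NZCV is not touched in between and no later user of it needs the
+/// V bit.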
+bool AArch64InstrInfo::optimizeCompareInstr( + MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { + + // Replace SUBSWrr with SUBWrr if NZCV is not used. + int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); + if (Cmp_NZCV != -1) { + if (CmpInstr->definesRegister(AArch64::WZR) || + CmpInstr->definesRegister(AArch64::XZR)) { + CmpInstr->eraseFromParent(); + return true; + } + unsigned Opc = CmpInstr->getOpcode(); + unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); + if (NewOpc == Opc) + return false; + const MCInstrDesc &MCID = get(NewOpc); + CmpInstr->setDesc(MCID); + CmpInstr->RemoveOperand(Cmp_NZCV); + bool succeeded = UpdateOperandRegClass(CmpInstr); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + return true; + } + + // Continue only if we have a "ri" where immediate is zero. + // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare + // function. + assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!"); + if (CmpValue != 0 || SrcReg2 != 0) + return false; + + // CmpInstr is a Compare instruction if destination register is not used. + if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + return false; + + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + bool CheckOnlyCCWrites = false; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI)) + return false; + + unsigned NewOpc = MI->getOpcode(); + switch (MI->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSWrr: + case AArch64::SUBSWri: + case AArch64::SUBSXrr: + case AArch64::SUBSXri: + break; + case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; + case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; + case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; + case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; + case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; + case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; + case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; + case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; + case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; + case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; + case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; + case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; + case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; + case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; + } + + // Scan forward for the use of NZCV. + // When checking against MI: if it's a conditional code requires + // checking of V bit, then this is not safe to do. + // It is safe to remove CmpInstr if NZCV is redefined or killed. + // If we are done with the basic block, we need to check whether NZCV is + // live-out. 
+ bool IsSafe = false; + for (MachineBasicBlock::iterator I = CmpInstr, + E = CmpInstr->getParent()->end(); + !IsSafe && ++I != E;) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; + ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { + IsSafe = true; + break; + } + if (!MO.isReg() || MO.getReg() != AArch64::NZCV) + continue; + if (MO.isDef()) { + IsSafe = true; + break; + } + + // Decode the condition code. + unsigned Opc = Instr.getOpcode(); + AArch64CC::CondCode CC; + switch (Opc) { + default: + return false; + case AArch64::Bcc: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); + break; + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); + break; + } + + // It is not safe to remove Compare instruction if Overflow(V) is used. + switch (CC) { + default: + // NZCV can be used multiple times, we should continue. + break; + case AArch64CC::VS: + case AArch64CC::VC: + case AArch64CC::GE: + case AArch64CC::LT: + case AArch64CC::GT: + case AArch64CC::LE: + return false; + } + } + } + + // If NZCV is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if (!IsSafe) { + MachineBasicBlock *ParentBlock = CmpInstr->getParent(); + for (auto *MBB : ParentBlock->successors()) + if (MBB->isLiveIn(AArch64::NZCV)) + return false; + } + + // Update the instruction to set NZCV. + MI->setDesc(get(NewOpc)); + CmpInstr->eraseFromParent(); + bool succeeded = UpdateOperandRegClass(MI); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + MI->addRegisterDefined(AArch64::NZCV, TRI); + return true; +} + +bool +AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD) + return false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned Reg = MI->getOperand(0).getReg(); + const GlobalValue *GV = + cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + const TargetMachine &TM = MBB.getParent()->getTarget(); + unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); + const unsigned char MO_NC = AArch64II::MO_NC; + + if ((OpFlags & AArch64II::MO_GOT) != 0) { + BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT); + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill).addImm(0) + .addMemOperand(*MI->memoperands_begin()); + } else if (TM.getCodeModel() == CodeModel::Large) { + BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) + .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); + BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32); + BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16); + BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill).addImm(0) + 
        .addMemOperand(*MI->memoperands_begin());
+  } else {
+    BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
+        .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
+    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, LoFlags)
+        .addMemOperand(*MI->memoperands_begin());
+  }
+
+  MBB.erase(MI);
+
+  return true;
+}
+
+/// Return true if this instruction has a shifted register operand with a
+/// non-zero immediate.
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::ADDSWrs:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs:
+  case AArch64::ANDSWrs:
+  case AArch64::ANDSXrs:
+  case AArch64::ANDWrs:
+  case AArch64::ANDXrs:
+  case AArch64::BICSWrs:
+  case AArch64::BICSXrs:
+  case AArch64::BICWrs:
+  case AArch64::BICXrs:
+  case AArch64::CRC32Brr:
+  case AArch64::CRC32CBrr:
+  case AArch64::CRC32CHrr:
+  case AArch64::CRC32CWrr:
+  case AArch64::CRC32CXrr:
+  case AArch64::CRC32Hrr:
+  case AArch64::CRC32Wrr:
+  case AArch64::CRC32Xrr:
+  case AArch64::EONWrs:
+  case AArch64::EONXrs:
+  case AArch64::EORWrs:
+  case AArch64::EORXrs:
+  case AArch64::ORNWrs:
+  case AArch64::ORNXrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+  case AArch64::SUBSWrs:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBXrs:
+    if (MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+  return false;
+}
+
+/// Return true if this instruction has an extended register operand with a
+/// non-zero immediate.
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::ADDSWrx:
+  case AArch64::ADDSXrx:
+  case AArch64::ADDSXrx64:
+  case AArch64::ADDWrx:
+  case AArch64::ADDXrx:
+  case AArch64::ADDXrx64:
+  case AArch64::SUBSWrx:
+  case AArch64::SUBSXrx:
+  case AArch64::SUBSXrx64:
+  case AArch64::SUBWrx:
+  case AArch64::SUBXrx:
+  case AArch64::SUBXrx64:
+    if (MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
+
+  return false;
+}
+
+// Return true if this instruction simply sets its single destination register
+// to zero. This is equivalent to a register rename of the zero-register.
+bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::MOVZWi:
+  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
+    if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) {
+      assert(MI->getDesc().getNumOperands() == 3 &&
+             MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+      return true;
+    }
+    break;
+  case AArch64::ANDWri: // and Rd, Rzr, #imm
+    return MI->getOperand(1).getReg() == AArch64::WZR;
+  case AArch64::ANDXri:
+    return MI->getOperand(1).getReg() == AArch64::XZR;
+  case TargetOpcode::COPY:
+    return MI->getOperand(1).getReg() == AArch64::WZR;
+  }
+  return false;
+}
+
+// Return true if this instruction simply renames a general register without
+// modifying bits.
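+// (e.g. "orr xd, xzr, xm" with LSL #0, or "add xd, xn, #0"; illustrative.)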
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case TargetOpcode::COPY: {
+    // GPR32 copies will be lowered to ORRXrs
+    unsigned DstReg = MI->getOperand(0).getReg();
+    return (AArch64::GPR32RegClass.contains(DstReg) ||
+            AArch64::GPR64RegClass.contains(DstReg));
+  }
+  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
+    if (MI->getOperand(1).getReg() == AArch64::XZR) {
+      assert(MI->getDesc().getNumOperands() == 4 &&
+             MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+      return true;
+    }
+    break;
+  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
+    if (MI->getOperand(2).getImm() == 0) {
+      assert(MI->getDesc().getNumOperands() == 4 &&
+             MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+      return true;
+    }
+    break;
+  }
+  return false;
+}
+
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case TargetOpcode::COPY: {
+    // FPR64 copies will be lowered to ORR.16b
+    unsigned DstReg = MI->getOperand(0).getReg();
+    return (AArch64::FPR64RegClass.contains(DstReg) ||
+            AArch64::FPR128RegClass.contains(DstReg));
+  }
+  case AArch64::ORRv16i8:
+    if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+      assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() &&
+             "invalid ORRv16i8 operands");
+      return true;
+    }
+    break;
+  }
+  return false;
+}
+
+unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                               int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+    if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// Return true if this load/store scales or extends its register offset.
+/// This refers to scaling a dynamic index as opposed to scaled immediates.
+/// MI should be a memory op that allows scaled addressing.
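+/// For example (illustrative), "ldr x0, [x1, x2, lsl #3]" and
+/// "ldr x0, [x1, w2, sxtw]" scale or extend their register offset, while
+/// "ldr x0, [x1, x2]" does not.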
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRBBroW: + case AArch64::LDRBroW: + case AArch64::LDRDroW: + case AArch64::LDRHHroW: + case AArch64::LDRHroW: + case AArch64::LDRQroW: + case AArch64::LDRSBWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + case AArch64::LDRSroW: + case AArch64::LDRWroW: + case AArch64::LDRXroW: + case AArch64::STRBBroW: + case AArch64::STRBroW: + case AArch64::STRDroW: + case AArch64::STRHHroW: + case AArch64::STRHroW: + case AArch64::STRQroW: + case AArch64::STRSroW: + case AArch64::STRWroW: + case AArch64::STRXroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroX: + case AArch64::LDRDroX: + case AArch64::LDRHHroX: + case AArch64::LDRHroX: + case AArch64::LDRQroX: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSroX: + case AArch64::LDRWroX: + case AArch64::LDRXroX: + case AArch64::STRBBroX: + case AArch64::STRBroX: + case AArch64::STRDroX: + case AArch64::STRHHroX: + case AArch64::STRHroX: + case AArch64::STRQroX: + case AArch64::STRSroX: + case AArch64::STRWroX: + case AArch64::STRXroX: + + unsigned Val = MI->getOperand(3).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); + return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); + } + return false; +} + +/// Check all MachineMemOperands for a hint to suppress pairing. +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + for (auto *MM : MI->memoperands()) { + if (MM->getFlags() & + (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { + return true; + } + } + return false; +} + +/// Set a flag on the first MachineMemOperand to suppress pairing. +void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { + if (MI->memoperands_empty()) + return; + + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + (*MI->memoperands_begin()) + ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +} + +bool +AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt->getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + BaseReg = LdSt->getOperand(1).getReg(); + MachineFunction &MF = *LdSt->getParent()->getParent(); + unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); + Offset = LdSt->getOperand(2).getImm() * Width; + return true; + }; +} + +bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( + MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, + const TargetRegisterInfo *TRI) const { + // Handle only loads/stores with base register followed by immediate offset. + if (LdSt->getNumOperands() != 3) + return false; + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + + // Offset is calculated as the immediate operand multiplied by the scaling factor. 
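+  // For example, LDRXui with an immediate of 3 addresses [base, #24], since
+  // its scaling factor is 8 (illustrative).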
+ // Unscaled instructions have scaling factor set to 1. + int Scale = 0; + switch (LdSt->getOpcode()) { + default: + return false; + case AArch64::LDURQi: + case AArch64::STURQi: + Width = 16; + Scale = 1; + break; + case AArch64::LDURXi: + case AArch64::LDURDi: + case AArch64::STURXi: + case AArch64::STURDi: + Width = 8; + Scale = 1; + break; + case AArch64::LDURWi: + case AArch64::LDURSi: + case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::STURSi: + Width = 4; + Scale = 1; + break; + case AArch64::LDURHi: + case AArch64::LDURHHi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + case AArch64::STURHi: + case AArch64::STURHHi: + Width = 2; + Scale = 1; + break; + case AArch64::LDURBi: + case AArch64::LDURBBi: + case AArch64::LDURSBXi: + case AArch64::LDURSBWi: + case AArch64::STURBi: + case AArch64::STURBBi: + Width = 1; + Scale = 1; + break; + case AArch64::LDRQui: + case AArch64::STRQui: + Scale = Width = 16; + break; + case AArch64::LDRXui: + case AArch64::LDRDui: + case AArch64::STRXui: + case AArch64::STRDui: + Scale = Width = 8; + break; + case AArch64::LDRWui: + case AArch64::LDRSui: + case AArch64::STRWui: + case AArch64::STRSui: + Scale = Width = 4; + break; + case AArch64::LDRHui: + case AArch64::LDRHHui: + case AArch64::STRHui: + case AArch64::STRHHui: + Scale = Width = 2; + break; + case AArch64::LDRBui: + case AArch64::LDRBBui: + case AArch64::STRBui: + case AArch64::STRBBui: + Scale = Width = 1; + break; + }; + + BaseReg = LdSt->getOperand(1).getReg(); + Offset = LdSt->getOperand(2).getImm() * Scale; + return true; +} + +/// Detect opportunities for ldp/stp formation. +/// +/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. +bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + // Only cluster up to a single pair. + if (NumLoads > 1) + return false; + if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + return false; + // getMemOpBaseRegImmOfs guarantees that oper 2 isImm. + unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); + // Allow 6 bits of positive range. + if (Ofs1 > 64) + return false; + // The caller should already have ordered First/SecondLdSt by offset. + unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); + return Ofs1 + 1 == Ofs2; +} + +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const { + if (Subtarget.isCyclone()) { + // Cyclone can fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second->getOpcode(); + if (SecondOpcode == AArch64::Bcc) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::ANDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + case AArch64::ANDSXri: + return true; + } + } + // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. 
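+    // (e.g. "add w8, w8, #1" followed by "cbnz w8, <bb>"; illustrative.)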
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || + SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + case AArch64::EORWri: + case AArch64::EORXri: + case AArch64::ORRWri: + case AArch64::ORRXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return true; + } + } + } + return false; +} + +MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( + MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, + const MDNode *Expr, DebugLoc DL) const { + MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) + .addFrameIndex(FrameIx) + .addImm(0) + .addImm(Offset) + .addMetadata(Var) + .addMetadata(Expr); + return &*MIB; +} + +static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, + unsigned Reg, unsigned SubIdx, + unsigned State, + const TargetRegisterInfo *TRI) { + if (!SubIdx) + return MIB.addReg(Reg, State); + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(Reg, State, SubIdx); +} + +static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, + unsigned NumRegs) { + // We really want the positive remainder mod 32 here, that happens to be + // easily obtainable with a mask. + return ((DestReg - SrcReg) & 0x1f) < NumRegs; +} + +void AArch64InstrInfo::copyPhysRegTuple( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, + llvm::ArrayRef<unsigned> Indices) const { + assert(Subtarget.hasNEON() && + "Unexpected register copy without NEON"); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + uint16_t DestEncoding = TRI->getEncodingValue(DestReg); + uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); + unsigned NumRegs = Indices.size(); + + int SubReg = 0, End = NumRegs, Incr = 1; + if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { + SubReg = NumRegs - 1; + End = -1; + Incr = -1; + } + + for (; SubReg != End; SubReg += Incr) { + const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); + AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); + } +} + +void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (AArch64::GPR32spRegClass.contains(DestReg) && + (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { + // If either operand is WSP, expand to ADD #0. + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. + unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. 
+ BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) + .addReg(SrcRegX, RegState::Undef) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } + } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. + unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) + .addReg(AArch64::XZR) + .addReg(SrcRegX, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + // Otherwise, expand to ORR WZR. + BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) + .addReg(AArch64::WZR) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + } + return; + } + + if (AArch64::GPR64spRegClass.contains(DestReg) && + (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { + if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { + // If either operand is SP, expand to ADD #0. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // Otherwise, expand to ORR XZR. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) + .addReg(AArch64::XZR) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + // Copy a DDDD register quad by copying the individual sub-registers. + if (AArch64::DDDDRegClass.contains(DestReg) && + AArch64::DDDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a DDD register triple by copying the individual sub-registers. + if (AArch64::DDDRegClass.contains(DestReg) && + AArch64::DDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a DD register pair by copying the individual sub-registers. + if (AArch64::DDRegClass.contains(DestReg) && + AArch64::DDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a QQQQ register quad by copying the individual sub-registers. 
+ if (AArch64::QQQQRegClass.contains(DestReg) && + AArch64::QQQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQQ register triple by copying the individual sub-registers. + if (AArch64::QQQRegClass.contains(DestReg) && + AArch64::QQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQ register pair by copying the individual sub-registers. + if (AArch64::QQRegClass.contains(DestReg) && + AArch64::QQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + if (AArch64::FPR128RegClass.contains(DestReg) && + AArch64::FPR128RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::STRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::SP) + .addImm(-16); + BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(DestReg, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); + } + return; + } + + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR16RegClass.contains(DestReg) && + AArch64::FPR16RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR8RegClass.contains(DestReg) && + AArch64::FPR8RegClass.contains(SrcReg)) { + if(Subtarget.hasNEON()) { + DestReg = 
RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + // Copies between GPR64 and FPR64. + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::GPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + // Copies between GPR32 and FPR32. + if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::GPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + if (DestReg == AArch64::NZCV) { + assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); + BuildMI(MBB, I, DL, get(AArch64::MSR)) + .addImm(AArch64SysReg::NZCV) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); + return; + } + + if (SrcReg == AArch64::NZCV) { + assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); + BuildMI(MBB, I, DL, get(AArch64::MRS)) + .addReg(DestReg) + .addImm(AArch64SysReg::NZCV) + .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); + return; + } + + llvm_unreachable("unimplemented reg-to-reg copy"); +} + +void AArch64InstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRWui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + else + assert(SrcReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRXui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, 
&AArch64::GPR64RegClass); + else + assert(SrcReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI); + + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void AArch64InstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); + + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRWui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); + else + assert(DestReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRXui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); + else + assert(DestReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register 
load without NEON"); + Opc = AArch64::LD1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(DestReg, getDefRegState(true)) + .addFrameIndex(FI); + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void llvm::emitFrameOffset(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, int Offset, + const TargetInstrInfo *TII, + MachineInstr::MIFlag Flag, bool SetNZCV) { + if (DestReg == SrcReg && Offset == 0) + return; + + bool isSub = Offset < 0; + if (isSub) + Offset = -Offset; + + // FIXME: If the offset won't fit in 24-bits, compute the offset into a + // scratch register. If DestReg is a virtual register, use it as the + // scratch register; otherwise, create a new virtual register (to be + // replaced by the scavenger at the end of PEI). That case can be optimized + // slightly if DestReg is SP which is always 16-byte aligned, so the scratch + // register can be loaded with offset%8 and the add/sub can use an extending + // instruction with LSL#3. + // Currently the function handles any offsets but generates a poor sequence + // of code. + // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); + + unsigned Opc; + if (SetNZCV) + Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri; + else + Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri; + const unsigned MaxEncoding = 0xfff; + const unsigned ShiftSize = 12; + const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; + while (((unsigned)Offset) >= (1 << ShiftSize)) { + unsigned ThisVal; + if (((unsigned)Offset) > MaxEncodableValue) { + ThisVal = MaxEncodableValue; + } else { + ThisVal = Offset & MaxEncodableValue; + } + assert((ThisVal >> ShiftSize) <= MaxEncoding && + "Encoding cannot handle value that big"); + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(ThisVal >> ShiftSize) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) + .setMIFlag(Flag); + + SrcReg = DestReg; + Offset -= ThisVal; + if (Offset == 0) + return; + } + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(Offset) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlag(Flag); +} + +MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const { + // This is a bit of a hack. Consider this instruction: + // + // %vreg0<def> = COPY %SP; GPR64all:%vreg0 + // + // We explicitly chose GPR64all for the virtual register so such a copy might + // be eliminated by RegisterCoalescer. However, that may not be possible, and + // %vreg0 may even spill. 
We can't spill %SP, and since it is in the GPR64all + // register class, TargetInstrInfo::foldMemoryOperand() is going to try. + // + // To prevent that, we are going to constrain the %vreg0 register class here. + // + // <rdar://problem/11522048> + // + if (MI->isCopy()) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (SrcReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(DstReg)) { + MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); + return nullptr; + } + if (DstReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(SrcReg)) { + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); + return nullptr; + } + } + + // Cannot fold. + return nullptr; +} + +int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp, + unsigned *OutUnscaledOp, + int *EmittableOffset) { + int Scale = 1; + bool IsSigned = false; + // The ImmIdx should be changed case by case if it is not 2. + unsigned ImmIdx = 2; + unsigned UnscaledOp = 0; + // Set output values in case of early exit. + if (EmittableOffset) + *EmittableOffset = 0; + if (OutUseUnscaledOp) + *OutUseUnscaledOp = false; + if (OutUnscaledOp) + *OutUnscaledOp = 0; + switch (MI.getOpcode()) { + default: + llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex"); + // Vector spills/fills can't take an immediate offset. + case AArch64::LD1Twov2d: + case AArch64::LD1Threev2d: + case AArch64::LD1Fourv2d: + case AArch64::LD1Twov1d: + case AArch64::LD1Threev1d: + case AArch64::LD1Fourv1d: + case AArch64::ST1Twov2d: + case AArch64::ST1Threev2d: + case AArch64::ST1Fourv2d: + case AArch64::ST1Twov1d: + case AArch64::ST1Threev1d: + case AArch64::ST1Fourv1d: + return AArch64FrameOffsetCannotUpdate; + case AArch64::PRFMui: + Scale = 8; + UnscaledOp = AArch64::PRFUMi; + break; + case AArch64::LDRXui: + Scale = 8; + UnscaledOp = AArch64::LDURXi; + break; + case AArch64::LDRWui: + Scale = 4; + UnscaledOp = AArch64::LDURWi; + break; + case AArch64::LDRBui: + Scale = 1; + UnscaledOp = AArch64::LDURBi; + break; + case AArch64::LDRHui: + Scale = 2; + UnscaledOp = AArch64::LDURHi; + break; + case AArch64::LDRSui: + Scale = 4; + UnscaledOp = AArch64::LDURSi; + break; + case AArch64::LDRDui: + Scale = 8; + UnscaledOp = AArch64::LDURDi; + break; + case AArch64::LDRQui: + Scale = 16; + UnscaledOp = AArch64::LDURQi; + break; + case AArch64::LDRBBui: + Scale = 1; + UnscaledOp = AArch64::LDURBBi; + break; + case AArch64::LDRHHui: + Scale = 2; + UnscaledOp = AArch64::LDURHHi; + break; + case AArch64::LDRSBXui: + Scale = 1; + UnscaledOp = AArch64::LDURSBXi; + break; + case AArch64::LDRSBWui: + Scale = 1; + UnscaledOp = AArch64::LDURSBWi; + break; + case AArch64::LDRSHXui: + Scale = 2; + UnscaledOp = AArch64::LDURSHXi; + break; + case AArch64::LDRSHWui: + Scale = 2; + UnscaledOp = AArch64::LDURSHWi; + break; + case AArch64::LDRSWui: + Scale = 4; + UnscaledOp = AArch64::LDURSWi; + break; + + case AArch64::STRXui: + Scale = 8; + UnscaledOp = AArch64::STURXi; + break; + case AArch64::STRWui: + Scale = 4; + UnscaledOp = AArch64::STURWi; + break; + case AArch64::STRBui: + Scale = 1; + UnscaledOp = AArch64::STURBi; + break; + case AArch64::STRHui: + Scale = 2; + UnscaledOp = AArch64::STURHi; + break; + case AArch64::STRSui: + Scale = 4; + UnscaledOp = AArch64::STURSi; + break; + case AArch64::STRDui: + Scale = 8; + UnscaledOp = AArch64::STURDi; + break; + case AArch64::STRQui: + Scale = 16; + UnscaledOp = AArch64::STURQi; + break; + case 
AArch64::STRBBui: + Scale = 1; + UnscaledOp = AArch64::STURBBi; + break; + case AArch64::STRHHui: + Scale = 2; + UnscaledOp = AArch64::STURHHi; + break; + + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::STPXi: + case AArch64::STPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + ImmIdx = 3; + IsSigned = true; + Scale = 8; + break; + case AArch64::LDPQi: + case AArch64::STPQi: + case AArch64::LDNPQi: + case AArch64::STNPQi: + ImmIdx = 3; + IsSigned = true; + Scale = 16; + break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::STPWi: + case AArch64::STPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + ImmIdx = 3; + IsSigned = true; + Scale = 4; + break; + + case AArch64::LDURXi: + case AArch64::LDURWi: + case AArch64::LDURBi: + case AArch64::LDURHi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBXi: + case AArch64::LDURSBWi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::STURBi: + case AArch64::STURHi: + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + Scale = 1; + break; + } + + Offset += MI.getOperand(ImmIdx).getImm() * Scale; + + bool useUnscaledOp = false; + // If the offset doesn't match the scale, we rewrite the instruction to + // use the unscaled instruction instead. Likewise, if we have a negative + // offset (and have an unscaled op to use). + if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) + useUnscaledOp = true; + + // Use an unscaled addressing mode if the instruction has a negative offset + // (or if the instruction is already using an unscaled addressing mode). + unsigned MaskBits; + if (IsSigned) { + // ldp/stp instructions. + MaskBits = 7; + Offset /= Scale; + } else if (UnscaledOp == 0 || useUnscaledOp) { + MaskBits = 9; + IsSigned = true; + Scale = 1; + } else { + MaskBits = 12; + IsSigned = false; + Offset /= Scale; + } + + // Attempt to fold address computation. + int MaxOff = (1 << (MaskBits - IsSigned)) - 1; + int MinOff = (IsSigned ? (-MaxOff - 1) : 0); + if (Offset >= MinOff && Offset <= MaxOff) { + if (EmittableOffset) + *EmittableOffset = Offset; + Offset = 0; + } else { + int NewOff = Offset < 0 ? MinOff : MaxOff; + if (EmittableOffset) + *EmittableOffset = NewOff; + Offset = (Offset - NewOff) * Scale; + } + if (OutUseUnscaledOp) + *OutUseUnscaledOp = useUnscaledOp; + if (OutUnscaledOp) + *OutUnscaledOp = UnscaledOp; + return AArch64FrameOffsetCanUpdate | + (Offset == 0 ? 
AArch64FrameOffsetIsLegal : 0); +} + +bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII) { + unsigned Opcode = MI.getOpcode(); + unsigned ImmIdx = FrameRegIdx + 1; + + if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { + Offset += MI.getOperand(ImmIdx).getImm(); + emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), + MI.getOperand(0).getReg(), FrameReg, Offset, TII, + MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); + MI.eraseFromParent(); + Offset = 0; + return true; + } + + int NewOffset; + unsigned UnscaledOp; + bool UseUnscaledOp; + int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, + &UnscaledOp, &NewOffset); + if (Status & AArch64FrameOffsetCanUpdate) { + if (Status & AArch64FrameOffsetIsLegal) + // Replace the FrameIndex with FrameReg. + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (UseUnscaledOp) + MI.setDesc(TII->get(UnscaledOp)); + + MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); + return Offset == 0; + } + + return false; +} + +void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(AArch64::HINT); + NopInst.addOperand(MCOperand::createImm(0)); +} +/// useMachineCombiner - return true when a target supports MachineCombiner +bool AArch64InstrInfo::useMachineCombiner() const { + // AArch64 supports the combiner + return true; +} +// +// True when Opc sets flag +static bool isCombineInstrSettingFlag(unsigned Opc) { + switch (Opc) { + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. + case AArch64::SUBSWri: + case AArch64::SUBSXri: + return true; + default: + break; + } + return false; +} +// +// 32b Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate32(unsigned Opc) { + switch (Opc) { + case AArch64::ADDWrr: + case AArch64::ADDWri: + case AArch64::SUBWrr: + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::SUBSWrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. + case AArch64::SUBWri: + case AArch64::SUBSWri: + return true; + default: + break; + } + return false; +} +// +// 64b Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate64(unsigned Opc) { + switch (Opc) { + case AArch64::ADDXrr: + case AArch64::ADDXri: + case AArch64::SUBXrr: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSXrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. + case AArch64::SUBXri: + case AArch64::SUBSXri: + return true; + default: + break; + } + return false; +} +// +// Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate(unsigned Opc) { + return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); +} + +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineInstr *MI = nullptr; + // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + MI = MRI.getUniqueVRegDef(MO.getReg()); + // And it needs to be in the trace (otherwise, it won't have a depth). 
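// For example, in the 32-bit ADDWrr case handled below this is looking for
//   %i = MADDWrrr %a, %b, %wzr     (a plain MUL, written as MADD with WZR)
//   %r = ADDWrrr  %i, %c           (the Root instruction)
// where %i is defined in the same block and has no user other than %r, so the
// two can later be fused into a single MADD.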
+ if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) + return false; + + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); + + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + + // Must only used by the user we combine with. + if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) + return false; + + return true; +} + +/// Return true when there is potentially a faster code sequence +/// for an instruction chain ending in \p Root. All potential patterns are +/// listed +/// in the \p Pattern vector. Pattern should be sorted in priority order since +/// the pattern evaluator stops checking as soon as it finds a faster sequence. + +bool AArch64InstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + unsigned Opc = Root.getOpcode(); + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + + if (!isCombineInstrCandidate(Opc)) + return 0; + if (isCombineInstrSettingFlag(Opc)) { + int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); + // When NZCV is live bail out. + if (Cmp_NZCV == -1) + return 0; + unsigned NewOpc = convertFlagSettingOpcode(&Root); + // When opcode can't change bail out. + // CHECKME: do we miss any cases for opcode conversion? + if (NewOpc == Opc) + return 0; + Opc = NewOpc; + } + + switch (Opc) { + default: + break; + case AArch64::ADDWrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "ADDWrr does not have register operands"); + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Patterns.push_back(MachineCombinerPattern::MULADDW_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, + AArch64::WZR)) { + Patterns.push_back(MachineCombinerPattern::MULADDW_OP2); + Found = true; + } + break; + case AArch64::ADDXrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Patterns.push_back(MachineCombinerPattern::MULADDX_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, + AArch64::XZR)) { + Patterns.push_back(MachineCombinerPattern::MULADDX_OP2); + Found = true; + } + break; + case AArch64::SUBWrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, + AArch64::WZR)) { + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2); + Found = true; + } + break; + case AArch64::SUBXrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, + AArch64::XZR)) { + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2); + Found = true; + } + break; + case AArch64::ADDWri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1); + Found = true; + } + break; + case AArch64::ADDXri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1); + Found = true; + } + break; + case 
AArch64::SUBWri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1); + Found = true; + } + break; + case AArch64::SUBXri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1); + Found = true; + } + break; + } + return Found; +} + +/// genMadd - Generate madd instruction and combine mul and add. +/// Example: +/// MUL I=A,B,0 +/// ADD R,I,C +/// ==> MADD R,A,B,C +/// \param Root is the ADD instruction +/// \param [out] InsInstrs is a vector of machine instructions and will +/// contain the generated madd instruction +/// \param IdxMulOpd is index of operand in Root that is the result of +/// the MUL. In the example above IdxMulOpd is 1. +/// \param MaddOpc the opcode fo the madd instruction +static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, + const TargetRegisterClass *RC) { + assert(IdxMulOpd == 1 || IdxMulOpd == 2); + + unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; + MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); + unsigned ResultReg = Root.getOperand(0).getReg(); + unsigned SrcReg0 = MUL->getOperand(1).getReg(); + bool Src0IsKill = MUL->getOperand(1).isKill(); + unsigned SrcReg1 = MUL->getOperand(2).getReg(); + bool Src1IsKill = MUL->getOperand(2).isKill(); + unsigned SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); + bool Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); + + if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + MRI.constrainRegClass(ResultReg, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + MRI.constrainRegClass(SrcReg0, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + MRI.constrainRegClass(SrcReg1, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) + MRI.constrainRegClass(SrcReg2, RC); + + MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), + ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + // Insert the MADD + InsInstrs.push_back(MIB); + return MUL; +} + +/// genMaddR - Generate madd instruction and combine mul and add using +/// an extra virtual register +/// Example - an ADD intermediate needs to be stored in a register: +/// MUL I=A,B,0 +/// ADD R,I,Imm +/// ==> ORR V, ZR, Imm +/// ==> MADD R,A,B,V +/// \param Root is the ADD instruction +/// \param [out] InsInstrs is a vector of machine instructions and will +/// contain the generated madd instruction +/// \param IdxMulOpd is index of operand in Root that is the result of +/// the MUL. In the example above IdxMulOpd is 1. +/// \param MaddOpc the opcode fo the madd instruction +/// \param VR is a virtual register that holds the value of an ADD operand +/// (V in the example above). 
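// Note: this variant only pays off when the add/sub immediate can be
// materialized with a single ORR against the zero register, i.e. when it is a
// valid AArch64 logical immediate (0xff is, 0x1234 is not); the expansion
// below checks that with processLogicalImmediate() before building the
// ORR/MADD pair.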
+static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, + unsigned VR, const TargetRegisterClass *RC) { + assert(IdxMulOpd == 1 || IdxMulOpd == 2); + + MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); + unsigned ResultReg = Root.getOperand(0).getReg(); + unsigned SrcReg0 = MUL->getOperand(1).getReg(); + bool Src0IsKill = MUL->getOperand(1).isKill(); + unsigned SrcReg1 = MUL->getOperand(2).getReg(); + bool Src1IsKill = MUL->getOperand(2).isKill(); + + if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + MRI.constrainRegClass(ResultReg, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + MRI.constrainRegClass(SrcReg0, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + MRI.constrainRegClass(SrcReg1, RC); + if (TargetRegisterInfo::isVirtualRegister(VR)) + MRI.constrainRegClass(VR, RC); + + MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), + ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(VR); + // Insert the MADD + InsInstrs.push_back(MIB); + return MUL; +} + +/// When getMachineCombinerPatterns() finds potential patterns, +/// this function generates the instructions that could replace the +/// original code sequence +void AArch64InstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { + MachineBasicBlock &MBB = *Root.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + MachineInstr *MUL; + const TargetRegisterClass *RC; + unsigned Opc; + switch (Pattern) { + default: + // signal error. 
+ break; + case MachineCombinerPattern::MULADDW_OP1: + case MachineCombinerPattern::MULADDX_OP1: + // MUL I=A,B,0 + // ADD R,I,C + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::MULADDW_OP1) { + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDW_OP2: + case MachineCombinerPattern::MULADDX_OP2: + // MUL I=A,B,0 + // ADD R,C,I + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::MULADDW_OP2) { + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDWI_OP1: + case MachineCombinerPattern::MULADDXI_OP1: { + // MUL I=A,B,0 + // ADD R,I,Imm + // ==> ORR V, ZR, Imm + // ==> MADD R,A,B,V + // --- Create(MADD); + const TargetRegisterClass *OrrRC; + unsigned BitSize, OrrOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { + OrrOpc = AArch64::ORRWri; + OrrRC = &AArch64::GPR32spRegClass; + BitSize = 32; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + OrrOpc = AArch64::ORRXri; + OrrRC = &AArch64::GPR64spRegClass; + BitSize = 64; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + unsigned NewVR = MRI.createVirtualRegister(OrrRC); + uint64_t Imm = Root.getOperand(2).getImm(); + + if (Root.getOperand(3).isImm()) { + unsigned Val = Root.getOperand(3).getImm(); + Imm = Imm << Val; + } + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) + .addReg(ZeroReg) + .addImm(Encoding); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); + } + break; + } + case MachineCombinerPattern::MULSUBW_OP1: + case MachineCombinerPattern::MULSUBX_OP1: { + // MUL I=A,B,0 + // SUB R,I, C + // ==> SUB V, 0, C + // ==> MADD R,A,B,V // = -C + A*B + // --- Create(MADD); + const TargetRegisterClass *SubRC; + unsigned SubOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { + SubOpc = AArch64::SUBWrr; + SubRC = &AArch64::GPR32spRegClass; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + SubOpc = AArch64::SUBXrr; + SubRC = &AArch64::GPR64spRegClass; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + unsigned NewVR = MRI.createVirtualRegister(SubRC); + // SUB NewVR, 0, C + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) + .addReg(ZeroReg) + .addOperand(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); + break; + } + case MachineCombinerPattern::MULSUBW_OP2: + case MachineCombinerPattern::MULSUBX_OP2: + // MUL I=A,B,0 + // SUB R,C,I + // ==> MSUB R,A,B,C (computes C - A*B) + // --- Create(MSUB); + if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { + Opc = AArch64::MSUBWrrr; + RC = &AArch64::GPR32RegClass; + } else { + Opc = AArch64::MSUBXrrr; + RC = 
&AArch64::GPR64RegClass; + } + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBWI_OP1: + case MachineCombinerPattern::MULSUBXI_OP1: { + // MUL I=A,B,0 + // SUB R,I, Imm + // ==> ORR V, ZR, -Imm + // ==> MADD R,A,B,V // = -Imm + A*B + // --- Create(MADD); + const TargetRegisterClass *OrrRC; + unsigned BitSize, OrrOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { + OrrOpc = AArch64::ORRWri; + OrrRC = &AArch64::GPR32spRegClass; + BitSize = 32; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + OrrOpc = AArch64::ORRXri; + OrrRC = &AArch64::GPR64spRegClass; + BitSize = 64; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + unsigned NewVR = MRI.createVirtualRegister(OrrRC); + int Imm = Root.getOperand(2).getImm(); + if (Root.getOperand(3).isImm()) { + unsigned Val = Root.getOperand(3).getImm(); + Imm = Imm << Val; + } + uint64_t UImm = -Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) + .addReg(ZeroReg) + .addImm(Encoding); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); + } + break; + } + } // end switch (Pattern) + // Record MUL and ADD/SUB for deletion + DelInstrs.push_back(MUL); + DelInstrs.push_back(&Root); + + return; +} + +/// \brief Replace csincr-branch sequence by simple conditional branch +/// +/// Examples: +/// 1. +/// csinc w9, wzr, wzr, <condition code> +/// tbnz w9, #0, 0x44 +/// to +/// b.<inverted condition code> +/// +/// 2. +/// csinc w9, wzr, wzr, <condition code> +/// tbz w9, #0, 0x44 +/// to +/// b.<condition code> +/// +/// \param MI Conditional Branch +/// \return True when the simple conditional branch is generated +/// +bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { + bool IsNegativeBranch = false; + bool IsTestAndBranch = false; + unsigned TargetBBInMI = 0; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown branch instruction?"); + case AArch64::Bcc: + return false; + case AArch64::CBZW: + case AArch64::CBZX: + TargetBBInMI = 1; + break; + case AArch64::CBNZW: + case AArch64::CBNZX: + TargetBBInMI = 1; + IsNegativeBranch = true; + break; + case AArch64::TBZW: + case AArch64::TBZX: + TargetBBInMI = 2; + IsTestAndBranch = true; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + TargetBBInMI = 2; + IsNegativeBranch = true; + IsTestAndBranch = true; + break; + } + // So we increment a zero register and test for bits other + // than bit 0? Conservatively bail out in case the verifier + // missed this case. + if (IsTestAndBranch && MI->getOperand(1).getImm()) + return false; + + // Find Definition. 
+ assert(MI->getParent() && "Incomplete machine instruciton\n"); + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + unsigned VReg = MI->getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + return false; + + MachineInstr *DefMI = MRI->getVRegDef(VReg); + + // Look for CSINC + if (!(DefMI->getOpcode() == AArch64::CSINCWr && + DefMI->getOperand(1).getReg() == AArch64::WZR && + DefMI->getOperand(2).getReg() == AArch64::WZR) && + !(DefMI->getOpcode() == AArch64::CSINCXr && + DefMI->getOperand(1).getReg() == AArch64::XZR && + DefMI->getOperand(2).getReg() == AArch64::XZR)) + return false; + + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + return false; + + AArch64CC::CondCode CC = + (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); + bool CheckOnlyCCWrites = true; + // Convert only when the condition code is not modified between + // the CSINC and the branch. The CC may be used by other + // instructions in between. + if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo())) + return false; + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB(); + DebugLoc DL = MI->getDebugLoc(); + if (IsNegativeBranch) + CC = AArch64CC::getInvertedCondCode(CC); + BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); + MI->eraseFromParent(); + return true; +} + +std::pair<unsigned, unsigned> +AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = AArch64II::MO_FRAGMENT; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PAGE, "aarch64-page"}, + {MO_PAGEOFF, "aarch64-pageoff"}, + {MO_G3, "aarch64-g3"}, + {MO_G2, "aarch64-g2"}, + {MO_G1, "aarch64-g1"}, + {MO_G0, "aarch64-g0"}, + {MO_HI12, "aarch64-hi12"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT, "aarch64-got"}, + {MO_NC, "aarch64-nc"}, + {MO_TLS, "aarch64-tls"}, + {MO_CONSTPOOL, "aarch64-constant-pool"}}; + return makeArrayRef(TargetFlags); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h new file mode 100644 index 0000000..ae02822 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -0,0 +1,264 @@ +//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "llvm/CodeGen/MachineCombinerPattern.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "AArch64GenInstrInfo.inc" + +namespace llvm { + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64InstrInfo : public AArch64GenInstrInfo { + // Reserve bits in the MachineMemOperand target hint flags, starting at 1. + // They will be shifted into MOTargetHintStart when accessed. + enum TargetMemOperandFlags { + MOSuppressPair = 1 + }; + + const AArch64RegisterInfo RI; + const AArch64Subtarget &Subtarget; + +public: + explicit AArch64InstrInfo(const AArch64Subtarget &STI); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + const AArch64RegisterInfo &getRegisterInfo() const { return RI; } + + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + + bool isAsCheapAsAMove(const MachineInstr *MI) const override; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + bool + areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + /// Returns true if there is a shiftable register and that the shift value + /// is non-zero. + bool hasShiftedReg(const MachineInstr *MI) const; + + /// Returns true if there is an extendable register and that the extending + /// value is non-zero. + bool hasExtendedReg(const MachineInstr *MI) const; + + /// \brief Does this instruction set its full destination register to zero? + bool isGPRZero(const MachineInstr *MI) const; + + /// \brief Does this instruction rename a GPR without modifying bits? + bool isGPRCopy(const MachineInstr *MI) const; + + /// \brief Does this instruction rename an FPR without modifying bits? + bool isFPRCopy(const MachineInstr *MI) const; + + /// Return true if this is load/store scales or extends its register offset. + /// This refers to scaling a dynamic index as opposed to scaled immediates. + /// MI should be a memory op that allows scaled addressing. + bool isScaledAddr(const MachineInstr *MI) const; + + /// Return true if pairing the given load or store is hinted to be + /// unprofitable. + bool isLdStPairSuppressed(const MachineInstr *MI) const; + + /// Hint that pairing the given load or store is unprofitable. 
+ void suppressLdStPair(MachineInstr *MI) const; + + bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; + + bool getMemOpBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg, + int &Offset, int &Width, + const TargetRegisterInfo *TRI) const; + + bool enableClusterLoads() const override { return true; } + + bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, + unsigned NumLoads) const override; + + bool shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const override; + + MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, + uint64_t Offset, const MDNode *Var, + const MDNode *Expr, DebugLoc DL) const; + void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc, unsigned Opcode, + llvm::ArrayRef<unsigned> Indices) const; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FrameIndex, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + using TargetInstrInfo::foldMemoryOperandImpl; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, + unsigned, unsigned, int &, int &, int &) const override; + void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const override; + void getNoopForMachoTarget(MCInst &NopInst) const override; + + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. + /// Return true if the comparison instruction can be analyzed. + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const override; + /// optimizeCompareInstr - Convert the instruction supplying the argument to + /// the comparison into one that sets the zero bit in the flags register. + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const override; + bool optimizeCondBranch(MachineInstr *MI) const override; + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in <Root>. 
All potential patterns are + /// listed in the <Patterns> array. + bool getMachineCombinerPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) + const override; + + /// When getMachineCombinerPatterns() finds patterns, this function generates + /// the instructions that could replace the original code sequence + void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; + /// useMachineCombiner - AArch64 supports MachineCombiner + bool useMachineCombiner() const override; + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; + +private: + void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, + MachineBasicBlock *TBB, + ArrayRef<MachineOperand> Cond) const; +}; + +/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg +/// plus Offset. This is intended to be used from within the prolog/epilog +/// insertion (PEI) pass, where a virtual scratch register may be allocated +/// if necessary, to be replaced by the scavenger at the end of PEI. +void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, + const TargetInstrInfo *TII, + MachineInstr::MIFlag = MachineInstr::NoFlags, + bool SetNZCV = false); + +/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the +/// FP. Return false if the offset could not be handled directly in MI, and +/// return the left-over portion by reference. +bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII); + +/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal. +enum AArch64FrameOffsetStatus { + AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. + AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal. + AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. +}; + +/// \brief Check if the @p Offset is a valid frame offset for @p MI. +/// The returned value reports the validity of the frame offset for @p MI. +/// It uses the values defined by AArch64FrameOffsetStatus for that. +/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to +/// use an offset.eq +/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be +/// rewriten in @p MI. +/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the +/// amount that is off the limit of the legal offset. +/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be +/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp. +/// If set, @p EmittableOffset contains the amount that can be set in @p MI +/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that +/// is a legal offset. 
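As a rough standalone illustration of that legality check for one opcode pair, the 8-byte scaled LDRXui/STRXui versus their unscaled LDURXi/STURXi counterparts (the helper names here are invented for the sketch):

#include <cstdio>

// Scaled form: unsigned 12-bit immediate counted in 8-byte units.
static bool fitsScaled8(int Off) { return Off >= 0 && Off % 8 == 0 && Off / 8 <= 4095; }
// Unscaled form: signed 9-bit byte offset.
static bool fitsUnscaled(int Off) { return Off >= -256 && Off <= 255; }

int main() {
  // 40    -> legal as scaled #5; 44 -> not 8-byte aligned, but the unscaled op fits;
  // 40000 -> too big for either form, so only the emittable part can be folded and
  //          the remainder must first be added to the base register.
  const int Tests[] = {40, 44, 40000};
  for (int Off : Tests)
    std::printf("%-6d scaled:%d unscaled:%d\n", Off, fitsScaled8(Off), fitsUnscaled(Off));
}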
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp = nullptr, + unsigned *OutUnscaledOp = nullptr, + int *EmittableOffset = nullptr); + +static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } + +static inline bool isCondBranchOpcode(int Opc) { + switch (Opc) { + case AArch64::Bcc: + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + return true; + default: + return false; + } +} + +static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; } + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td new file mode 100644 index 0000000..d02bc9f --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -0,0 +1,6004 @@ +//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Instruction definitions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. +// +def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, + AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; +def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, + AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; +def HasNEON : Predicate<"Subtarget->hasNEON()">, + AssemblerPredicate<"FeatureNEON", "neon">; +def HasCrypto : Predicate<"Subtarget->hasCrypto()">, + AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasCRC : Predicate<"Subtarget->hasCRC()">, + AssemblerPredicate<"FeatureCRC", "crc">; +def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16", "fullfp16">; +def HasSPE : Predicate<"Subtarget->hasSPE()">, + AssemblerPredicate<"FeatureSPE", "spe">; + +def IsLE : Predicate<"Subtarget->isLittleEndian()">; +def IsBE : Predicate<"!Subtarget->isLittleEndian()">; +def IsCyclone : Predicate<"Subtarget->isCyclone()">; + +//===----------------------------------------------------------------------===// +// AArch64-specific DAG Nodes. 
+// + +// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS +def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<0>, + SDTCisVT<3, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; + +def SDT_AArch64Brcond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; +def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; +def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, OtherVT>]>; + + +def SDT_AArch64CSel : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<4, i32>]>; +def SDT_AArch64CCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisInt<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; +def SDT_AArch64FCCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisFP<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; +def SDT_AArch64FCmp : SDTypeProfile<0, 2, + [SDTCisFP<0>, + SDTCisSameAs<0, 1>]>; +def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; +def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; +def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; +def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>]>; +def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; + +def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; +def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; +def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; +def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>; +def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; +def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; + +def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; + +def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>]>; + +// Generates the general dynamic sequences, i.e. +// adrp x0, :tlsdesc:var +// ldr x1, [x0, #:tlsdesc_lo12:var] +// add x0, x0, #:tlsdesc_lo12:var +// .tlsdesccall var +// blr x1 + +// (the TPIDR_EL0 offset is put directly in X0, hence no "result" here) +// number of operands (the variable) +def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1, + [SDTCisPtrTy<0>]>; + +def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, + [SDTCisVT<0, i64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, + SDTCisSameAs<1, 4>]>; + + +// Node definitions. 
+def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; +def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; +def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; +def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + [SDNPHasChain, SDNPOutGlue]>; +def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64call : SDNode<"AArch64ISD::CALL", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond, + [SDNPHasChain]>; +def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, + [SDNPHasChain]>; +def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, + [SDNPHasChain]>; + + +def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; +def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>; +def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>; +def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>; +def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >; +def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>; +def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>; +def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; +def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; + +def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>; +def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>; +def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>; + +def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; + +def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; + +def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; +def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; +def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; +def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; +def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; + +def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; +def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; +def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; +def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>; +def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>; +def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>; + +def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>; +def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>; +def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>; +def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>; +def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; +def AArch64movi : SDNode<"AArch64ISD::MOVI", 
SDT_AArch64MOVIedit>; +def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>; + +def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; +def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; +def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; +def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; + +def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; +def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>; +def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>; +def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>; +def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; +def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; +def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; +def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; + +def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; +def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; +def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; + +def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; +def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; +def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>; +def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>; +def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>; + +def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>; +def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>; +def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>; + +def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>; +def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>; +def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; +def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; +def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; +def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), + (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>; + +def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; +def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; +def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>; +def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>; +def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>; + +def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>; +def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>; + +def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>; + +def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH, + [SDNPHasChain, SDNPSideEffect]>; + +def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>; +def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>; + +def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ", + SDT_AArch64TLSDescCallSeq, + [SDNPInGlue, SDNPOutGlue, SDNPHasChain, + SDNPVariadic]>; + + +def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", + SDT_AArch64WrapperLarge>; + +def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>; + +def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; +def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; + +def 
AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; +def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; +def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; +def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>; +def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; +def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// + +// AArch64 Instruction Predicate Definitions. +// +def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; +def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; +def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; +def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; +def ForCodeSize : Predicate<"ForCodeSize">; +def NotForCodeSize : Predicate<"!ForCodeSize">; + +include "AArch64InstrFormats.td" + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous instructions. +//===----------------------------------------------------------------------===// + +let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), + [(AArch64callseq_start timm:$amt)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; +} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 + +let isReMaterializable = 1, isCodeGenOnly = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, they can be +// removed, along with the AArch64Wrapper node. + +let AddedComplexity = 10 in +def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), + [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>, + Sched<[WriteLDAdr]>; + +// The MOVaddr instruction should match only when the add is not folded +// into a load or store address. 
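For context, the AArch64adrp/AArch64addlow pair that the pseudos below wrap corresponds to forming the symbol's 4 KB page address and then adding its low 12 bits; a minimal sketch of that split (the address value is purely illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Sym  = 0x412345;                 // hypothetical symbol address
  uint64_t Page = Sym & ~uint64_t(0xfff);   // what the ADRP half materializes
  uint64_t Lo12 = Sym & 0xfff;              // the :lo12: addend on the ADD half
  std::printf("page 0x%llx + lo12 0x%llx = 0x%llx\n",
              (unsigned long long)Page, (unsigned long long)Lo12,
              (unsigned long long)(Page + Lo12));
}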
+def MOVaddr + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi), + tglobaladdr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrJT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi), + tjumptable:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrCP + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi), + tconstpool:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrBA + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi), + tblockaddress:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrTLS + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi), + tglobaltlsaddr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrEXT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi), + texternalsym:$low))]>, + Sched<[WriteAdrAdr]>; + +} // isReMaterializable, isCodeGenOnly + +def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr), + (LOADgot tglobaltlsaddr:$addr)>; + +def : Pat<(AArch64LOADgot texternalsym:$addr), + (LOADgot texternalsym:$addr)>; + +def : Pat<(AArch64LOADgot tconstpool:$addr), + (LOADgot tconstpool:$addr)>; + +//===----------------------------------------------------------------------===// +// System instructions. +//===----------------------------------------------------------------------===// + +def HINT : HintI<"hint">; +def : InstAlias<"nop", (HINT 0b000)>; +def : InstAlias<"yield",(HINT 0b001)>; +def : InstAlias<"wfe", (HINT 0b010)>; +def : InstAlias<"wfi", (HINT 0b011)>; +def : InstAlias<"sev", (HINT 0b100)>; +def : InstAlias<"sevl", (HINT 0b101)>; + +// v8.2a Statistical Profiling extension +def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; + +// As far as LLVM is concerned this writes to the system's exclusive monitors. +let mayLoad = 1, mayStore = 1 in +def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">; + +// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot +// model patterns with sufficiently fine granularity. +let mayLoad = ?, mayStore = ? in { +def DMB : CRmSystemI<barrier_op, 0b101, "dmb", + [(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>; + +def DSB : CRmSystemI<barrier_op, 0b100, "dsb", + [(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>; + +def ISB : CRmSystemI<barrier_op, 0b110, "isb", + [(int_aarch64_isb (i32 imm32_0_15:$CRm))]>; +} + +def : InstAlias<"clrex", (CLREX 0xf)>; +def : InstAlias<"isb", (ISB 0xf)>; + +def MRS : MRSI; +def MSR : MSRI; +def MSRpstateImm1 : MSRpstateImm0_1; +def MSRpstateImm4 : MSRpstateImm0_15; + +// The thread pointer (on Linux, at least, where this has been implemented) is +// TPIDR_EL0. +def : Pat<(AArch64threadpointer), (MRS 0xde82)>; + +// The cycle counter PMC register is PMCCNTR_EL0. +let Predicates = [HasPerfMon] in +def : Pat<(readcyclecounter), (MRS 0xdce8)>; + +// Generic system instructions +def SYSxt : SystemXtI<0, "sys">; +def SYSLxt : SystemLXtI<1, "sysl">; + +def : InstAlias<"sys $op1, $Cn, $Cm, $op2", + (SYSxt imm0_7:$op1, sys_cr_op:$Cn, + sys_cr_op:$Cm, imm0_7:$op2, XZR)>; + +//===----------------------------------------------------------------------===// +// Move immediate instructions. 
+//===----------------------------------------------------------------------===// + +defm MOVK : InsertImmediate<0b11, "movk">; +defm MOVN : MoveImmediate<0b00, "movn">; + +let PostEncoderMethod = "fixMOVZ" in +defm MOVZ : MoveImmediate<0b10, "movz">; + +// First group of aliases covers an implicit "lsl #0". +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; + +// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; + +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; + +// Final group of aliases covers true "mov $Rd, $imm" cases. 
+multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR, + int width, int shift> { + def _asmoperand : AsmOperandClass { + let Name = basename # width # "_lsl" # shift # "MovAlias"; + let PredicateMethod = "is" # basename # "MovAlias<" # width # ", " + # shift # ">"; + let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; + } + + def _movimm : Operand<i32> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand"); + } + + def : InstAlias<"mov $Rd, $imm", + (INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>; +} + +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>; + +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>; + +let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, + isAsCheapAsAMove = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, we can select +// directly to the real instructions and get rid of these pseudos. + +def MOVi32imm + : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), + [(set GPR32:$dst, imm:$src)]>, + Sched<[WriteImm]>; +def MOVi64imm + : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), + [(set GPR64:$dst, imm:$src)]>, + Sched<[WriteImm]>; +} // isReMaterializable, isCodeGenOnly + +// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the +// eventual expansion code fewer bits to worry about getting right. Marshalling +// the types is a little tricky though: +def i64imm_32bit : ImmLeaf<i64, [{ + return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm); +}]>; + +def trunc_imm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +def : Pat<(i64 i64imm_32bit:$src), + (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; + +// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model). +def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); +}]>; + + +def : Pat<(f32 fpimm:$in), + (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>; +def : Pat<(f64 fpimm:$in), + (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>; + + +// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK +// sequences. 
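The patterns that follow select the classic four-instruction MOVZ/MOVK sequence for large addressing. A small C sketch (not part of the patch) of the bit manipulation each step performs; the 64-bit value is made up for the example:

/* Illustrative sketch, not part of the patch: rebuilding a 64-bit value
 * 16 bits at a time, the way the MOVZ/MOVK patterns below do. */
#include <stdint.h>
#include <assert.h>

static uint64_t movz(uint64_t imm16, unsigned shift) {
    return (imm16 & 0xffffULL) << shift;          /* rest of the register is zeroed */
}
static uint64_t movk(uint64_t reg, uint64_t imm16, unsigned shift) {
    reg &= ~(0xffffULL << shift);                 /* clear the 16-bit field ...     */
    return reg | ((imm16 & 0xffffULL) << shift);  /* ... and insert the new chunk   */
}

int main(void) {
    uint64_t sym = 0x1122334455667788ULL;         /* hypothetical target value      */
    uint64_t x = movz(sym >> 48, 48);             /* MOVZ Xd, sym[63:48], lsl #48   */
    x = movk(x, (sym >> 32) & 0xffff, 32);        /* MOVK Xd, sym[47:32], lsl #32   */
    x = movk(x, (sym >> 16) & 0xffff, 16);        /* MOVK Xd, sym[31:16], lsl #16   */
    x = movk(x, sym & 0xffff, 0);                 /* MOVK Xd, sym[15:0]             */
    assert(x == sym);
    return 0;
}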
+def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, + tglobaladdr:$g1, tglobaladdr:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), + tglobaladdr:$g2, 32), + tglobaladdr:$g1, 16), + tglobaladdr:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, + tblockaddress:$g1, tblockaddress:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), + tblockaddress:$g2, 32), + tblockaddress:$g1, 16), + tblockaddress:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, + tconstpool:$g1, tconstpool:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), + tconstpool:$g2, 32), + tconstpool:$g1, 16), + tconstpool:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, + tjumptable:$g1, tjumptable:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), + tjumptable:$g2, 32), + tjumptable:$g1, 16), + tjumptable:$g0, 0)>; + + +//===----------------------------------------------------------------------===// +// Arithmetic instructions. +//===----------------------------------------------------------------------===// + +// Add/subtract with carry. +defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>; +defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>; + +def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; + +// Add/subtract +defm ADD : AddSub<0, "add", "sub", add>; +defm SUB : AddSub<1, "sub", "add">; + +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; + +defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">; +defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">; + +// Use SUBS instead of SUB to enable CSE between SUBS and SUB. +def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), + (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), + (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, GPR32:$Rm), + (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(sub GPR64:$Rn, GPR64:$Rm), + (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), + (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; +def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), + (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +let AddedComplexity = 1 in { +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>; +} + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. 
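A quick illustration (not part of the patch) of why the rewrite below is safe: in two's-complement arithmetic, adding a negative immediate and subtracting its magnitude give the same result, and only the latter fits the unsigned 12-bit add/sub immediate field.

/* Illustrative sketch, not part of the patch: (add x, -1) == (sub x, 1). */
#include <stdint.h>
#include <assert.h>

int main(void) {
    uint32_t x   = 12345u;
    int32_t  imm = -1;                     /* e.g. (add x, -1) ...          */
    assert(x + (uint32_t)imm == x - 1u);   /* ... behaves as (sub x, 1)     */
    return 0;
}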
+let AddedComplexity = 1 in { +def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. +let AddedComplexity = 1 in { +def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"neg $dst, $src$shift", + (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"neg $dst, $src$shift", + (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + +def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + + +// Unsigned/Signed divide +defm UDIV : Div<0, "udiv", udiv>; +defm SDIV : Div<1, "sdiv", sdiv>; +let isCodeGenOnly = 1 in { +defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; +defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; +} + +// Variable shift +defm ASRV : Shift<0b10, "asr", sra>; +defm LSLV : Shift<0b00, "lsl", shl>; +defm LSRV : Shift<0b01, "lsr", srl>; +defm RORV : Shift<0b11, "ror", rotr>; + +def : ShiftAlias<"asrv", ASRVWr, GPR32>; +def : ShiftAlias<"asrv", ASRVXr, GPR64>; +def : ShiftAlias<"lslv", LSLVWr, GPR32>; +def : ShiftAlias<"lslv", LSLVXr, GPR64>; +def : ShiftAlias<"lsrv", LSRVWr, GPR32>; +def : ShiftAlias<"lsrv", LSRVXr, GPR64>; +def : ShiftAlias<"rorv", RORVWr, GPR32>; +def : ShiftAlias<"rorv", RORVXr, GPR64>; + +// Multiply-add +let AddedComplexity = 7 in { +defm MADD : MulAccum<0, "madd", add>; +defm MSUB : MulAccum<1, "msub", sub>; + +def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), + (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), + (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; + +def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), + (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), + (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; +def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), + (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)), + (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; +} // AddedComplexity = 7 + +let AddedComplexity = 5 in { +def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; +def SMSUBLrrr : WideMulAccum<1, 0b001, 
"smsubl", sub, sext>; +def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; +def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), + (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), + (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), + (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), + (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +} // AddedComplexity = 5 + +def : MulAccumWAlias<"mul", MADDWrrr>; +def : MulAccumXAlias<"mul", MADDXrrr>; +def : MulAccumWAlias<"mneg", MSUBWrrr>; +def : MulAccumXAlias<"mneg", MSUBXrrr>; +def : WideMulAccumAlias<"smull", SMADDLrrr>; +def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; +def : WideMulAccumAlias<"umull", UMADDLrrr>; +def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; + +// Multiply-high +def SMULHrr : MulHi<0b010, "smulh", mulhs>; +def UMULHrr : MulHi<0b110, "umulh", mulhu>; + +// CRC32 +def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">; +def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">; +def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">; +def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">; + +def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">; +def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">; +def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">; +def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">; + +// v8.1 atomic CAS +defm CAS : CompareAndSwap<0, 0, "">; +defm CASA : CompareAndSwap<1, 0, "a">; +defm CASL : CompareAndSwap<0, 1, "l">; +defm CASAL : CompareAndSwap<1, 1, "al">; + +// v8.1 atomic CASP +defm CASP : CompareAndSwapPair<0, 0, "">; +defm CASPA : CompareAndSwapPair<1, 0, "a">; +defm CASPL : CompareAndSwapPair<0, 1, "l">; +defm CASPAL : CompareAndSwapPair<1, 1, "al">; + +// v8.1 atomic SWP +defm SWP : Swap<0, 0, "">; +defm SWPA : Swap<1, 0, "a">; +defm SWPL : Swap<0, 1, "l">; +defm SWPAL : Swap<1, 1, "al">; + +// v8.1 atomic LD<OP>(register). 
Performs load and then ST<OP>(register) +defm LDADD : LDOPregister<0b000, "add", 0, 0, "">; +defm LDADDA : LDOPregister<0b000, "add", 1, 0, "a">; +defm LDADDL : LDOPregister<0b000, "add", 0, 1, "l">; +defm LDADDAL : LDOPregister<0b000, "add", 1, 1, "al">; + +defm LDCLR : LDOPregister<0b001, "clr", 0, 0, "">; +defm LDCLRA : LDOPregister<0b001, "clr", 1, 0, "a">; +defm LDCLRL : LDOPregister<0b001, "clr", 0, 1, "l">; +defm LDCLRAL : LDOPregister<0b001, "clr", 1, 1, "al">; + +defm LDEOR : LDOPregister<0b010, "eor", 0, 0, "">; +defm LDEORA : LDOPregister<0b010, "eor", 1, 0, "a">; +defm LDEORL : LDOPregister<0b010, "eor", 0, 1, "l">; +defm LDEORAL : LDOPregister<0b010, "eor", 1, 1, "al">; + +defm LDSET : LDOPregister<0b011, "set", 0, 0, "">; +defm LDSETA : LDOPregister<0b011, "set", 1, 0, "a">; +defm LDSETL : LDOPregister<0b011, "set", 0, 1, "l">; +defm LDSETAL : LDOPregister<0b011, "set", 1, 1, "al">; + +defm LDSMAX : LDOPregister<0b100, "smax", 0, 0, "">; +defm LDSMAXA : LDOPregister<0b100, "smax", 1, 0, "a">; +defm LDSMAXL : LDOPregister<0b100, "smax", 0, 1, "l">; +defm LDSMAXAL : LDOPregister<0b100, "smax", 1, 1, "al">; + +defm LDSMIN : LDOPregister<0b101, "smin", 0, 0, "">; +defm LDSMINA : LDOPregister<0b101, "smin", 1, 0, "a">; +defm LDSMINL : LDOPregister<0b101, "smin", 0, 1, "l">; +defm LDSMINAL : LDOPregister<0b101, "smin", 1, 1, "al">; + +defm LDUMAX : LDOPregister<0b110, "umax", 0, 0, "">; +defm LDUMAXA : LDOPregister<0b110, "umax", 1, 0, "a">; +defm LDUMAXL : LDOPregister<0b110, "umax", 0, 1, "l">; +defm LDUMAXAL : LDOPregister<0b110, "umax", 1, 1, "al">; + +defm LDUMIN : LDOPregister<0b111, "umin", 0, 0, "">; +defm LDUMINA : LDOPregister<0b111, "umin", 1, 0, "a">; +defm LDUMINL : LDOPregister<0b111, "umin", 0, 1, "l">; +defm LDUMINAL : LDOPregister<0b111, "umin", 1, 1, "al">; + +// v8.1 atomic ST<OP>(register) as aliases to "LD<OP>(register) when Rt=xZR" +defm : STOPregister<"stadd","LDADD">; // STADDx +defm : STOPregister<"stclr","LDCLR">; // STCLRx +defm : STOPregister<"steor","LDEOR">; // STEORx +defm : STOPregister<"stset","LDSET">; // STSETx +defm : STOPregister<"stsmax","LDSMAX">;// STSMAXx +defm : STOPregister<"stsmin","LDSMIN">;// STSMINx +defm : STOPregister<"stumax","LDUMAX">;// STUMAXx +defm : STOPregister<"stumin","LDUMIN">;// STUMINx + +//===----------------------------------------------------------------------===// +// Logical instructions. +//===----------------------------------------------------------------------===// + +// (immediate) +defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">; +defm AND : LogicalImm<0b00, "and", and, "bic">; +defm EOR : LogicalImm<0b10, "eor", xor, "eon">; +defm ORR : LogicalImm<0b01, "orr", or, "orn">; + +// FIXME: these aliases *are* canonical sometimes (when movz can't be +// used). Actually, it seems to be working right now, but putting logical_immXX +// here is a bit dodgy on the AsmParser side too. 
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, + logical_imm32:$imm), 0>; +def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, + logical_imm64:$imm), 0>; + + +// (register) +defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>; +defm BICS : LogicalRegS<0b11, 1, "bics", + BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>; +defm AND : LogicalReg<0b00, 0, "and", and>; +defm BIC : LogicalReg<0b00, 1, "bic", + BinOpFrag<(and node:$LHS, (not node:$RHS))>>; +defm EON : LogicalReg<0b10, 1, "eon", + BinOpFrag<(not (xor node:$LHS, node:$RHS))>>; +defm EOR : LogicalReg<0b10, 0, "eor", xor>; +defm ORN : LogicalReg<0b01, 1, "orn", + BinOpFrag<(or node:$LHS, (not node:$RHS))>>; +defm ORR : LogicalReg<0b01, 0, "orr", or>; + +def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; +def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; + +def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; +def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; + +def : InstAlias<"mvn $Wd, $Wm$sh", + (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; +def : InstAlias<"mvn $Xd, $Xm$sh", + (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; +def : InstAlias<"tst $src1, $src2", + (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; +def : InstAlias<"tst $src1, $src2", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; + +def : InstAlias<"tst $src1, $src2$sh", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; +def : InstAlias<"tst $src1, $src2$sh", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; + + +def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; +def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; + + +//===----------------------------------------------------------------------===// +// One operand data processing instructions. +//===----------------------------------------------------------------------===// + +defm CLS : OneOperandData<0b101, "cls">; +defm CLZ : OneOperandData<0b100, "clz", ctlz>; +defm RBIT : OneOperandData<0b000, "rbit">; + +def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>; +def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>; + +def REV16Wr : OneWRegData<0b001, "rev16", + UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; +def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; + +def : Pat<(cttz GPR32:$Rn), + (CLZWr (RBITWr GPR32:$Rn))>; +def : Pat<(cttz GPR64:$Rn), + (CLZXr (RBITXr GPR64:$Rn))>; +def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), + (i32 1))), + (CLSWr GPR32:$Rn)>; +def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), + (i64 1))), + (CLSXr GPR64:$Rn)>; + +// Unlike the other one operand instructions, the instructions with the "rev" +// mnemonic do *not* just different in the size bit, but actually use different +// opcode bits for the different sizes. +def REVWr : OneWRegData<0b010, "rev", bswap>; +def REVXr : OneXRegData<0b011, "rev", bswap>; +def REV32Xr : OneXRegData<0b010, "rev32", + UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; + +def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>; + +// The bswap commutes with the rotr so we want a pattern for both possible +// orders. 
+def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; +def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; + +//===----------------------------------------------------------------------===// +// Bitfield immediate extraction instruction. +//===----------------------------------------------------------------------===// +let hasSideEffects = 0 in +defm EXTR : ExtractImm<"extr">; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>; + +def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)), + (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>; +def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), + (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>; + +//===----------------------------------------------------------------------===// +// Other bitfield immediate instructions. +//===----------------------------------------------------------------------===// +let hasSideEffects = 0 in { +defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; +defm SBFM : BitfieldImm<0b00, "sbfm">; +defm UBFM : BitfieldImm<0b10, "ubfm">; +} + +def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = (32 - N->getZExtValue()) & 0x1f; + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 31 - N->getZExtValue(); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +// min(7, 31 - shift_amt) +def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 31 - N->getZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +// min(15, 31 - shift_amt) +def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 31 - N->getZExtValue(); + enc = enc > 15 ? 15 : enc; + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = (64 - N->getZExtValue()) & 0x3f; + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 63 - N->getZExtValue(); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +// min(7, 63 - shift_amt) +def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 63 - N->getZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +// min(15, 63 - shift_amt) +def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 63 - N->getZExtValue(); + enc = enc > 15 ? 15 : enc; + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +// min(31, 63 - shift_amt) +def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{ + uint64_t enc = 63 - N->getZExtValue(); + enc = enc > 31 ? 
31 : enc; + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64); +}]>; + +def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)), + (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), + (i64 (i32shift_b imm0_31:$imm)))>; +def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)), + (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_b imm0_63:$imm)))>; + +let AddedComplexity = 10 in { +def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)), + (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; +def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)), + (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; +} + +def : InstAlias<"asr $dst, $src, $shift", + (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; +def : InstAlias<"asr $dst, $src, $shift", + (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; +def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; +def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; +def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; +def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; +def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; + +def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)), + (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; +def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)), + (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; + +def : InstAlias<"lsr $dst, $src, $shift", + (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; +def : InstAlias<"lsr $dst, $src, $shift", + (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; +def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; +def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; +def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; +def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; +def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; + +//===----------------------------------------------------------------------===// +// Conditional comparison instructions. +//===----------------------------------------------------------------------===// +defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>; +defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>; + +//===----------------------------------------------------------------------===// +// Conditional select instructions. 
+//===----------------------------------------------------------------------===// +defm CSEL : CondSelect<0, 0b00, "csel">; + +def inc : PatFrag<(ops node:$in), (add node:$in, 1)>; +defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>; +defm CSINV : CondSelectOp<1, 0b00, "csinv", not>; +defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>; + +def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; + +def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV), + (CSINCWr WZR, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV), + (CSINCXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), + (CSINVWr WZR, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), + (CSINVXr XZR, XZR, (i32 imm:$cc))>; + +// The inverse of the condition code from the alias instruction is what is used +// in the aliased instruction. The parser all ready inverts the condition code +// for these aliases. +def : InstAlias<"cset $dst, $cc", + (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; +def : InstAlias<"cset $dst, $cc", + (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; + +def : InstAlias<"csetm $dst, $cc", + (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; +def : InstAlias<"csetm $dst, $cc", + (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; + +def : InstAlias<"cinc $dst, $src, $cc", + (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cinc $dst, $src, $cc", + (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +def : InstAlias<"cinv $dst, $src, $cc", + (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cinv $dst, $src, $cc", + (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +def : InstAlias<"cneg $dst, $src, $cc", + (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cneg $dst, $src, $cc", + (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +//===----------------------------------------------------------------------===// +// PC-relative instructions. +//===----------------------------------------------------------------------===// +let isReMaterializable = 1 in { +let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in { +def ADR : ADRI<0, "adr", adrlabel, []>; +} // hasSideEffects = 0 + +def ADRP : ADRI<1, "adrp", adrplabel, + [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>; +} // isReMaterializable = 1 + +// page address of a constant pool entry, block address +def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>; +def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>; + +//===----------------------------------------------------------------------===// +// Unconditional branch (register) instructions. 
+//===----------------------------------------------------------------------===// + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { +def RET : BranchReg<0b0010, "ret", []>; +def DRPS : SpecialReturn<0b0101, "drps">; +def ERET : SpecialReturn<0b0100, "eret">; +} // isReturn = 1, isTerminator = 1, isBarrier = 1 + +// Default to the LR register. +def : InstAlias<"ret", (RET LR)>; + +let isCall = 1, Defs = [LR], Uses = [SP] in { +def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>; +} // isCall + +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { +def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; +} // isBranch, isTerminator, isBarrier, isIndirectBranch + +// Create a separate pseudo-instruction for codegen to use so that we don't +// flag lr as used in every function. It'll be restored before the RET by the +// epilogue if it's legitimately used. +def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; +} + +// This is a directive-like pseudo-instruction. The purpose is to insert an +// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction +// (which in the usual case is a BLR). +let hasSideEffects = 1 in +def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { + let AsmString = ".tlsdesccall $sym"; +} + +// FIXME: maybe the scratch register used shouldn't be fixed to X1? +// FIXME: can "hasSideEffects be dropped? +let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, + isCodeGenOnly = 1 in +def TLSDESC_CALLSEQ + : Pseudo<(outs), (ins i64imm:$sym), + [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>; +def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), + (TLSDESC_CALLSEQ texternalsym:$sym)>; + +//===----------------------------------------------------------------------===// +// Conditional branch (immediate) instruction. +//===----------------------------------------------------------------------===// +def Bcc : BranchCond; + +//===----------------------------------------------------------------------===// +// Compare-and-branch instructions. +//===----------------------------------------------------------------------===// +defm CBZ : CmpBranch<0, "cbz", AArch64cbz>; +defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>; + +//===----------------------------------------------------------------------===// +// Test-bit-and-branch instructions. +//===----------------------------------------------------------------------===// +defm TBZ : TestBranch<0, "tbz", AArch64tbz>; +defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>; + +//===----------------------------------------------------------------------===// +// Unconditional branch (immediate) instructions. +//===----------------------------------------------------------------------===// +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { +def B : BranchImm<0, "b", [(br bb:$addr)]>; +} // isBranch, isTerminator, isBarrier + +let isCall = 1, Defs = [LR], Uses = [SP] in { +def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>; +} // isCall +def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>; + +//===----------------------------------------------------------------------===// +// Exception generation instructions. 
+//===----------------------------------------------------------------------===// +def BRK : ExceptionGeneration<0b001, 0b00, "brk">; +def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; +def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; +def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; +def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; +def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; +def SMC : ExceptionGeneration<0b000, 0b11, "smc">; +def SVC : ExceptionGeneration<0b000, 0b01, "svc">; + +// DCPSn defaults to an immediate operand of zero if unspecified. +def : InstAlias<"dcps1", (DCPS1 0)>; +def : InstAlias<"dcps2", (DCPS2 0)>; +def : InstAlias<"dcps3", (DCPS3 0)>; + +//===----------------------------------------------------------------------===// +// Load instructions. +//===----------------------------------------------------------------------===// + +// Pair (indexed, offset) +defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">; +defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">; +defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">; +defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">; +defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">; + +defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">; + +// Pair (pre-indexed) +def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">; +def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">; +def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">; +def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">; +def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">; + +def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; + +// Pair (post-indexed) +def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; +def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; +def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; +def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; +def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; + +def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; + + +// Pair (no allocate) +defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">; +defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">; +defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">; +defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">; +defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">; + +//--- +// (register offset) +//--- + +// Integer +defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>; +defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>; +defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>; +defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>; + +// Floating-point +defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>; +defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>; +defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>; +defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>; +defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>; + +// Load sign-extended half-word +defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>; +defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>; + +// Load sign-extended byte +defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>; +defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>; + +// Load sign-extended word +defm LDRSW : Load32RO<0b10, 0, 0b10, 
GPR64, "ldrsw", i64, sextloadi32>; + +// Pre-fetch. +defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; + +// For regular load, we do not have any alignment requirement. +// Thus, it is safe to directly map the vector loads with interesting +// addressing modes. +// FIXME: We could do the same for bitconvert to floating point vectors. +multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop, + ValueType ScalTy, ValueType VecTy, + Instruction LOADW, Instruction LOADX, + SubRegIndex sub> { + def : Pat<(VecTy (scalar_to_vector (ScalTy + (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), + (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), + (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), + sub)>; + + def : Pat<(VecTy (scalar_to_vector (ScalTy + (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), + (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), + (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), + sub)>; +} + +let AddedComplexity = 10 in { +defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>; +defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>; + +defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>; +defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>; + +defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>; +defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>; + +defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>; +defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>; + +defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>; +defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>; + +defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>; + +defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>; + + +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend))))), + (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; + +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend))))), + (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +} + +// Match all load 64 bits width whose type is compatible with FPR64 +multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy, + Instruction LOADW, Instruction LOADX> { + + def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { +let Predicates = [IsLE] in { + // We must do vector loads with LD1 in big-endian. + defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>; + defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>; + defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>; + defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>; + defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>; +} + +defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>; +defm : VecROLoadPat<ro64, v1f64, LDRDroW, LDRDroX>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must do vector loads with LD1 in big-endian. 
+ defm : VecROLoadPat<ro128, v2i64, LDRQroW, LDRQroX>; + defm : VecROLoadPat<ro128, v2f64, LDRQroW, LDRQroX>; + defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>; + defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>; + defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>; + defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>; + defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>; +} +} // AddedComplexity = 10 + +// zextload -> i64 +multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop, + Instruction INSTW, Instruction INSTX> { + def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (SUBREG_TO_REG (i64 0), + (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), + sub_32)>; + + def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (SUBREG_TO_REG (i64 0), + (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), + sub_32)>; +} + +let AddedComplexity = 10 in { + defm : ExtLoadTo64ROPat<ro8, zextloadi8, LDRBBroW, LDRBBroX>; + defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>; + defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW, LDRWroX>; + + // zextloadi1 -> zextloadi8 + defm : ExtLoadTo64ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>; + + // extload -> zextload + defm : ExtLoadTo64ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>; + defm : ExtLoadTo64ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>; + defm : ExtLoadTo64ROPat<ro32, extloadi32, LDRWroW, LDRWroX>; + + // extloadi1 -> zextloadi8 + defm : ExtLoadTo64ROPat<ro8, extloadi1, LDRBBroW, LDRBBroX>; +} + + +// zextload -> i64 +multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop, + Instruction INSTW, Instruction INSTX> { + def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; + +} + +let AddedComplexity = 10 in { + // extload -> zextload + defm : ExtLoadTo32ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>; + defm : ExtLoadTo32ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>; + defm : ExtLoadTo32ROPat<ro32, extloadi32, LDRWroW, LDRWroX>; + + // zextloadi1 -> zextloadi8 + defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>; +} + +//--- +// (unsigned immediate) +//--- +defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr", + [(set GPR64:$Rt, + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; +defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr", + [(set GPR32:$Rt, + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; +defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr", + [(set FPR8:$Rt, + (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; +defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr", + [(set (f16 FPR16:$Rt), + (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; +defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr", + [(set (f32 FPR32:$Rt), + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; +defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr", + [(set (f64 FPR64:$Rt), + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; +defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr", + [(set (f128 FPR128:$Rt), + (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; + +// For regular load, we do not have any alignment requirement. +// Thus, it is safe to directly map the vector loads with interesting +// addressing modes. +// FIXME: We could do the same for bitconvert to floating point vectors. 
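For illustration only (not part of the patch), C source that produces the (scalar_to_vector (load ...)) shape these register-offset patterns target; the ACLE NEON intrinsics are standard, but the helper name is made up and the expected instruction is only a guess at typical codegen:

/* Illustrative sketch, not part of the patch.  Compile for AArch64. */
#include <arm_neon.h>

float32x4_t lane0_from_memory(const float *base, long idx) {
    /* Load one float and place it in lane 0 (the other lanes are zeroed here).
     * With the register-offset patterns above one would expect the load and the
     * lane insert to collapse into a single "ldr s0, [x0, x1, lsl #2]", though
     * the exact output depends on the compiler. */
    return vsetq_lane_f32(base[idx], vdupq_n_f32(0.0f), 0);
}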
+def : Pat <(v8i8 (scalar_to_vector (i32 + (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat <(v16i8 (scalar_to_vector (i32 + (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat <(v4i16 (scalar_to_vector (i32 + (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +def : Pat <(v8i16 (scalar_to_vector (i32 + (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +def : Pat <(v2i32 (scalar_to_vector (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +def : Pat <(v4i32 (scalar_to_vector (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat <(v2i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; + +// Match all load 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use LD1 to perform vector loads in big-endian. + def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +} +def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use LD1 to perform vector loads in big-endian. 
+ def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + +defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh", + [(set GPR32:$Rt, + (zextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; +defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb", + [(set GPR32:$Rt, + (zextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; +// zextload -> i64 +def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; + +// zextloadi1 -> zextloadi8 +def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; + +// extload -> zextload +def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), + (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; +def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; +def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; +def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; + +// load sign-extended half-word +defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh", + [(set GPR32:$Rt, + (sextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; +defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh", + [(set GPR64:$Rt, + (sextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; + +// load sign-extended byte +defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb", + [(set GPR32:$Rt, + (sextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; +defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb", + [(set GPR64:$Rt, + (sextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; + +// load sign-extended word 
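Before the LDRSW definition below, a small C illustration (not part of the patch) of the two 32-to-64-bit widening loads this block distinguishes:

/* Illustrative sketch, not part of the patch. */
#include <stdint.h>

int64_t widen_signed(const int32_t *p) {
    return *p;   /* sextloadi32: a sign-extending load (ldrsw) */
}

uint64_t widen_unsigned(const uint32_t *p) {
    return *p;   /* zextloadi32: a plain 32-bit load suffices, since writing
                    a W register zeroes the upper 32 bits of the X register */
}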
+defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", + [(set GPR64:$Rt, + (sextloadi32 (am_indexed32 GPR64sp:$Rn, + uimm12s4:$offset)))]>; + +// load zero-extended word +def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), + (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; + +// Pre-fetch. +def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", + [(AArch64Prefetch imm:$Rt, + (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset))]>; + +def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; + +//--- +// (literal) +def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; +def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; +def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; +def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; +def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; + +// load sign-extended word +def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; + +// prefetch +def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; +// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>; + +//--- +// (unscaled immediate) +defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur", + [(set GPR64:$Rt, + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur", + [(set GPR32:$Rt, + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur", + [(set FPR8:$Rt, + (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur", + [(set FPR16:$Rt, + (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur", + [(set (f32 FPR32:$Rt), + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur", + [(set (f64 FPR64:$Rt), + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur", + [(set (f128 FPR128:$Rt), + (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>; + +defm LDURHH + : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh", + [(set GPR32:$Rt, + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURBB + : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb", + [(set GPR32:$Rt, + (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; + +// Match all load 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +} +def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, 
simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// anyext -> zext
+def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+// unscaled zext
+def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+
+//---
+// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
+
+// Define new assembler match classes as we want to only match these when
+// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
+// associate a DiagnosticType either, as we want the diagnostic for the
+// canonical form (the scaled operand) to take precedence.
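For reference (not part of the patch), a C sketch of the two offset ranges behind this fallback, following the A64 encoding limits: the scaled LDR form takes an unsigned 12-bit offset in units of the access size, while LDUR takes an unscaled signed 9-bit offset. The helper is purely illustrative.

/* Illustrative sketch, not part of the patch. */
#include <stdint.h>
#include <stdio.h>

static const char *pick_form(int64_t off, int size) {
    if (off >= 0 && off % size == 0 && off / size <= 4095)
        return "ldr  (scaled uimm12)";
    if (off >= -256 && off <= 255)
        return "ldur (unscaled simm9)";
    return "needs a separate address computation";
}

int main(void) {
    printf("%s\n", pick_form(16, 8));  /* ldr  x0, [x1, #16]              */
    printf("%s\n", pick_form(-8, 8));  /* ldur x0, [x1, #-8]              */
    printf("%s\n", pick_form(3, 8));   /* ldur x0, [x1, #3]  (unaligned)  */
    return 0;
}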
+class SImm9OffsetOperand<int Width> : AsmOperandClass { + let Name = "SImm9OffsetFB" # Width; + let PredicateMethod = "isSImm9OffsetFB<" # Width # ">"; + let RenderMethod = "addImmOperands"; +} + +def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>; +def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>; +def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>; +def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>; +def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>; + +def simm9_offset_fb8 : Operand<i64> { + let ParserMatchClass = SImm9OffsetFB8Operand; +} +def simm9_offset_fb16 : Operand<i64> { + let ParserMatchClass = SImm9OffsetFB16Operand; +} +def simm9_offset_fb32 : Operand<i64> { + let ParserMatchClass = SImm9OffsetFB32Operand; +} +def simm9_offset_fb64 : Operand<i64> { + let ParserMatchClass = SImm9OffsetFB64Operand; +} +def simm9_offset_fb128 : Operand<i64> { + let ParserMatchClass = SImm9OffsetFB128Operand; +} + +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"ldr $Rt, [$Rn, $offset]", + (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; + +// zextload -> i64 +def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; + +// load sign-extended half-word +defm LDURSHW + : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh", + [(set GPR32:$Rt, + (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURSHX + : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh", + [(set GPR64:$Rt, + (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; + +// load sign-extended byte +defm LDURSBW + : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb", + [(set GPR32:$Rt, + (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURSBX + : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb", + [(set GPR64:$Rt, + (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; + +// load sign-extended word +defm LDURSW + : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw", + [(set GPR64:$Rt, + (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; + +// zero and sign extending aliases from generic LDR* mnemonics to LDUR*. 
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]", + (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrh $Rt, [$Rn, $offset]", + (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", + (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; + +// Pre-fetch. +defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", + [(AArch64Prefetch imm:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; + +//--- +// (unscaled immediate, unprivileged) +defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; +defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; + +defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; +defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; + +// load sign-extended half-word +defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; +defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; + +// load sign-extended byte +defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; +defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; + +// load sign-extended word +defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; + +//--- +// (immediate pre-indexed) +def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load sign-extended word +def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; + +//--- +// (immediate post-indexed) +def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load 
sign-extended word +def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; + +//===----------------------------------------------------------------------===// +// Store instructions. +//===----------------------------------------------------------------------===// + +// Pair (indexed, offset) +// FIXME: Use dedicated range-checked addressing mode operand here. +defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">; +defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">; +defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">; +defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">; +defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (pre-indexed) +def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">; +def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">; +def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">; +def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">; +def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (pre-indexed) +def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">; +def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">; +def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">; +def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">; +def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (no allocate) +defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">; +defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">; +defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">; +defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">; +defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">; + +//--- +// (Register offset) + +// Integer +defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>; +defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>; +defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>; +defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>; + + +// Floating-point +defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>; +defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>; +defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; +defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; +defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; + +multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop, + Instruction STRW, Instruction STRX> { + + def : Pat<(storeop GPR64:$Rt, + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop GPR64:$Rt, + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { + // truncstore i64 + defm : TruncStoreFrom64ROPat<ro8, truncstorei8, STRBBroW, STRBBroX>; + defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>; + defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW, STRWroX>; +} + +multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR, + Instruction STRW, Instruction STRX> { + def : Pat<(store (VecTy FPR:$Rt), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(store (VecTy FPR:$Rt), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX FPR:$Rt, GPR64sp:$Rn, 
GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>; + defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>; + defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>; + defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>; + defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>; +} + +defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>; +defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>; + defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>; + defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>; + defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>; + defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>; + defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>; + defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>; +} +} // AddedComplexity = 10 + +// Match stores from lane 0 to the appropriate subreg's store. +multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop, + ValueType VecTy, ValueType STy, + SubRegIndex SubRegIdx, + Instruction STRW, Instruction STRX> { + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 19 in { + defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>; + defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>; + defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>; + defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>; +} + +//--- +// (unsigned immediate) +defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", + [(store GPR64:$Rt, + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; +defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", + [(store GPR32:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; +defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", + [(store FPR8:$Rt, + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; +defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str", + [(store (f16 FPR16:$Rt), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; +defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str", + [(store (f32 FPR32:$Rt), + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; +defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", + [(store (f64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; +defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; + +defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, 
uimm12s2, "strh", + [(truncstorei16 GPR32:$Rt, + (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset))]>; +defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", + [(truncstorei8 GPR32:$Rt, + (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let AddedComplexity = 10 in { +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + def : Pat<(store (v2f32 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v8i8 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4i16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v2i32 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4f16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +} +def : Pat<(store (v1f64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(store (v1i64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + def : Pat<(store (v4f32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v16i8 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8i16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v4i32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2i64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8f16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(store (f128 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + +// truncstore i64 +def : Pat<(truncstorei32 GPR64:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), + (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(truncstorei16 GPR64:$Rt, + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), + (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>; + +} // AddedComplexity = 10 + +//--- +// (unscaled immediate) +defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur", + [(store GPR64:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur", + [(store GPR32:$Rt, + (am_unscaled32 GPR64sp:$Rn, 
simm9:$offset))]>; +defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur", + [(store FPR8:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; +defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur", + [(store (f16 FPR16:$Rt), + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur", + [(store (f32 FPR32:$Rt), + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur", + [(store (f64 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur", + [(store (f128 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>; +defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh", + [(truncstorei16 GPR32:$Rt, + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb", + [(truncstorei8 GPR32:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + def : Pat<(store (v2f32 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8i8 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4i16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2i32 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4f16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; +} +def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ def : Pat<(store (v4f32 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v16i8 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8i16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4i32 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2i64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8f16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; +} + +// unscaled i64 truncating stores +def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), + (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; + +//--- +// STR mnemonics fall back to STUR for negative or unaligned offsets. +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; + +def : InstAlias<"strb $Rt, [$Rn, $offset]", + (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"strh $Rt, [$Rn, $offset]", + (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; + +//--- +// (unscaled immediate, unprivileged) +defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; +defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; + +defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; +defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; + +//--- +// (immediate pre-indexed) +def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>; +def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>; +def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>; +def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>; +def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>; +def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>; +def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, 
f128>; + +def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>; +def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>; + +// truncstore i64 +def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; + +//--- +// (immediate post-indexed) +def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>; +def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>; +def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>; +def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>; +def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>; +def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>; +def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>; + +def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>; +def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>; + +// truncstore i64 +def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(post_store (v8i8 
FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; + +//===----------------------------------------------------------------------===// +// Load/store exclusive instructions. 
+//===----------------------------------------------------------------------===// + +def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; +def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; +def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; +def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; + +def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; +def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; +def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; +def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; + +def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; +def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; +def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; +def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; + +def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; +def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; +def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; +def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; + +def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; +def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; +def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; +def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; + +def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; +def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; +def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; +def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; + +def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; +def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; + +def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; +def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; + +def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; +def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; + +def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; +def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; + +let Predicates = [HasV8_1a] in { + // v8.1a "Limited Order Region" extension load-acquire instructions + def LDLARW : LoadAcquire <0b10, 1, 1, 0, 0, GPR32, "ldlar">; + def LDLARX : LoadAcquire <0b11, 1, 1, 0, 0, GPR64, "ldlar">; + def LDLARB : LoadAcquire <0b00, 1, 1, 0, 0, GPR32, "ldlarb">; + def LDLARH : LoadAcquire <0b01, 1, 1, 0, 0, GPR32, "ldlarh">; + + // v8.1a "Limited Order Region" extension store-release instructions + def STLLRW : StoreRelease <0b10, 1, 0, 0, 0, GPR32, "stllr">; + def STLLRX : StoreRelease <0b11, 1, 0, 0, 0, GPR64, "stllr">; + def STLLRB : StoreRelease <0b00, 1, 0, 0, 0, GPR32, "stllrb">; + def STLLRH : StoreRelease <0b01, 1, 0, 0, 0, GPR32, "stllrh">; +} + +//===----------------------------------------------------------------------===// +// Scaled floating point to integer conversion instructions. 
+//===----------------------------------------------------------------------===// + +defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>; +defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>; +defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>; +defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>; +defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>; +defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; +defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; +defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; +defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; +defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; +} + +multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { + def : Pat<(i32 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; +} + +defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">; +defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">; +defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">; +defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">; +defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">; +defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">; +defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">; + +//===----------------------------------------------------------------------===// +// Scaled integer to floating point conversion instructions. +//===----------------------------------------------------------------------===// + +defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; +defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; + +//===----------------------------------------------------------------------===// +// Unscaled integer to floating point conversion instruction. +//===----------------------------------------------------------------------===// + +defm FMOV : UnscaledConversion<"fmov">; + +// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable +let isReMaterializable = 1, isCodeGenOnly = 1 in { +def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, + PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>, + Requires<[NoZCZ]>; +def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, + PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>, + Requires<[NoZCZ]>; +} + +//===----------------------------------------------------------------------===// +// Floating point conversion instruction. 
+//===----------------------------------------------------------------------===// + +defm FCVT : FPConversion<"fcvt">; + +//===----------------------------------------------------------------------===// +// Floating point single operand instructions. +//===----------------------------------------------------------------------===// + +defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; +defm FMOV : SingleOperandFPData<0b0000, "fmov">; +defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; +defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; +defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; + +def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), + (FRINTNDr FPR64:$Rn)>; + +defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; +defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; + +let SchedRW = [WriteFDiv] in { +defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; +} + +//===----------------------------------------------------------------------===// +// Floating point two operand instructions. +//===----------------------------------------------------------------------===// + +defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; +let SchedRW = [WriteFDiv] in { +defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; +} +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>; +let SchedRW = [WriteFMul] in { +defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; +defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; +} +defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; + +def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; + +//===----------------------------------------------------------------------===// +// Floating point three operand instructions. +//===----------------------------------------------------------------------===// + +defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; +defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", + TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; +defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", + TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; +defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", + TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; + +// The following def pats catch the case where the LHS of an FMA is negated. +// The TriOpFrag above catches the case where the middle operand is negated. + +// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike +// the NEON variant. +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), + (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), + (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and +// "(-a) + b*(-c)". 
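Both of those shapes are algebraically -(a + b*c) as well, so they can reuse the FNMADD encoding. A scalar C sketch with an illustrative name (assumes the compiler selects the fused instruction rather than a libcall, which is the usual AArch64 lowering):

#include <math.h>

/* fma(-n, m, -a) == -(n*m) - a == -(a + n*m), which is exactly what
   FNMADD computes, so a single instruction is expected here. */
double neg_madd(double n, double m, double a) {
  return fma(-n, m, -a);
}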
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +//===----------------------------------------------------------------------===// +// Floating point comparison instructions. +//===----------------------------------------------------------------------===// + +defm FCMPE : FPComparison<1, "fcmpe">; +defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; + +//===----------------------------------------------------------------------===// +// Floating point conditional comparison instructions. +//===----------------------------------------------------------------------===// + +defm FCCMPE : FPCondComparison<1, "fccmpe">; +defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>; + +//===----------------------------------------------------------------------===// +// Floating point conditional select instruction. +//===----------------------------------------------------------------------===// + +defm FCSEL : FPCondSelect<"fcsel">; + +// CSEL instructions providing f128 types need to be handled by a +// pseudo-instruction since the eventual code will need to introduce basic +// blocks and control flow. +def F128CSEL : Pseudo<(outs FPR128:$Rd), + (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), + [(set (f128 FPR128:$Rd), + (AArch64csel FPR128:$Rn, FPR128:$Rm, + (i32 imm:$cond), NZCV))]> { + let Uses = [NZCV]; + let usesCustomInserter = 1; +} + + +//===----------------------------------------------------------------------===// +// Floating point immediate move. +//===----------------------------------------------------------------------===// + +let isReMaterializable = 1 in { +defm FMOV : FPMoveImmediate<"fmov">; +} + +//===----------------------------------------------------------------------===// +// Advanced SIMD two vector instructions. +//===----------------------------------------------------------------------===// + +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", + int_aarch64_neon_uabd>; +// Match UABDL in log2-shuffle patterns. 
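The "log2 shuffle" being matched is the classic branchless absolute-value idiom applied lane-wise to a widened difference (the shift amount is the element width minus one). A scalar C sketch of the idiom itself, with an illustrative name:

#include <stdint.h>

/* m is 0 for non-negative d and -1 otherwise (arithmetic shift),
   and (d + m) ^ m == |d|.  With d = zext(a) - zext(b) this is |a - b|,
   i.e. the absolute difference UABDL computes directly. */
static inline int32_t abs_idiom(int32_t d) {
  int32_t m = d >> 31;
  return (d + m) ^ m;
}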
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (v8i8 V64:$opA)), + (zext (v8i8 V64:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; +def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), + (zext (extract_high_v16i8 V128:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (v4i16 V64:$opA)), + (zext (v4i16 V64:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)), + (zext (extract_high_v8i16 V128:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (v2i32 V64:$opA)), + (zext (v2i32 V64:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)), + (zext (extract_high_v4i32 V128:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; + +defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; +def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), + (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), + (ABSv8i8 V64:$src)>; +def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))), + (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))), + (ABSv4i16 V64:$src)>; +def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))), + (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))), + (ABSv2i32 V64:$src)>; +def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))), + (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))), + (ABSv16i8 V128:$src)>; +def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))), + (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))), + (ABSv8i16 V128:$src)>; +def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))), + (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))), + (ABSv4i32 V128:$src)>; +def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))), + (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))), + (ABSv2i64 V128:$src)>; + +defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; +defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; +defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; +defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; +defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; + +defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>; +defm FCVTAU : 
SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>; +defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), + (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), + (i64 4)))), + (FCVTLv8i16 V128:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), + (i64 2))))), + (FCVTLv4i32 V128:$Rn)>; + +def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn), + (i64 4))))), + (FCVTLv8i16 V128:$Rn)>; + +defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; +defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; +defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; +defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>; +defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; +def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), + (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, + (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), + (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), + (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; +defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; +defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", + int_aarch64_neon_fcvtxn>; +defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; +defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", + int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", + int_aarch64_neon_fcvtzu>; +} +defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; +defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; +defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; +defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; +defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; +defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; +defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; +defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; +defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; +// Aliases for MVN -> NOT. 
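MVN is only an alternative assembler spelling of the same bitwise complement, and because the operation is element-size agnostic every lane arrangement is lowered to the byte-wise NOT opcodes (see the patterns after the aliases). A short ACLE sketch with an illustrative name, assuming the intrinsic is selected as a single instruction:

#include <arm_neon.h>

/* Bitwise complement of all 128 bits; expected to select NOTv16i8,
   typically printed as "mvn v0.16b, v0.16b". */
uint8x16_t complement(uint8x16_t v) {
  return vmvnq_u8(v);
}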
+def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", + (NOTv8i8 V64:$Vd, V64:$Vn)>; +def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", + (NOTv16i8 V128:$Vd, V128:$Vn)>; + +def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; +def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; +def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; +def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; +def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; +def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; +def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; + +def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>; +defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; +defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; +defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; +defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; +defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; +defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; +defm SHLL : SIMDVectorLShiftLongBySizeBHS; +defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; +defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; +defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; +defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >; +defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", + int_aarch64_neon_uaddlp>; +defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; +defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; +defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; +defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; +defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; +defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; + +def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; + +// Patterns for vector long shift (by element width). 
These need to match all +// three of zext, sext and anyext so it's easier to pull the patterns out of the +// definition. +multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> { + def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)), + (SHLLv8i8 V64:$Rn)>; + def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)), + (SHLLv16i8 V128:$Rn)>; + def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)), + (SHLLv4i16 V64:$Rn)>; + def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)), + (SHLLv8i16 V128:$Rn)>; + def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)), + (SHLLv2i32 V64:$Rn)>; + def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)), + (SHLLv4i32 V128:$Rn)>; +} + +defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>; +defm : SIMDVectorLShiftLongBySizeBHSPats<zext>; +defm : SIMDVectorLShiftLongBySizeBHSPats<sext>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD three vector instructions. +//===----------------------------------------------------------------------===// + +defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>; +defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>; +defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>; +defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>; +defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; +defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; +defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; +defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>; + +// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the +// instruction expects the addend first, while the fma intrinsic puts it last. +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", + TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; + +// The following def pats catch the case where the LHS of an FMA is negated. +// The TriOpFrag above catches the case where the middle operand is negated. 
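Concretely, fma(-n, m, acc) is acc - n*m, which is what FMLS computes with the accumulator in the destination register. A short ACLE sketch (illustrative name; assumes the intrinsic is kept as one fused instruction):

#include <arm_neon.h>

/* Per-lane acc - n*m, fused; expected to map onto
   "fmls v0.4s, v1.4s, v2.4s" through the negated-LHS pattern. */
float32x4_t fused_sub(float32x4_t acc, float32x4_t n, float32x4_t m) {
  return vfmsq_f32(acc, n, m);
}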
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), + (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; + +defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; +defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; +defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; +defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; +defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; +defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>; +defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; +defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; +defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; +defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; +defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; +defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>; +defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", 
int_aarch64_neon_urhadd>; +defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; +defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", + int_aarch64_neon_sqadd>; +defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", + int_aarch64_neon_sqsub>; + +defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; +defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", + BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; +defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", + TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; +defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; +defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", + BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; +defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; + + +def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; +def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; + +def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; +def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; + +def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # + "|cmls.8b\t$dst, $src1, $src2}", + (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # + "|cmls.16b\t$dst, $src1, $src2}", + (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # + "|cmls.4h\t$dst, $src1, $src2}", + (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # + "|cmls.8h\t$dst, $src1, $src2}", + (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # + "|cmls.2s\t$dst, $src1, $src2}", + (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # + "|cmls.4s\t$dst, $src1, $src2}", + (CMHSv4i32 V128:$dst, V128:$src2, 
V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # + "|cmls.2d\t$dst, $src1, $src2}", + (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlo.8b\t$dst, $src1, $src2}", + (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlo.16b\t$dst, $src1, $src2}", + (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlo.4h\t$dst, $src1, $src2}", + (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlo.8h\t$dst, $src1, $src2}", + (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlo.2s\t$dst, $src1, $src2}", + (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlo.4s\t$dst, $src1, $src2}", + (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlo.2d\t$dst, $src1, $src2}", + (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # + "|cmle.8b\t$dst, $src1, $src2}", + (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # + "|cmle.16b\t$dst, $src1, $src2}", + (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # + "|cmle.4h\t$dst, $src1, $src2}", + (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # + "|cmle.8h\t$dst, $src1, $src2}", + (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # + "|cmle.2s\t$dst, $src1, $src2}", + (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # + "|cmle.4s\t$dst, $src1, $src2}", + (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # + "|cmle.2d\t$dst, $src1, $src2}", + (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlt.8b\t$dst, $src1, $src2}", + (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlt.16b\t$dst, $src1, $src2}", + (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlt.4h\t$dst, $src1, $src2}", + (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlt.8h\t$dst, $src1, $src2}", + (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlt.2s\t$dst, $src1, $src2}", + (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlt.4s\t$dst, $src1, $src2}", + (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlt.2d\t$dst, $src1, $src2}", + (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmle.4h\t$dst, $src1, $src2}", + (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmle.8h\t$dst, $src1, $src2}", + (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} +def : InstAlias<"{fcmle\t$dst.2s, 
$src1.2s, $src2.2s" # + "|fcmle.2s\t$dst, $src1, $src2}", + (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmle.4s\t$dst, $src1, $src2}", + (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmle.2d\t$dst, $src1, $src2}", + (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmlt.4h\t$dst, $src1, $src2}", + (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmlt.8h\t$dst, $src1, $src2}", + (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} +def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # + "|fcmlt.2s\t$dst, $src1, $src2}", + (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmlt.4s\t$dst, $src1, $src2}", + (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmlt.2d\t$dst, $src1, $src2}", + (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" # + "|facle.4h\t$dst, $src1, $src2}", + (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" # + "|facle.8h\t$dst, $src1, $src2}", + (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} +def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # + "|facle.2s\t$dst, $src1, $src2}", + (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # + "|facle.4s\t$dst, $src1, $src2}", + (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # + "|facle.2d\t$dst, $src1, $src2}", + (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" # + "|faclt.4h\t$dst, $src1, $src2}", + (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" # + "|faclt.8h\t$dst, $src1, $src2}", + (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} +def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # + "|faclt.2s\t$dst, $src1, $src2}", + (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # + "|faclt.4s\t$dst, $src1, $src2}", + (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # + "|faclt.2d\t$dst, $src1, $src2}", + (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD three scalar instructions. 
+//===----------------------------------------------------------------------===// + +defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; +defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>; +defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>; +defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; +defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; +defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; +defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; +defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; +def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FABD64 FPR64:$Rn, FPR64:$Rm)>; +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", + int_aarch64_neon_facge>; +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", + int_aarch64_neon_facgt>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>; +defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>; +defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; +defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; +defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; +let Predicates = [HasV8_1a] in { + defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; + defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; + def : Pat<(i32 (int_aarch64_neon_sqadd + (i32 FPR32:$Rd), + (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(i32 (int_aarch64_neon_sqsub + (i32 FPR32:$Rd), + (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; +} + +def : InstAlias<"cmls $dst, $src1, $src2", + (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmle $dst, $src1, $src2", + (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlo $dst, $src1, $src2", + (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlt $dst, $src1, $src2", + (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE32 FPR32:$dst, FPR32:$src2, 
FPR32:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD three scalar instructions (mixed operands). +//===----------------------------------------------------------------------===// +defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull", + int_aarch64_neon_sqdmulls_scalar>; +defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">; +defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">; + +def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD two scalar instructions. +//===----------------------------------------------------------------------===// + +defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>; +defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; +defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; +def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; +defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; +defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SQABS : SIMDTwoScalarBHSD< 0, 
0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; +defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; +defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", + int_aarch64_neon_suqadd>; +defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; +defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", + int_aarch64_neon_usqadd>; + +def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; + +def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), + (FCVTASv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))), + (FCVTAUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))), + (FCVTMSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))), + (FCVTMUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))), + (FCVTNSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))), + (FCVTNUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), + (FCVTPSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), + (FCVTPUv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), + (FRECPXv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), + (FRECPXv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// Here are the patterns for 8 and 16-bits to float. +// 8-bits -> float. +multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy, + SDPatternOperator loadop, Instruction UCVTF, + ROAddrMode ro, Instruction LDRW, Instruction LDRX, + SubRegIndex sub> { + def : Pat<(DstTy (uint_to_fp (SrcTy + (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, + ro.Wext:$extend))))), + (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), + (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), + sub))>; + + def : Pat<(DstTy (uint_to_fp (SrcTy + (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, + ro.Wext:$extend))))), + (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), + (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), + sub))>; +} + +defm : UIntToFPROLoadPat<f32, i32, zextloadi8, + UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; +// 16-bits -> float. 
+defm : UIntToFPROLoadPat<f32, i32, zextloadi16, + UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; +// 32-bits are handled in target specific dag combine: +// performIntToFpCombine. +// 64-bits integer to 32-bits floating point, not possible with +// UCVTF on floating point registers (both source and destination +// must have the same size). + +// Here are the patterns for 8, 16, 32, and 64-bits to double. +// 8-bits -> double. +defm : UIntToFPROLoadPat<f64, i32, zextloadi8, + UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; +// 16-bits -> double. +defm : UIntToFPROLoadPat<f64, i32, zextloadi16, + UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; +// 32-bits -> double. +defm : UIntToFPROLoadPat<f64, i32, load, + UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>; +def : Pat <(f64 (uint_to_fp (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>; +def : Pat <(f64 (uint_to_fp (i32 + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. + +//===----------------------------------------------------------------------===// +// Advanced SIMD three different-sized vector instructions. 
+//===----------------------------------------------------------------------===// + +defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; +defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; +defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; +defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>; +defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", + int_aarch64_neon_sabd>; +defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", + int_aarch64_neon_sabd>; +defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", + BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; +defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", + BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; +defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>; +defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", + int_aarch64_neon_sqdmull>; +defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", + BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; +defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", + BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; +defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", + int_aarch64_neon_uabd>; +defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", + BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; +defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", + BinOpFrag<(add node:$LHS, (zext node:$RHS))>>; +defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; +defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", + BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>; +defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", + BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; + +// Additional patterns for SMULL and UMULL +multiclass Neon_mul_widen_patterns<SDPatternOperator opnode, + Instruction INST8B, Instruction INST4H, Instruction INST2S> { + def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))), + (INST8B V64:$Rn, V64:$Rm)>; + def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))), + (INST4H V64:$Rn, V64:$Rm)>; + def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))), + (INST2S V64:$Rn, V64:$Rm)>; +} + +defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16, + SMULLv4i16_v4i32, SMULLv2i32_v2i64>; +defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16, + UMULLv4i16_v4i32, UMULLv2i32_v2i64>; + +// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL +multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode, + Instruction INST8B, Instruction INST4H, Instruction INST2S> { + def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), 
(v8i8 V64:$Rm))), + (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>; + def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))), + (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>; + def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))), + (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>; +} + +defm : Neon_mulacc_widen_patterns< + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, + SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; +defm : Neon_mulacc_widen_patterns< + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, + UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; +defm : Neon_mulacc_widen_patterns< + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, + SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; +defm : Neon_mulacc_widen_patterns< + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, + UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; + +// Patterns for 64-bit pmull +def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), + (PMULLv1i64 V64:$Rn, V64:$Rm)>; +def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)), + (extractelt (v2i64 V128:$Rm), (i64 1))), + (PMULLv2i64 V128:$Rn, V128:$Rm)>; + +// CodeGen patterns for addhn and subhn instructions, which can actually be +// written in LLVM IR without too much difficulty. + +// ADDHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), + (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 8))))), + (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +// SUBHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), + (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 8))))), + (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector instruction. 
+//---------------------------------------------------------------------------- + +defm EXT : SIMDBitwiseExtract<"ext">; + +def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; + +// We use EXT to handle extract_subvector to copy the upper 64-bits of a +// 128-bit vector. +def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>; +defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>; +defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>; +defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>; +defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>; +defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>; + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX instructions +//---------------------------------------------------------------------------- + +defm TBL : SIMDTableLookup< 0, "tbl">; +defm TBX : SIMDTableLookupTied<1, "tbx">; + +def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBLv16i8One V128:$Ri, V128:$Rn)>; + +def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd), + (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), + (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; + + 
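+// Illustrative example (assuming the usual ACLE lowering, which is outside
+// this file): a source-level vqtbl1q_u8(table, idx) reaches instruction
+// selection as int_aarch64_neon_tbl1 and is matched by the TBLv16i8One
+// pattern above, giving "tbl v0.16b, { v1.16b }, v2.16b"; the tbx1 patterns
+// differ only in that out-of-range indices leave the destination lane
+// unchanged rather than zeroing it.
+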
+//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY instruction +//---------------------------------------------------------------------------- + +defm CPY : SIMDScalarCPY<"cpy">; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; +defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">; +defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; +defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; +def : Pat<(v2i64 (AArch64saddv V128:$Rn)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; +def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))), + (FADDPv2i32p V64:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))), + (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; +def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))), + (FADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))), + (FMAXNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))), + (FMAXNMPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))), + (FMAXPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))), + (FMAXPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))), + (FMINNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))), + (FMINNMPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))), + (FMINPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))), + (FMINPv2i64p V128:$Rn)>; + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +def DUPv8i8gpr : SIMDDupFromMain<0, {?,?,?,?,1}, ".8b", v8i8, V64, GPR32>; +def DUPv16i8gpr : SIMDDupFromMain<1, {?,?,?,?,1}, ".16b", v16i8, V128, GPR32>; +def DUPv4i16gpr : SIMDDupFromMain<0, {?,?,?,1,0}, ".4h", v4i16, V64, GPR32>; +def DUPv8i16gpr : SIMDDupFromMain<1, {?,?,?,1,0}, ".8h", v8i16, V128, GPR32>; +def DUPv2i32gpr : SIMDDupFromMain<0, {?,?,1,0,0}, ".2s", v2i32, V64, GPR32>; +def DUPv4i32gpr : SIMDDupFromMain<1, {?,?,1,0,0}, ".4s", v4i32, V128, GPR32>; +def DUPv2i64gpr : SIMDDupFromMain<1, {?,1,0,0,0}, ".2d", v2i64, V128, GPR64>; + +def DUPv2i64lane : SIMDDup64FromElement; +def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; +def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; +def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; +def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; +def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; +def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; + +def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))), + (v2f32 (DUPv2i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))), + (v4f32 (DUPv4i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))), + (v2f64 
(DUPv2i64lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), + (i64 0)))>; +def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))), + (v4f16 (DUPv4i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; +def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))), + (v8f16 (DUPv8i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; + +def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), + (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), + (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; + +def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), + (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; + +// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane +// instruction even if the types don't match: we just have to remap the lane +// carefully. N.b. this trick only applies to truncations. +def VecIndex_x2 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(2 * N->getZExtValue(), SDLoc(N), MVT::i64); +}]>; +def VecIndex_x4 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(4 * N->getZExtValue(), SDLoc(N), MVT::i64); +}]>; +def VecIndex_x8 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64); +}]>; + +multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT, + ValueType Src128VT, ValueType ScalVT, + Instruction DUP, SDNodeXForm IdxXFORM> { + def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn), + imm:$idx)))), + (DUP V128:$Rn, (IdxXFORM imm:$idx))>; + + def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn), + imm:$idx)))), + (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; +} + +defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>; +defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>; +defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>; + +defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>; +defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>; +defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>; + +multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP, + SDNodeXForm IdxXFORM> { + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn), + imm:$idx))))), + (DUP V128:$Rn, (IdxXFORM imm:$idx))>; + + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn), + imm:$idx))))), + (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; +} + +defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>; +defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>; +defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>; + +defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>; +defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>; +defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>; + +// SMOV and UMOV definitions, with some extra patterns for convenience +defm SMOV : SMov; +defm UMOV : UMov; + +def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8), + (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg 
(vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8), + (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), + (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))), + (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>; + +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn), + VectorIndexB:$idx)))), i8), + (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), + VectorIndexH:$idx)))), i16), + (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>; + +// Extracting i8 or i16 elements will have the zero-extend transformed to +// an 'and' mask by type legalization since neither i8 nor i16 are legal types +// for AArch64. Match these patterns here since UMOV already zeroes out the high +// bits of the destination register. +def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), + (i32 0xff)), + (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), + (i32 0xffff)), + (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; + +defm INS : SIMDIns; + +def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (i64 FPR64:$Rn), dsub))>; + +def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + +def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; + +def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), + (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi16lane + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0)), + dsub)>; + +def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), + (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), + (INSvi16lane + V128:$Rn, VectorIndexH:$imm, + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0))>; 
+ +def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi32lane + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0)), + dsub)>; +def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (INSvi32lane + V128:$Rn, VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0))>; +def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), + (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), + (INSvi64lane + V128:$Rn, VectorIndexD:$imm, + (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), + (i64 0))>; + +// Copy an element at a constant index in one vector into a constant indexed +// element of another. +// FIXME refactor to a shared class/dev parameterized on vector type, vector +// index type and INS extension +def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane + (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs), + VectorIndexB:$idx2)), + (v16i8 (INSvi8lane + V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2) + )>; +def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane + (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs), + VectorIndexH:$idx2)), + (v8i16 (INSvi16lane + V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2) + )>; +def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane + (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs), + VectorIndexS:$idx2)), + (v4i32 (INSvi32lane + V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2) + )>; +def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane + (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs), + VectorIndexD:$idx2)), + (v2i64 (INSvi64lane + V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) + )>; + +multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, + ValueType VTScal, Instruction INS> { + def : Pat<(VT128 (vector_insert V128:$src, + (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), + imm:$Immd)), + (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>; + + def : Pat<(VT128 (vector_insert V128:$src, + (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), + imm:$Immd)), + (INS V128:$src, imm:$Immd, + (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>; + + def : Pat<(VT64 (vector_insert V64:$src, + (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), + imm:$Immd)), + (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), + imm:$Immd, V128:$Rn, imm:$Immn), + dsub)>; + + def : Pat<(VT64 (vector_insert V64:$src, + (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), + imm:$Immd)), + (EXTRACT_SUBREG + (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd, + (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn), + dsub)>; +} + +defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>; +defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>; +defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>; + + +// Floating point vector extractions are codegen'd as either a sequence of +// subregister extractions, or a MOV (aka CPY here, alias for DUP) if +// the lane number is anything other than zero. 
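+// For example (illustrative only): extracting lane 0 of a v4f32 is just an
+// ssub subregister copy of the source register, whereas extracting lane 1 is
+// selected through the CPYi32 pattern below and prints as "mov s0, v0.s[1]",
+// the element form of DUP.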
+def : Pat<(vector_extract (v2f64 V128:$Rn), 0), + (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; +def : Pat<(vector_extract (v4f32 V128:$Rn), 0), + (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; +def : Pat<(vector_extract (v8f16 V128:$Rn), 0), + (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>; + +def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), + (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>; +def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), + (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>; +def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx), + (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; + +// All concat_vectors operations are canonicalised to act on i64 vectors for +// AArch64. In the general case we need an instruction, which had just as well be +// INS. +class ConcatPat<ValueType DstTy, ValueType SrcTy> + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), + (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; + +def : ConcatPat<v2i64, v1i64>; +def : ConcatPat<v2f64, v1f64>; +def : ConcatPat<v4i32, v2i32>; +def : ConcatPat<v4f32, v2f32>; +def : ConcatPat<v8i16, v4i16>; +def : ConcatPat<v8f16, v4f16>; +def : ConcatPat<v16i8, v8i8>; + +// If the high lanes are undef, though, we can just ignore them: +class ConcatUndefPat<ValueType DstTy, ValueType SrcTy> + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; + +def : ConcatUndefPat<v2i64, v1i64>; +def : ConcatUndefPat<v2f64, v1f64>; +def : ConcatUndefPat<v4i32, v2i32>; +def : ConcatUndefPat<v4f32, v2f32>; +def : ConcatUndefPat<v8i16, v4i16>; +def : ConcatUndefPat<v16i8, v8i8>; + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">; +defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">; +defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">; +defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; +defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; +defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; +defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; +defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; + +// Patterns for across-vector intrinsics, that have a node equivalent, that +// returns a vector (with only the low lane defined) instead of a scalar. +// In effect, opNode is the same as (scalar_to_vector (IntNode)). +multiclass SIMDAcrossLanesIntrinsic<string baseOpc, + SDPatternOperator opNode> { +// If a lane instruction caught the vector_extract around opNode, we can +// directly match the latter to the instruction. 
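+// (For instance, a v8i16-typed AArch64smaxv used directly as a vector becomes
+// "smaxv h0, v0.8h" with the scalar result inserted into an otherwise
+// undefined register; the fallback patterns further down handle the common
+// case where only the low lane is extracted again.)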
+def : Pat<(v8i8 (opNode V64:$Rn)), + (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub)>; +def : Pat<(v16i8 (opNode V128:$Rn)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub)>; +def : Pat<(v4i16 (opNode V64:$Rn)), + (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub)>; +def : Pat<(v8i16 (opNode V128:$Rn)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub)>; +def : Pat<(v4i32 (opNode V128:$Rn)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub)>; + + +// If none did, fallback to the explicit patterns, consuming the vector_extract. +def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)), + (i32 0)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), + bsub), ssub)>; +def : Pat<(i32 (vector_extract (v16i8 (opNode V128:$Rn)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), + bsub), ssub)>; +def : Pat<(i32 (vector_extract (insert_subvector undef, + (v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), + hsub), ssub)>; +def : Pat<(i32 (vector_extract (v8i16 (opNode V128:$Rn)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), + hsub), ssub)>; +def : Pat<(i32 (vector_extract (v4i32 (opNode V128:$Rn)), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), + ssub), ssub)>; + +} + +multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, + SDPatternOperator opNode> + : SIMDAcrossLanesIntrinsic<baseOpc, opNode> { +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef, + (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; +def : Pat<(i32 (sext_inreg (i32 (vector_extract + (opNode (v16i8 V128:$Rn)), (i64 0))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef, + (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (sext_inreg (i32 (vector_extract + (opNode (v8i16 V128:$Rn)), (i64 0))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; +} + +multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, + SDPatternOperator opNode> + : SIMDAcrossLanesIntrinsic<baseOpc, opNode> { +// If there is a masking operation keeping only what has been actually +// generated, consume it. 
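+// (e.g. the "& 0xff" that type legalization wraps around a zero-extended
+// vaddv_u8 result is redundant here: the across-vector instruction writes a
+// B register, which clears the upper bits of the vector register, so the
+// value read back through ssub is already zero-extended.)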
+def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef, + (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; +def : Pat<(i32 (and (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))), + maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; +def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef, + (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))), + maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; +} + +defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(v2i32 (AArch64saddv (v2i32 V64:$Rn))), + (ADDPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", AArch64uaddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(v2i32 (AArch64uaddv (v2i32 V64:$Rn))), + (ADDPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", AArch64smaxv>; +def : Pat<(v2i32 (AArch64smaxv (v2i32 V64:$Rn))), + (SMAXPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", AArch64sminv>; +def : Pat<(v2i32 (AArch64sminv (v2i32 V64:$Rn))), + (SMINPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", AArch64umaxv>; +def : Pat<(v2i32 (AArch64umaxv (v2i32 V64:$Rn))), + (UMAXPv2i32 V64:$Rn, V64:$Rn)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", AArch64uminv>; +def : Pat<(v2i32 (AArch64uminv (v2i32 V64:$Rn))), + (UMINPv2i32 V64:$Rn, V64:$Rn)>; + +multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc, + Intrinsic intOp> { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + 
(!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>; +defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>; + +// The vaddlv_s32 intrinsic gets mapped to SADDLP. +def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (SADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; +// The vaddlv_u32 intrinsic gets mapped to UADDLP. +def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (UADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; + +//------------------------------------------------------------------------------ +// AdvSIMD modified immediate instructions +//------------------------------------------------------------------------------ + +// AdvSIMD BIC +defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>; +// AdvSIMD ORR +defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>; + +def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +// AdvSIMD FMOV +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, + "fmov", ".2d", + [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8, + "fmov", ".2s", + [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8, + "fmov", ".4s", + [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +let Predicates = [HasNEON, HasFullFP16] in { +def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8, + "fmov", ".4h", + 
[(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8, + "fmov", ".8h", + [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +} // Predicates = [HasNEON, HasFullFP16] + +// AdvSIMD MOVI + +// EDIT byte mask: scalar +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", + [(set FPR64:$Rd, simdimmtype10:$imm8)]>; +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 here. +def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)), + (MOVID imm0_255:$shift)>; + +def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; + +def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; + +// EDIT byte mask: 2d + +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 in the pattern +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, + simdimmtype10, + "movi", ".2d", + [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; + + +// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. +// Complexity is added to break a tie with a plain MOVI. +let AddedComplexity = 1 in { +def : Pat<(f32 fpimm0), + (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, + Requires<[HasZCZ]>; +def : Pat<(f64 fpimm0), + (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, + Requires<[HasZCZ]>; +} + +def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; + +def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; + +def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; + +def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64movi_shift 
imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +// Per byte: 8b & 16b +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255, + "movi", ".8b", + [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255, + "movi", ".16b", + [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; + +// AdvSIMD MVNI + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; + +def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let hasSideEffects = 0 in { + defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">; +} + +// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the +// instruction expects the addend first, while the intrinsic expects it last. + +// On the other hand, there are quite a few valid combinatorial options due to +// the commutativity of multiplication and the fact that (-x) * y = x * (-y). 
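+// For example, TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> below places
+// the tied accumulator operand ($LHS, the instruction's first operand) in the
+// final (addend) position of the fma node, matching the intrinsic's order.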
+defm : SIMDFPIndexedTiedPatterns<"FMLA", + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; +defm : SIMDFPIndexedTiedPatterns<"FMLA", + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; + +defm : SIMDFPIndexedTiedPatterns<"FMLS", + TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; +defm : SIMDFPIndexedTiedPatterns<"FMLS", + TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; +defm : SIMDFPIndexedTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; +defm : SIMDFPIndexedTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; + +multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { + // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar + // (DUPLANE from 64-bit would be trivial). 
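+  // (A 64-bit source has only a single f64 lane, so that form reduces to the
+  // DUP-scalar case below.)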
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 (fneg V128:$Rm)), + VectorIndexD:$idx))), + (FMLSv2i64_indexed + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 (fneg FPR64Op:$Rm))))), + (FMLSv2i64_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexS:$idx)>; +} + +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; + +defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>; + +def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv2i32_indexed V64:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv4i32_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), + (FMULv2i64_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), + (i64 0))>; + +defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; +defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", + int_aarch64_neon_smull>; +defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah", + int_aarch64_neon_sqadd>; +defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; +defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", + TriOpFrag<(sub 
node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", + int_aarch64_neon_umull>; + +// A scalar sqdmull with the second operand being a vector lane can be +// handled directly with the indexed instruction encoding. +def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (vector_extract (v4i32 V128:$Vm), + VectorIndexS:$idx)), + (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">; +defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">; +// Codegen patterns for the above. We don't put these directly on the +// instructions because TableGen's type inference can't handle the truth. +// Having the same base pattern for fp <--> int totally freaks it out. +def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), + (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), + (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), + (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), + (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; + +defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>; +defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; +defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; +defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))>>; +defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDScalarRShiftDTied< 0, 
0b00010, "ssra", + TriOpFrag<(add node:$LHS, + (AArch64vashr node:$MHS, node:$RHS))>>; +defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))>>; +defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, + (AArch64vlshr node:$MHS, node:$RHS))>>; + +//---------------------------------------------------------------------------- +// AdvSIMD vector shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; +defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; +defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf", + int_aarch64_neon_vcvtfxs2fp>; +defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", + int_aarch64_neon_rshrn>; +defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; +defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", + BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; +def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftL64:$imm))), + (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; +defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; +def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; +defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))> >; +defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", + BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>; + +defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; +defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", + int_aarch64_neon_vcvtfxu2fp>; +defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))> >; +defm USHLL : 
                 SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
+                BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
+defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
+                TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+
+// SHRN patterns for when a logical right shift was used instead of arithmetic
+// (the immediate guarantees no sign bits actually end up in the result so it
+// doesn't matter).
+def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
+          (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
+          (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
+          (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
+                                 (trunc (AArch64vlshr (v8i16 V128:$Rn),
+                                                      vecshiftR16Narrow:$imm)))),
+          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
+                                 (trunc (AArch64vlshr (v4i32 V128:$Rn),
+                                                      vecshiftR32Narrow:$imm)))),
+          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
+                                 (trunc (AArch64vlshr (v2i64 V128:$Rn),
+                                                      vecshiftR64Narrow:$imm)))),
+          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+// Vector sign and zero extensions are implemented with SSHLL and USHLL.
+// Anyexts are implemented as zexts.
+def : Pat<(v8i16 (sext   (v8i8 V64:$Rn))),  (SSHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+// Also match an extend from the upper half of a 128 bit source register.
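+// (These select the 128-bit-source forms such as SSHLLv16i8_shift, which
+// assemble as sshll2/ushll2, again with a shift amount of 0.)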
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (SSHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (SSHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (SSHLLv4i32_shift V128:$Rn, (i32 0))>; + +// Vector shift sxtl aliases +def : InstAlias<"sxtl.8h $dst, $src1", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.8h, $src1.8b", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.4s $dst, $src1", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.4s, $src1.4h", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.2d $dst, $src1", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.2d, $src1.2s", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift sxtl2 aliases +def : InstAlias<"sxtl2.8h $dst, $src1", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.8h, $src1.16b", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.4s $dst, $src1", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.4s, $src1.8h", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.2d $dst, $src1", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.2d, $src1.4s", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// Vector shift uxtl aliases +def : InstAlias<"uxtl.8h $dst, $src1", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.8h, $src1.8b", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.4s $dst, $src1", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.4s, $src1.4h", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.2d $dst, $src1", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.2d, $src1.2s", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift uxtl2 aliases +def : InstAlias<"uxtl2.8h $dst, $src1", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.8h, $src1.16b", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.4s $dst, $src1", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.4s, $src1.8h", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.2d $dst, $src1", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.2d, $src1.4s", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// These patterns are more complex because floating point loads do not +// support sign extension. 
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cycles.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still be faster.
+// However, this is not good for code size.
+// 8-bits -> float. 2 sizes step-up.
+class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                              (f64
+                                (EXTRACT_SUBREG
+                                  (SSHLLv8i8_shift
+                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        bsub),
+                                    0),
+                                  dsub)),
+                              0),
+                            ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+
+def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+                          (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+                          (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  hsub),
+                                0),
+                            ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+         (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                            (SSHLLv2i32_shift
+                               (f64
+                                (EXTRACT_SUBREG
+                                  (SSHLLv4i16_shift
+                                   (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                    INST,
+                                    hsub),
+                                   0),
+                                 dsub)),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+
+def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
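+// (A single SSHLL widens the loaded word to 64 bits inside the FP/SIMD unit
+// and SCVTFv1i64 then converts it, avoiding the slower GPR -> FPR SCVTF path
+// discussed above.)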
+class SExtLoadi32CVTf64Pat<dag addrmode, dag INST> + : Pat <(f64 (sint_to_fp (i32 (load addrmode)))), + (SCVTFv1i64 (f64 (EXTRACT_SUBREG + (SSHLLv2i32_shift + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + INST, + ssub), + 0), + dsub)))>, Requires<[NotForCodeSize]>; + +def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), + (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; +def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext), + (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>; +def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; +def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset), + (LDURSi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. + + +//---------------------------------------------------------------------------- +// AdvSIMD Load-Store Structure +//---------------------------------------------------------------------------- +defm LD1 : SIMDLd1Multiple<"ld1">; +defm LD2 : SIMDLd2Multiple<"ld2">; +defm LD3 : SIMDLd3Multiple<"ld3">; +defm LD4 : SIMDLd4Multiple<"ld4">; + +defm ST1 : SIMDSt1Multiple<"st1">; +defm ST2 : SIMDSt2Multiple<"st2">; +defm ST3 : SIMDSt3Multiple<"st3">; +defm ST4 : SIMDSt4Multiple<"st4">; + +class Ld1Pat<ValueType ty, Instruction INST> + : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>; + +def : Ld1Pat<v16i8, LD1Onev16b>; +def : Ld1Pat<v8i16, LD1Onev8h>; +def : Ld1Pat<v4i32, LD1Onev4s>; +def : Ld1Pat<v2i64, LD1Onev2d>; +def : Ld1Pat<v8i8, LD1Onev8b>; +def : Ld1Pat<v4i16, LD1Onev4h>; +def : Ld1Pat<v2i32, LD1Onev2s>; +def : Ld1Pat<v1i64, LD1Onev1d>; + +class St1Pat<ValueType ty, Instruction INST> + : Pat<(store ty:$Vt, GPR64sp:$Rn), + (INST ty:$Vt, GPR64sp:$Rn)>; + +def : St1Pat<v16i8, ST1Onev16b>; +def : St1Pat<v8i16, ST1Onev8h>; +def : St1Pat<v4i32, ST1Onev4s>; +def : St1Pat<v2i64, ST1Onev2d>; +def : St1Pat<v8i8, ST1Onev8b>; +def : St1Pat<v4i16, ST1Onev4h>; +def : St1Pat<v2i32, ST1Onev2s>; +def : St1Pat<v1i64, ST1Onev1d>; + +//--- +// Single-element +//--- + +defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; +defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; +defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; +defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; +let mayLoad = 1, hasSideEffects = 0 in { +defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; +defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; +defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; +defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; +defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; +defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; +defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; +defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; +defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; +defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; +defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; +defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; +defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; +defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; +defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, 
GPR64pi16>; +defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; +} + +def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv8b GPR64sp:$Rn)>; +def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv16b GPR64sp:$Rn)>; +def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; +// Grab the floating point version too +def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; +def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; + +class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex, + ValueType VTy, ValueType STy, Instruction LD1> + : Pat<(vector_insert (VTy VecListOne128:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>; + +def : Ld1Lane128Pat<extloadi8, VectorIndexB, v16i8, i32, LD1i8>; +def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>; +def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>; +def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>; +def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>; +def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>; +def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>; + +class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex, + ValueType VTy, ValueType STy, Instruction LD1> + : Pat<(vector_insert (VTy VecListOne64:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (EXTRACT_SUBREG + (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), + VecIndex:$idx, GPR64sp:$Rn), + dsub)>; + +def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>; +def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>; +def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>; +def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>; +def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>; + + +defm LD1 : SIMDLdSt1SingleAliases<"ld1">; +defm LD2 : SIMDLdSt2SingleAliases<"ld2">; +defm LD3 : SIMDLdSt3SingleAliases<"ld3">; +defm LD4 : SIMDLdSt4SingleAliases<"ld4">; + +// Stores +defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; +defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; +defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; +defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; + +let AddedComplexity = 19 in +class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex, + ValueType VTy, ValueType STy, Instruction ST1> + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane128Pat<truncstorei8, VectorIndexB, v16i8, i32, 
ST1i8>; +def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>; +def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>; +def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>; +def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>; +def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>; +def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>; + +let AddedComplexity = 19 in +class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex, + ValueType VTy, ValueType STy, Instruction ST1> + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>; +def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>; +def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>; +def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>; +def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>; + +multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex, + ValueType VTy, ValueType STy, Instruction ST1, + int offset> { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost64Pat<post_truncsti8, VectorIndexB, v8i8, i32, ST1i8_POST, 1>; +defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST, + 2>; +defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>; +defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>; +defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>; +defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>; +defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>; + +multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex, + ValueType VTy, ValueType STy, Instruction ST1, + int offset> { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST, + 1>; +defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST, + 2>; +defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>; +defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>; +defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>; +defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>; +defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>; + +let mayStore = 1, hasSideEffects = 0 in { +defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; +defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; +defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>; +defm ST2 : 
            SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
+}
+
+defm ST1 : SIMDLdSt1SingleAliases<"st1">;
+defm ST2 : SIMDLdSt2SingleAliases<"st2">;
+defm ST3 : SIMDLdSt3SingleAliases<"st3">;
+defm ST4 : SIMDLdSt4SingleAliases<"st4">;
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+def AESErr   : AESTiedInst<0b0100, "aese",   int_aarch64_crypto_aese>;
+def AESDrr   : AESTiedInst<0b0101, "aesd",   int_aarch64_crypto_aesd>;
+def AESMCrr  : AESInst<    0b0110, "aesmc",  int_aarch64_crypto_aesmc>;
+def AESIMCrr : AESInst<    0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+
+def SHA1Crrr     : SHATiedInstQSV<0b000, "sha1c",   int_aarch64_crypto_sha1c>;
+def SHA1Prrr     : SHATiedInstQSV<0b001, "sha1p",   int_aarch64_crypto_sha1p>;
+def SHA1Mrrr     : SHATiedInstQSV<0b010, "sha1m",   int_aarch64_crypto_sha1m>;
+def SHA1SU0rrr   : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
+def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>;
+def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>;
+def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>;
+
+def SHA1Hrr     : SHAInstSS<    0b0000, "sha1h",    int_aarch64_crypto_sha1h>;
+def SHA1SU1rr   : SHATiedInstVV<0b0001, "sha1su1",  int_aarch64_crypto_sha1su1>;
+def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;
+
+//----------------------------------------------------------------------------
+// Compiler-pseudos
+//----------------------------------------------------------------------------
+// FIXME: Like for X86, these should go in their own separate .td file.
+
+// Any instruction that defines a 32-bit result zeroes the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+
+// For an anyext, we don't care what the high bits are, so we can perform an
+// INSERT_SUBREG into an IMPLICIT_DEF.
+def : Pat<(i64 (anyext GPR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and
+// then assert the extension has happened.
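+// (ORRWrs WZR, $src, 0 is the canonical MOV Wd, Ws; the SUBREG_TO_REG wrapper
+// records that the upper 32 bits are already zero, so no separate extension
+// instruction is emitted.)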
+def : Pat<(i64 (zext GPR32:$src)),
+          (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
+
+// To sign extend, we use a signed bitfield move instruction (SBFM) on the
+// containing super-reg.
+def : Pat<(i64 (sext GPR32:$src)),
+   (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                              (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                              (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a imm0_63:$imm)),
+                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// sra patterns have an AddedComplexity of 10, so make sure we have a higher
+// AddedComplexity for the following patterns since we want to match sext + sra
+// patterns before we attempt to match a single sra node.
+let AddedComplexity = 20 in {
+// We support all sext + sra combinations which preserve at least one bit of the
+// original value which is to be sign extended. E.g. we support shifts up to
+// bitwidth-1 bits.
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;
+
+def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;
+
+def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 imm0_31:$imm), 31)>;
+} // AddedComplexity = 20
+
+// To truncate, we can simply extract from a subregister.
+def : Pat<(i32 (trunc GPR64sp:$src)),
+          (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
+
+// __builtin_trap() uses the BRK instruction on AArch64.
+def : Pat<(trap), (BRK 1)>;
+
+// Conversions within AdvSIMD types in the same register size are free.
+// But because we need a consistent lane ordering, in big endian many
+// conversions require one or more REV instructions.
+//
+// Consider a simple memory load followed by a bitconvert then a store.
+//   v0 = load v2i32
+//   v1 = BITCAST v2i32 v0 to v4i16
+//   store v4i16 v1
+//
+// In big endian mode every memory access has an implicit byte swap.
LDR and +// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that +// is, they treat the vector as a sequence of elements to be byte-swapped. +// The two pairs of instructions are fundamentally incompatible. We've decided +// to use LD1/ST1 only to simplify compiler implementation. +// +// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes +// the original code sequence: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = BITCAST v2i32 v1 to v4i16 +// v3 = REV v4i16 v2 (implicit) +// store v4i16 v3 +// +// But this is now broken - the value stored is different to the value loaded +// due to lane reordering. To fix this, on every BITCAST we must perform two +// other REVs: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = REV v2i32 +// v3 = BITCAST v2i32 v2 to v4i16 +// v4 = REV v4i16 +// v5 = REV v4i16 v4 (implicit) +// store v4i16 v5 +// +// This means an extra two instructions, but actually in most cases the two REV +// instructions can be combined into one. For example: +// (REV64_2s (REV64_4h X)) === (REV32_4h X) +// +// There is also no 128-bit REV instruction. This must be synthesized with an +// EXT instruction. +// +// Most bitconverts require some sort of conversion. The only exceptions are: +// a) Identity conversions - vNfX <-> vNiX +// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX +// + +// Natural vector casts (64 bit) +def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; + +def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; + +def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; + +def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; + +// Natural vector casts 
(128 bit) +def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS 
GPR64:$Xn, FPR64)>; +def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), + (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4f16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +} +def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; + +def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), + (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; +def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), + (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; +def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), + (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; + +let Predicates = [IsLE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), + (v1i64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), 
(v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), + (v2i32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), + (v2i32 (REV64v4i16 FPR64:$src))>; +} +def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), + (v4i16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), + (v4f16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), + (v4f16 (REV64v4i16 
FPR64:$src))>; +} + + + +let Predicates = [IsLE] in { +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), + (f64 (REV64v8i8 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; +} +def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), + (v1f64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; +} +def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; 
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), + (v2f32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), + (v2f32 (REV64v4i16 FPR64:$src))>; +} +def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), + (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), (i32 8)))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), + (v2f64 (EXTv16i8 FPR128:$src, + FPR128:$src, (i32 8)))>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), + (v2f64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8f16 
FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), + (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), + (v4f32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), + (v2i64 (EXTv16i8 FPR128:$src, + FPR128:$src, (i32 8)))>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), + (v2i64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; +} +def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), + (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), + (i32 8)))>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), + (v4i32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; +} +def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; 
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), + (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), + (v8i16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), + (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), + (v8f16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), + (v8f16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), + (v8f16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), + (v8f16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), + (v8f16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), + (v8f16 (REV32v8i16 FPR128:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), + (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), + (i32 8)))>; +def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; 
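+// These big-endian patterns use REV16/REV32/REV64 (and EXT for the f128
+// cases) because a bitconvert must behave as if the value had been stored as
+// the source type and reloaded as the destination type, and with big-endian
+// LDR/STR semantics the in-register lane order differs between element sizes;
+// e.g. v4i32 -> v16i8 above is modelled as REV32v16i8.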
+} + +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; + +def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; + +// A 64-bit subvector insert to the first 128-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + +// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 +// or v2f32. +def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), + (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), + (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; +def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), + (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), + (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; + // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, + // so we match on v4f32 here, not v2f32. This will also catch adding + // the low two lanes of a true v4f32 vector. +def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), + (vector_extract (v4f32 FPR128:$Rn), (i64 1))), + (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; + +// Scalar 64-bit shifts in FPR64 registers. +def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; + +// Patterns for nontemporal/no-allocate stores. 
+// We have to resort to tricks to turn a single-input store into a store pair, +// because there is no single-input nontemporal store, only STNP. +let Predicates = [IsLE] in { +let AddedComplexity = 15 in { +class NTStore128Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR128:$Rt), + (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub), + (CPYi64 FPR128:$Rt, (i64 1)), + GPR64sp:$Rn, simm7s8:$offset)>; + +def : NTStore128Pat<v2i64>; +def : NTStore128Pat<v4i32>; +def : NTStore128Pat<v8i16>; +def : NTStore128Pat<v16i8>; + +class NTStore64Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR64:$Rt), + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub), + (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), + GPR64sp:$Rn, simm7s4:$offset)>; + +// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64? +def : NTStore64Pat<v1f64>; +def : NTStore64Pat<v1i64>; +def : NTStore64Pat<v2i32>; +def : NTStore64Pat<v4i16>; +def : NTStore64Pat<v8i8>; + +def : Pat<(nontemporalstore GPR64:$Rt, + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + GPR64sp:$Rn, simm7s4:$offset)>; +} // AddedComplexity=10 +} // Predicates = [IsLE] + +// Tail call return handling. These are all compiler pseudo-instructions, +// so no encoding information or anything like that. +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { + def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; + def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; +} + +def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), + (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>; +def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), + (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; +def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), + (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; + +include "AArch64InstrAtomics.td" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp new file mode 100644 index 0000000..566aa2c --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -0,0 +1,1834 @@ +//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that performs load / store related peephole +// optimizations. This pass should be run after register allocation. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-ldst-opt" + +/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine +/// load / store instructions to form ldp / stp instructions. + +STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); +STATISTIC(NumPostFolded, "Number of post-index updates folded"); +STATISTIC(NumPreFolded, "Number of pre-index updates folded"); +STATISTIC(NumUnscaledPairCreated, + "Number of load/store from unscaled generated"); +STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); +STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); +STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); + +static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", + cl::init(20), cl::Hidden); + +namespace llvm { +void initializeAArch64LoadStoreOptPass(PassRegistry &); +} + +#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" + +namespace { + +typedef struct LdStPairFlags { + // If a matching instruction is found, MergeForward is set to true if the + // merge is to remove the first instruction and replace the second with + // a pair-wise insn, and false if the reverse is true. + bool MergeForward; + + // SExtIdx gives the index of the result of the load pair that must be + // extended. The value of SExtIdx assumes that the paired load produces the + // value in this order: (I, returned iterator), i.e., -1 means no value has + // to be extended, 0 means I, and 1 means the returned iterator. + int SExtIdx; + + LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} + + void setMergeForward(bool V = true) { MergeForward = V; } + bool getMergeForward() const { return MergeForward; } + + void setSExtIdx(int V) { SExtIdx = V; } + int getSExtIdx() const { return SExtIdx; } + +} LdStPairFlags; + +struct AArch64LoadStoreOpt : public MachineFunctionPass { + static char ID; + AArch64LoadStoreOpt() : MachineFunctionPass(ID) { + initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); + } + + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + const AArch64Subtarget *Subtarget; + + // Scan the instructions looking for a load/store that can be combined + // with the current instruction into a load/store pair. + // Return the matching instruction if one is found, else MBB->end(). + MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, + LdStPairFlags &Flags, + unsigned Limit); + + // Scan the instructions looking for a store that writes to the address from + // which the current load instruction reads. Return true if one is found. 
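+ // For example, given
+ //   str w0, [sp, #12]
+ //   ...
+ //   ldrh w1, [sp, #12]
+ // this returns the STR, and promoteLoadFromStore can then rewrite the load
+ // as a bitfield extract (here an AND of the low 16 bits) of w0.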
+ bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI); + + // Merge the two instructions indicated into a single pair-wise instruction. + // If MergeForward is true, erase the first instruction and fold its + // operation into the second. If false, the reverse. Return the instruction + // following the first instruction (which may change during processing). + MachineBasicBlock::iterator + mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags); + + // Promote the load that reads directly from the address stored to. + MachineBasicBlock::iterator + promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI); + + // Scan the instruction list to find a base register update that can + // be combined with the current instruction (a load or store) using + // pre or post indexed addressing with writeback. Scan forwards. + MachineBasicBlock::iterator + findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, + int UnscaledOffset); + + // Scan the instruction list to find a base register update that can + // be combined with the current instruction (a load or store) using + // pre or post indexed addressing with writeback. Scan backwards. + MachineBasicBlock::iterator + findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); + + // Find an instruction that updates the base register of the ld/st + // instruction. + bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + unsigned BaseReg, int Offset); + + // Merge a pre- or post-index base register update into a ld/st instruction. + MachineBasicBlock::iterator + mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, bool IsPreIdx); + + // Find and merge foldable ldr/str instructions. + bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); + + // Find and promote load instructions which read directly from store. + bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); + + // Check if converting two narrow loads into a single wider load with + // bitfield extracts could be enabled. 
+ bool enableNarrowLdMerge(MachineFunction &Fn); + + bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return AARCH64_LOAD_STORE_OPT_NAME; + } +}; +char AArch64LoadStoreOpt::ID = 0; +} // namespace + +INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", + AARCH64_LOAD_STORE_OPT_NAME, false, false) + +static bool isUnscaledLdSt(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + return true; + } +} + +static bool isUnscaledLdSt(MachineInstr *MI) { + return isUnscaledLdSt(MI->getOpcode()); +} + +static unsigned getBitExtrOpcode(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode."); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + return AArch64::UBFMWri; + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + return AArch64::SBFMWri; + } +} + +static bool isNarrowStore(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return true; + } +} + +static bool isNarrowStore(MachineInstr *MI) { + return isNarrowStore(MI->getOpcode()); +} + +static bool isNarrowLoad(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + return true; + } +} + +static bool isNarrowLoad(MachineInstr *MI) { + return isNarrowLoad(MI->getOpcode()); +} + +// Scaling factor for unscaled load or store. 
+static int getMemScale(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Opcode has unknown scale!"); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::STRBBui: + case AArch64::STURBBi: + return 1; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return 2; + case AArch64::LDRSui: + case AArch64::LDURSi: + case AArch64::LDRSWui: + case AArch64::LDURSWi: + case AArch64::LDRWui: + case AArch64::LDURWi: + case AArch64::STRSui: + case AArch64::STURSi: + case AArch64::STRWui: + case AArch64::STURWi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::STPSi: + case AArch64::STPWi: + return 4; + case AArch64::LDRDui: + case AArch64::LDURDi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDPDi: + case AArch64::LDPXi: + case AArch64::STPDi: + case AArch64::STPXi: + return 8; + case AArch64::LDRQui: + case AArch64::LDURQi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::LDPQi: + case AArch64::STPQi: + return 16; + } +} + +static unsigned getMatchingNonSExtOpcode(unsigned Opc, + bool *IsValidLdStrOpc = nullptr) { + if (IsValidLdStrOpc) + *IsValidLdStrOpc = true; + switch (Opc) { + default: + if (IsValidLdStrOpc) + *IsValidLdStrOpc = false; + return UINT_MAX; + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: + case AArch64::STRWui: + case AArch64::STURWi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDRDui: + case AArch64::LDURDi: + case AArch64::LDRQui: + case AArch64::LDURQi: + case AArch64::LDRWui: + case AArch64::LDURWi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRSui: + case AArch64::STURSi: + case AArch64::LDRSui: + case AArch64::LDURSi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: + return Opc; + case AArch64::LDRSWui: + return AArch64::LDRWui; + case AArch64::LDURSWi: + return AArch64::LDURWi; + case AArch64::LDRSBWui: + return AArch64::LDRBBui; + case AArch64::LDRSHWui: + return AArch64::LDRHHui; + case AArch64::LDURSBWi: + return AArch64::LDURBBi; + case AArch64::LDURSHWi: + return AArch64::LDURHHi; + } +} + +static unsigned getMatchingPairOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no pairwise equivalent!"); + case AArch64::STRSui: + case AArch64::STURSi: + return AArch64::STPSi; + case AArch64::STRDui: + case AArch64::STURDi: + return AArch64::STPDi; + case AArch64::STRQui: + case AArch64::STURQi: + return AArch64::STPQi; + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; + case AArch64::STRWui: + case AArch64::STURWi: + return AArch64::STPWi; + case AArch64::STRXui: + case AArch64::STURXi: + return AArch64::STPXi; + case AArch64::LDRSui: + case AArch64::LDURSi: + return AArch64::LDPSi; + case AArch64::LDRDui: + case AArch64::LDURDi: + return AArch64::LDPDi; + case AArch64::LDRQui: + case AArch64::LDURQi: + return AArch64::LDPQi; + case AArch64::LDRWui: + case AArch64::LDURWi: + return AArch64::LDPWi; + 
case AArch64::LDRXui: + case AArch64::LDURXi: + return AArch64::LDPXi; + case AArch64::LDRSWui: + case AArch64::LDURSWi: + return AArch64::LDPSWi; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + +static unsigned isMatchingStore(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + unsigned LdOpc = LoadInst->getOpcode(); + unsigned StOpc = StoreInst->getOpcode(); + switch (LdOpc) { + default: + llvm_unreachable("Unsupported load instruction!"); + case AArch64::LDRBBui: + return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || + StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURBBi: + return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || + StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRHHui: + return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || + StOpc == AArch64::STRXui; + case AArch64::LDURHHi: + return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || + StOpc == AArch64::STURXi; + case AArch64::LDRWui: + return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURWi: + return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRXui: + return StOpc == AArch64::STRXui; + case AArch64::LDURXi: + return StOpc == AArch64::STURXi; + } +} + +static unsigned getPreIndexedOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no pre-indexed equivalent!"); + case AArch64::STRSui: + return AArch64::STRSpre; + case AArch64::STRDui: + return AArch64::STRDpre; + case AArch64::STRQui: + return AArch64::STRQpre; + case AArch64::STRBBui: + return AArch64::STRBBpre; + case AArch64::STRHHui: + return AArch64::STRHHpre; + case AArch64::STRWui: + return AArch64::STRWpre; + case AArch64::STRXui: + return AArch64::STRXpre; + case AArch64::LDRSui: + return AArch64::LDRSpre; + case AArch64::LDRDui: + return AArch64::LDRDpre; + case AArch64::LDRQui: + return AArch64::LDRQpre; + case AArch64::LDRBBui: + return AArch64::LDRBBpre; + case AArch64::LDRHHui: + return AArch64::LDRHHpre; + case AArch64::LDRWui: + return AArch64::LDRWpre; + case AArch64::LDRXui: + return AArch64::LDRXpre; + case AArch64::LDRSWui: + return AArch64::LDRSWpre; + case AArch64::LDPSi: + return AArch64::LDPSpre; + case AArch64::LDPSWi: + return AArch64::LDPSWpre; + case AArch64::LDPDi: + return AArch64::LDPDpre; + case AArch64::LDPQi: + return AArch64::LDPQpre; + case AArch64::LDPWi: + return AArch64::LDPWpre; + case AArch64::LDPXi: + return AArch64::LDPXpre; + case AArch64::STPSi: + return AArch64::STPSpre; + case AArch64::STPDi: + return AArch64::STPDpre; + case AArch64::STPQi: + return AArch64::STPQpre; + case AArch64::STPWi: + return AArch64::STPWpre; + case AArch64::STPXi: + return AArch64::STPXpre; + } +} + +static unsigned getPostIndexedOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no post-indexed wise equivalent!"); + case AArch64::STRSui: + return AArch64::STRSpost; + case AArch64::STRDui: + return AArch64::STRDpost; + case AArch64::STRQui: + return AArch64::STRQpost; + case AArch64::STRBBui: + return AArch64::STRBBpost; + case AArch64::STRHHui: + return AArch64::STRHHpost; + case AArch64::STRWui: + return AArch64::STRWpost; + case AArch64::STRXui: + return AArch64::STRXpost; + 
case AArch64::LDRSui: + return AArch64::LDRSpost; + case AArch64::LDRDui: + return AArch64::LDRDpost; + case AArch64::LDRQui: + return AArch64::LDRQpost; + case AArch64::LDRBBui: + return AArch64::LDRBBpost; + case AArch64::LDRHHui: + return AArch64::LDRHHpost; + case AArch64::LDRWui: + return AArch64::LDRWpost; + case AArch64::LDRXui: + return AArch64::LDRXpost; + case AArch64::LDRSWui: + return AArch64::LDRSWpost; + case AArch64::LDPSi: + return AArch64::LDPSpost; + case AArch64::LDPSWi: + return AArch64::LDPSWpost; + case AArch64::LDPDi: + return AArch64::LDPDpost; + case AArch64::LDPQi: + return AArch64::LDPQpost; + case AArch64::LDPWi: + return AArch64::LDPWpost; + case AArch64::LDPXi: + return AArch64::LDPXpost; + case AArch64::STPSi: + return AArch64::STPSpost; + case AArch64::STPDi: + return AArch64::STPDpost; + case AArch64::STPQi: + return AArch64::STPQpost; + case AArch64::STPWi: + return AArch64::STPWpost; + case AArch64::STPXi: + return AArch64::STPXpost; + } +} + +static bool isPairedLdSt(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + return true; + } +} + +static const MachineOperand &getLdStRegOp(const MachineInstr *MI, + unsigned PairedRegOp = 0) { + assert(PairedRegOp < 2 && "Unexpected register operand idx."); + unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 2 : 1; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 3 : 2; + return MI->getOperand(Idx); +} + +static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); + int LoadSize = getMemScale(LoadInst); + int StoreSize = getMemScale(StoreInst); + int UnscaledStOffset = isUnscaledLdSt(StoreInst) + ? getLdStOffsetOp(StoreInst).getImm() + : getLdStOffsetOp(StoreInst).getImm() * StoreSize; + int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + ? getLdStOffsetOp(LoadInst).getImm() + : getLdStOffsetOp(LoadInst).getImm() * LoadSize; + return (UnscaledStOffset <= UnscaledLdOffset) && + (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); +} + +// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI. 
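+// Keeping both sets of memory operands on the merged instruction preserves
+// the alias information of the original accesses, so later passes such as the
+// post-RA scheduler do not have to treat the new paired or widened access
+// conservatively.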
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, + MachineInstr *Op1) { + assert(MI->memoperands_empty() && "expected a new machineinstr"); + size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) + + (Op1->memoperands_end() - Op1->memoperands_begin()); + + MachineFunction *MF = MI->getParent()->getParent(); + MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); + MachineSDNode::mmo_iterator MemEnd = + std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); + MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); + MI->setMemRefs(MemBegin, MemEnd); +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags) { + MachineBasicBlock::iterator NextI = I; + ++NextI; + // If NextI is the second of the two instructions to be merged, we need + // to skip one further. Either way we merge will invalidate the iterator, + // and we don't need to scan the new instruction, as it's a pairwise + // instruction, which we're not considering for further action anyway. + if (NextI == Paired) + ++NextI; + + int SExtIdx = Flags.getSExtIdx(); + unsigned Opc = + SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); + bool IsUnscaled = isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + + bool MergeForward = Flags.getMergeForward(); + unsigned NewOpc = getMatchingPairOpcode(Opc); + // Insert our new paired instruction after whichever of the paired + // instructions MergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + // Also based on MergeForward is from where we copy the base register operand + // so we get the flags compatible with the input code. + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); + + // Which register is Rt and which is Rt2 depends on the offset order. + MachineInstr *RtMI, *Rt2MI; + if (getLdStOffsetOp(I).getImm() == + getLdStOffsetOp(Paired).getImm() + OffsetStride) { + RtMI = Paired; + Rt2MI = I; + // Here we swapped the assumption made for SExtIdx. + // I.e., we turn ldp I, Paired into ldp Paired, I. + // Update the index accordingly. + if (SExtIdx != -1) + SExtIdx = (SExtIdx + 1) % 2; + } else { + RtMI = I; + Rt2MI = Paired; + } + + int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + + if (isNarrowLoad(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MachineInstr *RtNewDest = MergeForward ? I : Paired; + // When merging small (< 32 bit) loads for big-endian targets, the order of + // the component parts gets swapped. + if (!Subtarget->isLittleEndian()) + std::swap(RtMI, Rt2MI); + // Construct the new load instruction. + MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; + NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + + // Copy MachineMemOperands from the original loads. + concatenateMemOperands(NewMemMI, I, Paired); + + DEBUG( + dbgs() + << "Creating the new load and extract. 
Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG((NewMemMI)->print(dbgs())); + + int Width = getMemScale(I) == 1 ? 8 : 16; + int LSBLow = 0; + int LSBHigh = Width; + int ImmsLow = LSBLow + Width - 1; + int ImmsHigh = LSBHigh + Width - 1; + MachineInstr *ExtDestMI = MergeForward ? Paired : I; + if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { + // Create the bitfield extract for high bits. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + } else { + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + + // Create the bitfield extract for high bits. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + } + DEBUG(dbgs() << " "); + DEBUG((BitExtMI1)->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI2)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + return NextI; + } + + // Construct the new instruction. + MachineInstrBuilder MIB; + if (isNarrowStore(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(I)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + // Copy MachineMemOperands from the original stores. + concatenateMemOperands(MIB, I, Paired); + } else { + // Handle Unscaled + if (IsUnscaled) + OffsetImm /= OffsetStride; + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtMI)) + .addOperand(getLdStRegOp(Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + } + + (void)MIB; + + // FIXME: Do we need/want to copy the mem operands from the source + // instructions? Probably. What uses them after this? + + DEBUG(dbgs() << "Creating pair load/store. 
Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + + if (SExtIdx != -1) { + // Generate the sign extension for the proper result of the ldp. + // I.e., with X1, that would be: + // %W1<def> = KILL %W1, %X1<imp-def> + // %X1<def> = SBFMXri %X1<kill>, 0, 31 + MachineOperand &DstMO = MIB->getOperand(SExtIdx); + // Right now, DstMO has the extended register, since it comes from an + // extended opcode. + unsigned DstRegX = DstMO.getReg(); + // Get the W variant of that register. + unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); + // Update the result of LDP to use the W instead of the X variant. + DstMO.setReg(DstRegW); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + // Make the machine verifier happy by providing a definition for + // the X register. + // Insert this definition right after the generated LDP, i.e., before + // InsertionPoint. + MachineInstrBuilder MIBKill = + BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(TargetOpcode::KILL), DstRegW) + .addReg(DstRegW) + .addReg(DstRegX, RegState::Define); + MIBKill->getOperand(2).setImplicit(); + // Create the sign extension. + MachineInstrBuilder MIBSXTW = + BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::SBFMXri), DstRegX) + .addReg(DstRegX) + .addImm(0) + .addImm(31); + (void)MIBSXTW; + DEBUG(dbgs() << " Extend operand:\n "); + DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); + DEBUG(dbgs() << "\n"); + } else { + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + + return NextI; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI) { + MachineBasicBlock::iterator NextI = LoadI; + ++NextI; + + int LoadSize = getMemScale(LoadI); + int StoreSize = getMemScale(StoreI); + unsigned LdRt = getLdStRegOp(LoadI).getReg(); + unsigned StRt = getLdStRegOp(StoreI).getReg(); + bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); + + assert((IsStoreXReg || + TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && + "Unexpected RegClass"); + + MachineInstr *BitExtMI; + if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { + // Remove the load, if the destination register of the loads is the same + // register for stored value. + if (StRt == LdRt && LoadSize == 8) { + DEBUG(dbgs() << "Remove load instruction:\n "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << "\n"); + LoadI->eraseFromParent(); + return NextI; + } + // Replace the load with a mov if the load and store are in the same size. + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) + .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) + .addReg(StRt) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // FIXME: Currently we disable this transformation in big-endian targets as + // performance and correctness are verified only in little-endian. + if (!Subtarget->isLittleEndian()) + return NextI; + bool IsUnscaled = isUnscaledLdSt(LoadI); + assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + assert(LoadSize <= StoreSize && "Invalid load size"); + int UnscaledLdOffset = IsUnscaled + ? 
getLdStOffsetOp(LoadI).getImm() + : getLdStOffsetOp(LoadI).getImm() * LoadSize; + int UnscaledStOffset = IsUnscaled + ? getLdStOffsetOp(StoreI).getImm() + : getLdStOffsetOp(StoreI).getImm() * StoreSize; + int Width = LoadSize * 8; + int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + int Imms = Immr + Width - 1; + unsigned DestReg = IsStoreXReg + ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, + &AArch64::GPR64RegClass) + : LdRt; + + assert((UnscaledLdOffset >= UnscaledStOffset && + (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && + "Invalid offset"); + + Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + Imms = Immr + Width - 1; + if (UnscaledLdOffset == UnscaledStOffset) { + uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N + | ((Immr) << 6) // immr + | ((Imms) << 0) // imms + ; + + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), + DestReg) + .addReg(StRt) + .addImm(AndMaskEncoded); + } else { + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), + DestReg) + .addReg(StRt) + .addImm(Immr) + .addImm(Imms); + } + } + + DEBUG(dbgs() << "Promoting load by replacing :\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + LoadI->eraseFromParent(); + return NextI; +} + +/// trackRegDefsUses - Remember what registers the specified instruction uses +/// and modifies. +static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, + BitVector &UsedRegs, + const TargetRegisterInfo *TRI) { + for (const MachineOperand &MO : MI->operands()) { + if (MO.isRegMask()) + ModifiedRegs.setBitsNotInMask(MO.getRegMask()); + + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isDef()) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + ModifiedRegs.set(*AI); + } else { + assert(MO.isUse() && "Reg operand not a def and not a use?!?"); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UsedRegs.set(*AI); + } + } +} + +static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + if (IsUnscaled) + Offset /= OffsetStride; + + return Offset <= 63 && Offset >= -64; +} + +// Do alignment, specialized to power of 2 and for signed ints, +// avoiding having to do a C-style cast from uint_64t to int when +// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. +// FIXME: Move this function to include/MathExtras.h? +static int alignTo(int Num, int PowOf2) { + return (Num + PowOf2 - 1) & ~(PowOf2 - 1); +} + +static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb, + const AArch64InstrInfo *TII) { + // One of the instructions must modify memory. + if (!MIa->mayStore() && !MIb->mayStore()) + return false; + + // Both instructions must be memory operations. 
+ if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore()) + return false; + + return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); +} + +static bool mayAlias(MachineInstr *MIa, + SmallVectorImpl<MachineInstr *> &MemInsns, + const AArch64InstrInfo *TII) { + for (auto &MIb : MemInsns) + if (mayAlias(MIa, MIb, TII)) + return true; + + return false; +} + +bool AArch64LoadStoreOpt::findMatchingStore( + MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI) { + MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + + // Track which registers have been modified and used between the first insn + // and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + + for (unsigned Count = 0; MBBI != E && Count < Limit;) { + --MBBI; + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + // Now that we know this is a real instruction, count it. + ++Count; + + // If the load instruction reads directly from the address to which the + // store instruction writes and the stored value is not modified, we can + // promote the load. Since we do not handle stores with pre-/post-index, + // it's unnecessary to check if BaseReg is modified by the store itself. + if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + BaseReg == getLdStBaseOp(MI).getReg() && + isLdOffsetInRangeOfSt(FirstMI, MI) && + !ModifiedRegs[getLdStRegOp(MI).getReg()]) { + StoreI = MBBI; + return true; + } + + if (MI->isCall()) + return false; + + // Update modified / uses register lists. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return false; + + // If we encounter a store aliased with the load, return early. + if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + return false; + } + return false; +} + +/// findMatchingInsn - Scan the instructions looking for a load/store that can +/// be combined with the current instruction into a load/store pair. +MachineBasicBlock::iterator +AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, + LdStPairFlags &Flags, unsigned Limit) { + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + ++MBBI; + + unsigned Opc = FirstMI->getOpcode(); + bool MayLoad = FirstMI->mayLoad(); + bool IsUnscaled = isUnscaledLdSt(FirstMI); + unsigned Reg = getLdStRegOp(FirstMI).getReg(); + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + int Offset = getLdStOffsetOp(FirstMI).getImm(); + bool IsNarrowStore = isNarrowStore(Opc); + + // For narrow stores, find only the case where the stored value is WZR. + if (IsNarrowStore && Reg != AArch64::WZR) + return E; + + // Early exit if the first instruction modifies the base register. + // e.g., ldr x0, [x0] + if (FirstMI->modifiesRegister(BaseReg, TRI)) + return E; + + // Early exit if the offset if not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + int OffsetStride = IsUnscaled ? 
getMemScale(FirstMI) : 1; + if (!(isNarrowLoad(Opc) || IsNarrowStore) && + !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + + // Remember any instructions that read/write memory between FirstMI and MI. + SmallVector<MachineInstr *, 4> MemInsns; + + for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + bool CanMergeOpc = Opc == MI->getOpcode(); + Flags.setSExtIdx(-1); + if (!CanMergeOpc) { + bool IsValidLdStrOpc; + unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); + // Opc will be the first instruction in the pair. + Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); + CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); + } + + if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { + assert(MI->mayLoadOrStore() && "Expected memory operation."); + // If we've found another instruction with the same opcode, check to see + // if the base and offset are compatible with our starting instruction. + // These instructions all have scaled immediate operands, so we just + // check for +1/-1. Make sure to check the new instruction offset is + // actually an immediate and not a symbolic reference destined for + // a relocation. + // + // Pairwise instructions have a 7-bit signed offset field. Single insns + // have a 12-bit unsigned offset field. To be a valid combine, the + // final offset must be in range. + unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); + int MIOffset = getLdStOffsetOp(MI).getImm(); + if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || + (Offset + OffsetStride == MIOffset))) { + int MinOffset = Offset < MIOffset ? Offset : MIOffset; + // If this is a volatile load/store that otherwise matched, stop looking + // as something is going on that we don't have enough information to + // safely transform. Similarly, stop if we see a hint to avoid pairs. + if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + return E; + // If the resultant immediate offset of merging these instructions + // is out of range for a pairwise instruction, bail and keep looking. + bool MIIsUnscaled = isUnscaledLdSt(MI); + bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); + if (!IsNarrowLoad && + !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } + + if (IsNarrowLoad || IsNarrowStore) { + // If the alignment requirements of the scaled wide load/store + // instruction can't express the offset of the scaled narrow + // input, bail and keep looking. + if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } + } else { + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. 
+ if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } + } + // If the destination register of the loads is the same register, bail + // and keep looking. A load-pair instruction with both destination + // registers the same is UNPREDICTABLE and will result in an exception. + // For narrow stores, allow only when the stored value is the same + // (i.e., WZR). + if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || + (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } + + // If the Rt of the second instruction was not modified or used between + // the two instructions and none of the instructions between the second + // and first alias with the second, we can combine the second into the + // first. + if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && + !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && + !mayAlias(MI, MemInsns, TII)) { + Flags.setMergeForward(false); + return MBBI; + } + + // Likewise, if the Rt of the first instruction is not modified or used + // between the two instructions and none of the instructions between the + // first and the second alias with the first, we can combine the first + // into the second. + if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && + !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && + !mayAlias(FirstMI, MemInsns, TII)) { + Flags.setMergeForward(true); + return MBBI; + } + // Unable to combine these instructions due to interference in between. + // Keep looking. + } + } + + // If the instruction wasn't a matching load or store. Stop searching if we + // encounter a call instruction that might modify memory. + if (MI->isCall()) + return E; + + // Update modified / uses register lists. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return E; + + // Update list of instructions that read/write memory. + if (MI->mayLoadOrStore()) + MemInsns.push_back(MI); + } + return E; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, + bool IsPreIdx) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && + "Unexpected base register update instruction to merge!"); + MachineBasicBlock::iterator NextI = I; + // Return the instruction following the merged instruction, which is + // the instruction following our unmerged load. Unless that's the add/sub + // instruction we're merging, in which case it's the one after that. + if (++NextI == Update) + ++NextI; + + int Value = Update->getOperand(2).getImm(); + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); + if (Update->getOpcode() == AArch64::SUBXri) + Value = -Value; + + unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) + : getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB; + if (!isPairedLdSt(I)) { + // Non-paired instruction. + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value); + } else { + // Paired instruction. 
+ int Scale = getMemScale(I); + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I, 0)) + .addOperand(getLdStRegOp(I, 1)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value / Scale); + } + (void)MIB; + + if (IsPreIdx) + DEBUG(dbgs() << "Creating pre-indexed load/store."); + else + DEBUG(dbgs() << "Creating post-indexed load/store."); + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Update->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, + MachineInstr *MI, + unsigned BaseReg, int Offset) { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBXri: + // Negate the offset for a SUB instruction. + Offset *= -1; + // FALLTHROUGH + case AArch64::ADDXri: + // Make sure it's a vanilla immediate operand, not a relocation or + // anything else we can't handle. + if (!MI->getOperand(2).isImm()) + break; + // Watch out for 1 << 12 shifted value. + if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) + break; + + // The update instruction source and destination register must be the + // same as the load/store base register. + if (MI->getOperand(0).getReg() != BaseReg || + MI->getOperand(1).getReg() != BaseReg) + break; + + bool IsPairedInsn = isPairedLdSt(MemMI); + int UpdateOffset = MI->getOperand(2).getImm(); + // For non-paired load/store instructions, the immediate must fit in a + // signed 9-bit integer. + if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + break; + + // For paired load/store instructions, the immediate must be a multiple of + // the scaling factor. The scaled offset must also fit into a signed 7-bit + // integer. + if (IsPairedInsn) { + int Scale = getMemScale(MemMI); + if (UpdateOffset % Scale != 0) + break; + + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > 64 || ScaledOffset < -64) + break; + } + + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; + break; + } + return false; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( + MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr *MemMI = I; + MachineBasicBlock::iterator MBBI = I; + + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); + + // Scan forward looking for post-index opportunities. Updating instructions + // can't be formed if the memory instruction doesn't have the offset we're + // looking for. + if (MIUnscaledOffset != UnscaledOffset) + return E; + + // If the base register overlaps a destination register, we can't + // merge the update. + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. 
+ BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + ++MBBI; + for (unsigned Count = 0; MBBI != E; ++MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is used or modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) + return E; + } + return E; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( + MachineBasicBlock::iterator I, unsigned Limit) { + MachineBasicBlock::iterator B = I->getParent()->begin(); + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr *MemMI = I; + MachineBasicBlock::iterator MBBI = I; + + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int Offset = getLdStOffsetOp(MemMI).getImm(); + + // If the load/store is the first instruction in the block, there's obviously + // not any matching update. Ditto if the memory offset isn't zero. + if (MBBI == B || Offset != 0) + return E; + // If the base register overlaps a destination register, we can't + // merge the update. + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + --MBBI; + for (unsigned Count = 0; MBBI != B; --MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is used or modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) + return E; + } + return E; +} + +bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + // If this is a volatile load, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm. + // FIXME: It is possible to extend it to handle reg+reg cases. + if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Look backward up to ScanLimit instructions. + MachineBasicBlock::iterator StoreI; + if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + ++NumLoadsFromStoresPromoted; + // Promote the load. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. 
+ MBBI = promoteLoadFromStore(MBBI, StoreI);
+ return true;
+ }
+ return false;
+}
+
+bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+ MachineBasicBlock::iterator &MBBI) {
+ MachineInstr *MI = MBBI;
+ MachineBasicBlock::iterator E = MI->getParent()->end();
+ // If this is a volatile load/store, don't mess with it.
+ if (MI->hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm (as opposed to an address reloc).
+ if (!getLdStOffsetOp(MI).isImm())
+ return false;
+
+ // Check if this load/store has a hint to avoid pair formation.
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+ if (TII->isLdStPairSuppressed(MI))
+ return false;
+
+ // Look ahead up to ScanLimit instructions for a pairable instruction.
+ LdStPairFlags Flags;
+ MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+ if (Paired != E) {
+ if (isNarrowLoad(MI)) {
+ ++NumNarrowLoadsPromoted;
+ } else if (isNarrowStore(MI)) {
+ ++NumZeroStoresPromoted;
+ } else {
+ ++NumPairCreated;
+ if (isUnscaledLdSt(MI))
+ ++NumUnscaledPairCreated;
+ }
+
+ // Merge the loads into a pair. Keeping the iterator straight is a
+ // pain, so we let the merge routine tell us what the next instruction
+ // is after it's done mucking about.
+ MBBI = mergePairedInsns(MBBI, Paired, Flags);
+ return true;
+ }
+ return false;
+}
+
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+ bool enableNarrowLdOpt) {
+ bool Modified = false;
+ // Four transformations to do here:
+ // 1) Find loads that directly read from stores and promote them by
+ // replacing with mov instructions. If the store is wider than the load,
+ // the load will be replaced with a bitfield extract.
+ // e.g.,
+ // str w1, [x0, #4]
+ // ldrh w2, [x0, #6]
+ // ; becomes
+ // str w1, [x0, #4]
+ // lsr w2, w1, #16
+ // 2) Find narrow loads that can be converted into a single wider load
+ // with bitfield extract instructions.
+ // e.g.,
+ // ldrh w0, [x2]
+ // ldrh w1, [x2, #2]
+ // ; becomes
+ // ldr w0, [x2]
+ // ubfx w1, w0, #16, #16
+ // and w0, w0, #ffff
+ // 3) Find loads and stores that can be merged into a single load or store
+ // pair instruction.
+ // e.g.,
+ // ldr x0, [x2]
+ // ldr x1, [x2, #8]
+ // ; becomes
+ // ldp x0, x1, [x2]
+ // 4) Find base register updates that can be merged into the load or store
+ // as a base-reg writeback.
+ // e.g.,
+ // ldr x0, [x2]
+ // add x2, x2, #4
+ // ; becomes
+ // ldr x0, [x2], #4
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi: {
+ if (tryToPromoteLoadFromStore(MBBI)) {
+ Modified = true;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ enableNarrowLdOpt && MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSHWui:
+ case AArch64::STRBBui:
+ case AArch64::STRHHui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::STURBBi: + case AArch64::STURHHi: { + if (tryToMergeLdStInst(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + case AArch64::LDRSWui: + // Unscaled instructions. + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: { + if (tryToMergeLdStInst(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + // Do update merging. It's simpler to keep this separate from the above + // switch, though not strictly necessary. + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + case AArch64::LDRHHui: + case AArch64::LDRBBui: + // Unscaled instructions. + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + // Paired instructions. + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: { + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!getLdStOffsetOp(MI).isImm()) { + ++MBBI; + break; + } + // Look forward to try to form a post-index instruction. For example, + // ldr x0, [x20] + // add x20, x20, #32 + // merged into: + // ldr x0, [x20], #32 + MachineBasicBlock::iterator Update = + findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); + Modified = true; + ++NumPostFolded; + break; + } + // Don't know how to handle pre/post-index versions, so move to the next + // instruction. + if (isUnscaledLdSt(Opc)) { + ++MBBI; + break; + } + + // Look back to try to find a pre-index instruction. For example, + // add x0, x0, #8 + // ldr x1, [x0] + // merged into: + // ldr x1, [x0, #8]! + Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); + if (Update != E) { + // Merge the update into the ld/st. 
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+ Modified = true;
+ ++NumPreFolded;
+ break;
+ }
+ // The immediate in the load/store is scaled by the size of the memory
+ // operation. The immediate in the add we're looking for,
+ // however, is not, so adjust here.
+ int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
+
+ // Look forward to try to find a pre-index instruction. For example,
+ // ldr x1, [x0, #64]
+ // add x0, x0, #64
+ // merged into:
+ // ldr x1, [x0, #64]!
+ Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
+ if (Update != E) {
+ // Merge the update into the ld/st.
+ MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+ Modified = true;
+ ++NumPreFolded;
+ break;
+ }
+
+ // Nothing found. Just move to the next instruction.
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ return Modified;
+}
+
+bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
+ bool ProfitableArch = Subtarget->isCortexA57();
+ // FIXME: The benefit from converting narrow loads into a wider load could be
+ // microarchitectural as it assumes that a single load with two bitfield
+ // extracts is cheaper than two narrow loads. Currently, this conversion is
+ // enabled only in cortex-a57 on which performance benefits were verified.
+ return ProfitableArch && !Subtarget->requiresStrictAlign();
+}
+
+bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+ Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+ TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+ TRI = Subtarget->getRegisterInfo();
+
+ bool Modified = false;
+ bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+ for (auto &MBB : Fn)
+ Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
+
+ return Modified;
+}
+
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
+// loads and stores near one another?
+
+/// createAArch64LoadStoreOptimizationPass - returns an instance of the
+/// load / store optimization pass.
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
+  return new AArch64LoadStoreOpt();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
new file mode 100644
index 0000000..2b4cdf1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -0,0 +1,215 @@
+//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AArch64 MachineInstrs to their corresponding
+// MCInst records.
+// +//===----------------------------------------------------------------------===// + +#include "AArch64MCInstLower.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +extern cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration; + +AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer) + : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} + +MCSymbol * +AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { + return Printer.getSymbol(MO.getGlobal()); +} + +MCSymbol * +AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const { + return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); +} + +MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, + MCSymbol *Sym) const { + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_GOTPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; + else + llvm_unreachable("Unexpected target flags with MO_GOT on GV operand"); + } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_TLVPPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; + else + llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); + } else { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_PAGEOFF; + } + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + return MCOperand::createExpr(Expr); +} + +MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const { + uint32_t RefFlags = 0; + + if (MO.getTargetFlags() & AArch64II::MO_GOT) + RefFlags |= AArch64MCExpr::VK_GOT; + else if (MO.getTargetFlags() & AArch64II::MO_TLS) { + TLSModel::Model Model; + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + Model = Printer.TM.getTLSModel(GV); + if (!EnableAArch64ELFLocalDynamicTLSGeneration && + Model == TLSModel::LocalDynamic) + Model = TLSModel::GeneralDynamic; + + } else { + assert(MO.isSymbol() && + StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && + "unexpected external TLS symbol"); + // The general dynamic access sequence is used to get the + // address of _TLS_MODULE_BASE_. 
+ Model = TLSModel::GeneralDynamic; + } + switch (Model) { + case TLSModel::InitialExec: + RefFlags |= AArch64MCExpr::VK_GOTTPREL; + break; + case TLSModel::LocalExec: + RefFlags |= AArch64MCExpr::VK_TPREL; + break; + case TLSModel::LocalDynamic: + RefFlags |= AArch64MCExpr::VK_DTPREL; + break; + case TLSModel::GeneralDynamic: + RefFlags |= AArch64MCExpr::VK_TLSDESC; + break; + } + } else { + // No modifier means this is a generic reference, classified as absolute for + // the cases where it matters (:abs_g0: etc). + RefFlags |= AArch64MCExpr::VK_ABS; + } + + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefFlags |= AArch64MCExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefFlags |= AArch64MCExpr::VK_PAGEOFF; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3) + RefFlags |= AArch64MCExpr::VK_G3; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2) + RefFlags |= AArch64MCExpr::VK_G2; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1) + RefFlags |= AArch64MCExpr::VK_G1; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0) + RefFlags |= AArch64MCExpr::VK_G0; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_HI12) + RefFlags |= AArch64MCExpr::VK_HI12; + + if (MO.getTargetFlags() & AArch64II::MO_NC) + RefFlags |= AArch64MCExpr::VK_NC; + + const MCExpr *Expr = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + + AArch64MCExpr::VariantKind RefKind; + RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags); + Expr = AArch64MCExpr::create(Expr, RefKind, Ctx); + + return MCOperand::createExpr(Expr); +} + +MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + if (TargetTriple.isOSDarwin()) + return lowerSymbolOperandDarwin(MO, Sym); + + assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); + return lowerSymbolOperandELF(MO, Sym); +} + +bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) const { + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. + if (MO.isImplicit()) + return false; + MCOp = MCOperand::createReg(MO.getReg()); + break; + case MachineOperand::MO_RegisterMask: + // Regmasks are like implicit defs. 
+ return false; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr( + MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: + MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); + break; + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); + break; + case MachineOperand::MO_MCSymbol: + MCOp = LowerSymbolOperand(MO, MO.getMCSymbol()); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = LowerSymbolOperand( + MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); + break; + } + return true; +} + +void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (const MachineOperand &MO : MI->operands()) { + MCOperand MCOp; + if (lowerOperand(MO, MCOp)) + OutMI.addOperand(MCOp); + } +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h new file mode 100644 index 0000000..1e29b80 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h @@ -0,0 +1,52 @@ +//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H + +#include "llvm/ADT/Triple.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { +class AsmPrinter; +class MCAsmInfo; +class MCContext; +class MCInst; +class MCOperand; +class MCSymbol; +class MachineInstr; +class MachineModuleInfoMachO; +class MachineOperand; +class Mangler; + +/// AArch64MCInstLower - This class is used to lower an MachineInstr +/// into an MCInst. +class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower { + MCContext &Ctx; + AsmPrinter &Printer; + Triple TargetTriple; + +public: + AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer); + + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO, + MCSymbol *Sym) const; + MCOperand lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + + MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; + MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h new file mode 100644 index 0000000..318f839 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -0,0 +1,172 @@ +//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file declares AArch64-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" + +namespace llvm { + +/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and +/// contains private AArch64-specific information for each MachineFunction. +class AArch64FunctionInfo : public MachineFunctionInfo { + + /// Number of bytes of arguments this function has on the stack. If the callee + /// is expected to restore the argument stack this should be a multiple of 16, + /// all usable during a tail call. + /// + /// The alternative would forbid tail call optimisation in some cases: if we + /// want to transfer control from a function with 8-bytes of stack-argument + /// space to a function with 16-bytes then misalignment of this value would + /// make a stack adjustment necessary, which could not be undone by the + /// callee. + unsigned BytesInStackArgArea; + + /// The number of bytes to restore to deallocate space for incoming + /// arguments. Canonically 0 in the C calling convention, but non-zero when + /// callee is expected to pop the args. + unsigned ArgumentStackToRestore; + + /// HasStackFrame - True if this function has a stack frame. Set by + /// determineCalleeSaves(). + bool HasStackFrame; + + /// \brief Amount of stack frame size, not including callee-saved registers. + unsigned LocalStackSize; + + /// \brief Number of TLS accesses using the special (combinable) + /// _TLS_MODULE_BASE_ symbol. + unsigned NumLocalDynamicTLSAccesses; + + /// \brief FrameIndex for start of varargs area for arguments passed on the + /// stack. + int VarArgsStackIndex; + + /// \brief FrameIndex for start of varargs area for arguments passed in + /// general purpose registers. + int VarArgsGPRIndex; + + /// \brief Size of the varargs area for arguments passed in general purpose + /// registers. + unsigned VarArgsGPRSize; + + /// \brief FrameIndex for start of varargs area for arguments passed in + /// floating-point registers. + int VarArgsFPRIndex; + + /// \brief Size of the varargs area for arguments passed in floating-point + /// registers. + unsigned VarArgsFPRSize; + + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. 
+ bool IsSplitCSR; + +public: + AArch64FunctionInfo() + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) {} + + explicit AArch64FunctionInfo(MachineFunction &MF) + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) { + (void)MF; + } + + unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } + void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } + + unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } + void setArgumentStackToRestore(unsigned bytes) { + ArgumentStackToRestore = bytes; + } + + bool hasStackFrame() const { return HasStackFrame; } + void setHasStackFrame(bool s) { HasStackFrame = s; } + + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } + unsigned getLocalStackSize() const { return LocalStackSize; } + + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } + unsigned getNumLocalDynamicTLSAccesses() const { + return NumLocalDynamicTLSAccesses; + } + + int getVarArgsStackIndex() const { return VarArgsStackIndex; } + void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } + + int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } + void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } + + unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } + void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } + + int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } + void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } + + unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } + void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } + + typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions; + + const SetOfInstructions &getLOHRelated() const { return LOHRelated; } + + // Shortcuts for LOH related types. + class MILOHDirective { + MCLOHType Kind; + + /// Arguments of this directive. Order matters. + SmallVector<const MachineInstr *, 3> Args; + + public: + typedef SmallVectorImpl<const MachineInstr *> LOHArgs; + + MILOHDirective(MCLOHType Kind, const LOHArgs &Args) + : Kind(Kind), Args(Args.begin(), Args.end()) { + assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); + } + + MCLOHType getKind() const { return Kind; } + const LOHArgs &getArgs() const { return Args; } + }; + + typedef MILOHDirective::LOHArgs MILOHArgs; + typedef SmallVector<MILOHDirective, 32> MILOHContainer; + + const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } + + /// Add a LOH directive of this @p Kind and this @p Args. + void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { + LOHContainerSet.push_back(MILOHDirective(Kind, Args)); + LOHRelated.insert(Args.begin(), Args.end()); + } + +private: + // Hold the lists of LOHs. 
+ MILOHContainer LOHContainerSet; + SetOfInstructions LOHRelated; +}; +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp new file mode 100644 index 0000000..5394875 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -0,0 +1,383 @@ +//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file contains the AArch64 / Cortex-A57 specific register allocation +// constraints for use by the PBQP register allocator. +// +// It is essentially a transcription of what is contained in +// AArch64A57FPLoadBalancing, which tries to use a balanced +// mix of odd and even D-registers when performing a critical sequence of +// independent, non-quadword FP/ASIMD floating-point multiply-accumulates. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "aarch64-pbqp" + +#include "AArch64.h" +#include "AArch64PBQPRegAlloc.h" +#include "AArch64RegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocPBQP.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +#ifndef NDEBUG +bool isFPReg(unsigned reg) { + return AArch64::FPR32RegClass.contains(reg) || + AArch64::FPR64RegClass.contains(reg) || + AArch64::FPR128RegClass.contains(reg); +} +#endif + +bool isOdd(unsigned reg) { + switch (reg) { + default: + llvm_unreachable("Register is not from the expected class !"); + case AArch64::S1: + case AArch64::S3: + case AArch64::S5: + case AArch64::S7: + case AArch64::S9: + case AArch64::S11: + case AArch64::S13: + case AArch64::S15: + case AArch64::S17: + case AArch64::S19: + case AArch64::S21: + case AArch64::S23: + case AArch64::S25: + case AArch64::S27: + case AArch64::S29: + case AArch64::S31: + case AArch64::D1: + case AArch64::D3: + case AArch64::D5: + case AArch64::D7: + case AArch64::D9: + case AArch64::D11: + case AArch64::D13: + case AArch64::D15: + case AArch64::D17: + case AArch64::D19: + case AArch64::D21: + case AArch64::D23: + case AArch64::D25: + case AArch64::D27: + case AArch64::D29: + case AArch64::D31: + case AArch64::Q1: + case AArch64::Q3: + case AArch64::Q5: + case AArch64::Q7: + case AArch64::Q9: + case AArch64::Q11: + case AArch64::Q13: + case AArch64::Q15: + case AArch64::Q17: + case AArch64::Q19: + case AArch64::Q21: + case AArch64::Q23: + case AArch64::Q25: + case AArch64::Q27: + case AArch64::Q29: + case AArch64::Q31: + return true; + case AArch64::S0: + case AArch64::S2: + case AArch64::S4: + case AArch64::S6: + case AArch64::S8: + case AArch64::S10: + case AArch64::S12: + case AArch64::S14: + case AArch64::S16: + case AArch64::S18: + case AArch64::S20: + case AArch64::S22: + case AArch64::S24: + case AArch64::S26: + case AArch64::S28: + case AArch64::S30: + case AArch64::D0: + case AArch64::D2: + case AArch64::D4: + case AArch64::D6: + case AArch64::D8: + case AArch64::D10: + case AArch64::D12: + case AArch64::D14: + case AArch64::D16: + case AArch64::D18: 
+ case AArch64::D20: + case AArch64::D22: + case AArch64::D24: + case AArch64::D26: + case AArch64::D28: + case AArch64::D30: + case AArch64::Q0: + case AArch64::Q2: + case AArch64::Q4: + case AArch64::Q6: + case AArch64::Q8: + case AArch64::Q10: + case AArch64::Q12: + case AArch64::Q14: + case AArch64::Q16: + case AArch64::Q18: + case AArch64::Q20: + case AArch64::Q22: + case AArch64::Q24: + case AArch64::Q26: + case AArch64::Q28: + case AArch64::Q30: + return false; + + } +} + +bool haveSameParity(unsigned reg1, unsigned reg2) { + assert(isFPReg(reg1) && "Expecting an FP register for reg1"); + assert(isFPReg(reg2) && "Expecting an FP register for reg2"); + + return isOdd(reg1) == isOdd(reg2); +} + +} + +bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, + unsigned Ra) { + if (Rd == Ra) + return false; + + LiveIntervals &LIs = G.getMetadata().LIS; + + if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) { + DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd) + << '\n'); + DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra) + << '\n'); + return false; + } + + PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd); + PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(Ra); + + const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed = + &G.getNodeMetadata(node1).getAllowedRegs(); + const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRaAllowed = + &G.getNodeMetadata(node2).getAllowedRegs(); + + PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2); + + // The edge does not exist. Create one with the appropriate interference + // costs. + if (edge == G.invalidEdgeId()) { + const LiveInterval &ld = LIs.getInterval(Rd); + const LiveInterval &la = LIs.getInterval(Ra); + bool livesOverlap = ld.overlaps(la); + + PBQPRAGraph::RawMatrix costs(vRdAllowed->size() + 1, + vRaAllowed->size() + 1, 0); + for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) { + unsigned pRd = (*vRdAllowed)[i]; + for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (livesOverlap && TRI->regsOverlap(pRd, pRa)) + costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity(); + else + costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 
0.0 : 1.0; + } + } + G.addEdge(node1, node2, std::move(costs)); + return true; + } + + if (G.getEdgeNode1Id(edge) == node2) { + std::swap(node1, node2); + std::swap(vRdAllowed, vRaAllowed); + } + + // Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass)) + PBQPRAGraph::RawMatrix costs(G.getEdgeCosts(edge)); + for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) { + unsigned pRd = (*vRdAllowed)[i]; + + // Get the maximum cost (excluding unallocatable reg) for same parity + // registers + PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min(); + for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (haveSameParity(pRd, pRa)) + if (costs[i + 1][j + 1] != + std::numeric_limits<PBQP::PBQPNum>::infinity() && + costs[i + 1][j + 1] > sameParityMax) + sameParityMax = costs[i + 1][j + 1]; + } + + // Ensure all registers with a different parity have a higher cost + // than sameParityMax + for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (!haveSameParity(pRd, pRa)) + if (sameParityMax > costs[i + 1][j + 1]) + costs[i + 1][j + 1] = sameParityMax + 1.0; + } + } + G.updateEdgeCosts(edge, std::move(costs)); + + return true; +} + +void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, + unsigned Ra) { + LiveIntervals &LIs = G.getMetadata().LIS; + + // Do some Chain management + if (Chains.count(Ra)) { + if (Rd != Ra) { + DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to " + << PrintReg(Rd, TRI) << '\n';); + Chains.remove(Ra); + Chains.insert(Rd); + } + } else { + DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI) + << '\n';); + Chains.insert(Rd); + } + + PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd); + + const LiveInterval &ld = LIs.getInterval(Rd); + for (auto r : Chains) { + // Skip self + if (r == Rd) + continue; + + const LiveInterval &lr = LIs.getInterval(r); + if (ld.overlaps(lr)) { + const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed = + &G.getNodeMetadata(node1).getAllowedRegs(); + + PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(r); + const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRrAllowed = + &G.getNodeMetadata(node2).getAllowedRegs(); + + PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2); + assert(edge != G.invalidEdgeId() && + "PBQP error ! 
The edge should exist !"); + + DEBUG(dbgs() << "Refining constraint !\n";); + + if (G.getEdgeNode1Id(edge) == node2) { + std::swap(node1, node2); + std::swap(vRdAllowed, vRrAllowed); + } + + // Enforce that cost is higher with all other Chains of the same parity + PBQP::Matrix costs(G.getEdgeCosts(edge)); + for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) { + unsigned pRd = (*vRdAllowed)[i]; + + // Get the maximum cost (excluding unallocatable reg) for all other + // parity registers + PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min(); + for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRrAllowed)[j]; + if (!haveSameParity(pRd, pRa)) + if (costs[i + 1][j + 1] != + std::numeric_limits<PBQP::PBQPNum>::infinity() && + costs[i + 1][j + 1] > sameParityMax) + sameParityMax = costs[i + 1][j + 1]; + } + + // Ensure all registers with same parity have a higher cost + // than sameParityMax + for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRrAllowed)[j]; + if (haveSameParity(pRd, pRa)) + if (sameParityMax > costs[i + 1][j + 1]) + costs[i + 1][j + 1] = sameParityMax + 1.0; + } + } + G.updateEdgeCosts(edge, std::move(costs)); + } + } +} + +static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg, + const MachineInstr &MI) { + const LiveInterval &LI = LIs.getInterval(reg); + SlotIndex SI = LIs.getInstructionIndex(&MI); + return LI.expiredAt(SI); +} + +void A57ChainingConstraint::apply(PBQPRAGraph &G) { + const MachineFunction &MF = G.getMetadata().MF; + LiveIntervals &LIs = G.getMetadata().LIS; + + TRI = MF.getSubtarget().getRegisterInfo(); + DEBUG(MF.dump()); + + for (const auto &MBB: MF) { + Chains.clear(); // FIXME: really needed ? Could not work at MF level ? + + for (const auto &MI: MBB) { + + // Forget Chains which have expired + for (auto r : Chains) { + SmallVector<unsigned, 8> toDel; + if(regJustKilledBefore(LIs, r, MI)) { + DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at "; + MI.print(dbgs());); + toDel.push_back(r); + } + + while (!toDel.empty()) { + Chains.remove(toDel.back()); + toDel.pop_back(); + } + } + + switch (MI.getOpcode()) { + case AArch64::FMSUBSrrr: + case AArch64::FMADDSrrr: + case AArch64::FNMSUBSrrr: + case AArch64::FNMADDSrrr: + case AArch64::FMSUBDrrr: + case AArch64::FMADDDrrr: + case AArch64::FNMSUBDrrr: + case AArch64::FNMADDDrrr: { + unsigned Rd = MI.getOperand(0).getReg(); + unsigned Ra = MI.getOperand(3).getReg(); + + if (addIntraChainConstraint(G, Rd, Ra)) + addInterChainConstraint(G, Rd, Ra); + break; + } + + case AArch64::FMLAv2f32: + case AArch64::FMLSv2f32: { + unsigned Rd = MI.getOperand(0).getReg(); + addInterChainConstraint(G, Rd, Rd); + break; + } + + default: + break; + } + } + } +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h new file mode 100644 index 0000000..4f656f9 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h @@ -0,0 +1,38 @@ +//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/PBQPRAConstraint.h" + +namespace llvm { + +/// Add the accumulator chaining constraint to a PBQP graph +class A57ChainingConstraint : public PBQPRAConstraint { +public: + // Add A57 specific constraints to the PBQP graph. + void apply(PBQPRAGraph &G) override; + +private: + SmallSetVector<unsigned, 32> Chains; + const TargetRegisterInfo *TRI; + + // Add the accumulator chaining constraint, inside the chain, i.e. so that + // parity(Rd) == parity(Ra). + // \return true if a constraint was added + bool addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra); + + // Add constraints between existing chains + void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra); +}; +} + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h new file mode 100644 index 0000000..9e9eec4 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -0,0 +1,6591 @@ +//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle using AdvSIMD instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H + +// 31 entries have cost 0 +// 242 entries have cost 1 +// 1447 entries have cost 2 +// 3602 entries have cost 3 +// 1237 entries have cost 4 +// 2 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. 
+static const unsigned PerfectShuffleTable[6561+1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 
3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 
<3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> + 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 
2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 
vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 
<3,5,7,0>, <3,5,7,0> + 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 
<0,2,0,4>, <3,3,3,3> + 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // 
<0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 
<2,0,5,7>, <3,0,1,2> + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: 
Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 
2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7> + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: 
Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2> + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6> + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7> + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6> + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6> + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7> + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7> + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6> + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7> + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // 
<0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6> + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, 
<0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0> + 2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, 
<0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3> + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, 
RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7> + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> + 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 
2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 
vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5> + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // 
<1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS + 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> + 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> + 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> + 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> + 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> + 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> + 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> + 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS + 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS + 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> + 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> + 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> + 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> + 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> + 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> + 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> + 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> + 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> + 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> + 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> + 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> + 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> + 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> + 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS + 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS + 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> + 2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1> + 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> + 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS + 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS + 1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> + 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> + 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> + 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> + 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> + 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> + 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> + 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS + 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> + 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> + 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> + 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, 
<7,u,1,4> + 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS + 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> + 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> + 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6> + 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> + 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> + 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> + 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> + 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> + 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> + 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS + 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> + 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> + 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> + 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS + 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> + 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> + 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> + 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> + 2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> + 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> + 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> + 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> + 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS + 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> + 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS + 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS + 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> + 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> + 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, 
<1,2,5,3> + 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS + 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> + 2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0> + 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> + 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS + 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> + 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> + 2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3> + 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> + 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> + 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> + 2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6> + 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> + 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> + 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> + 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> + 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS + 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2> + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3> + 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1> + 1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5> + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7> + 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS + 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS + 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> + 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> + 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> + 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> + 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> + 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS + 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS + 3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2> + 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> + 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> + 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> + 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> + 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> + 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> + 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> + 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS + 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 
2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS + 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS + 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> + 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> + 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> + 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS + 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5> + 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> + 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> + 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> + 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> + 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> + 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> + 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS + 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS + 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS + 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> + 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> + 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> + 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> + 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> + 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> + 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> + 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS + 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> + 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> + 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> + 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS + 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> + 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> + 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> + 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> + 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> + 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5> + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> + 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS + 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> + 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1> + 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7> + 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS + 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> + 3707044102U, // 
<1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> + 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS + 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS + 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> + 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> + 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> + 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> + 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> + 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> + 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> + 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> + 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> + 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> + 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> + 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> + 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> + 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS + 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> + 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> + 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS + 2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS + 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> + 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> + 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS + 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> + 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> + 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS + 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> + 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> + 2724845076U, // <1,7,6,7>: Cost 3 vext3 
<6,7,0,1>, <7,6,7,0> + 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> + 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> + 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> + 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS + 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> + 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> + 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> + 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS + 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> + 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> + 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS + 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> + 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1> + 2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2> + 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> + 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> + 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS + 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS + 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> + 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS + 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS + 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> + 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> + 2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4> + 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> + 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS + 1546300726U, // <1,u,4,5>: Cost 2 vext2 
<0,4,1,u>, RHS + 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> + 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6> + 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS + 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS + 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> + 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS + 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS + 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS + 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> + 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0> + 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> + 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7> + 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> + 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7> + 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> + 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> + 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> + 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> + 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS + 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> + 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7> + 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS + 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS + 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS + 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> + 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> + 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> + 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> + 2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0> + 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> + 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> + 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> + 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS + 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> + 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> + 
2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> + 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> + 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> + 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> + 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> + 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS + 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> + 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> + 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> + 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> + 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> + 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> + 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> + 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS + 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> + 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> + 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> + 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> + 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> + 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> + 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS + 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> + 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> + 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> + 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> + 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> + 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> + 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> + 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> + 3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> + 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> + 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> + 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> + 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> + 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS + 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> + 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 1611956947U, // <2,0,u,u>: Cost 2 vext3 
<0,2,0,2>, LHS + 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS + 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS + 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS + 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> + 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> + 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS + 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> + 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> + 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> + 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> + 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> + 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> + 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS + 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> + 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> + 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> + 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS + 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> + 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> + 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> + 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> + 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS + 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> + 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> + 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS + 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> + 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> + 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> + 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> + 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS + 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS + 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS + 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> + 1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4> + 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> + 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5> + 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> + 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS + 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> + 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS + 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 
<0,2,1,6>, RHS + 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> + 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> + 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> + 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS + 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> + 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> + 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> + 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> + 3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6> + 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> + 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> + 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> + 2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2> + 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS + 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> + 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> + 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS + 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1> + 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> + 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> + 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> + 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS + 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> + 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> + 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> + 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> + 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS + 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> + 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> + 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7> + 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> + 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> + 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> + 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> + 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS + 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> + 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> + 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> + 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS + 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS + 3625386742U, // 
<2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> + 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> + 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> + 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS + 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> + 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> + 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> + 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> + 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> + 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS + 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> + 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> + 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> + 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> + 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS + 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> + 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> + 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> + 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> + 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> + 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS + 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> + 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> + 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> + 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> + 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> + 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, 
<1,7,3,0> + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> + 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> + 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> + 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> + 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> + 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> + 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> + 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> + 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> + 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3> + 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> + 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1> + 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, 
<u,6,3,7> + 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1> + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> + 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> + 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> + 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> + 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> + 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> + 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> + 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS + 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> + 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> + 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> + 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> + 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS + 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> + 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> + 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> + 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> + 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> + 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS + 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS + 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> + 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS + 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> + 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> + 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> + 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> + 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> + 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> + 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> + 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> + 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> + 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> + 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> + 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS + 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS + 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> + 2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // 
<2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> + 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> + 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> + 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> + 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> + 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> + 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS + 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> + 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> + 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS + 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> + 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> + 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> + 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS + 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> + 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> + 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> + 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> + 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> + 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> + 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> + 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> + 3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> + 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> + 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> + 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> + 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS + 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS + 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> + 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> + 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> + 2953628162U, 
// <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS + 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> + 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS + 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS + 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> + 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> + 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> + 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS + 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> + 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> + 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> + 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> + 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS + 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> + 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> + 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> + 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> + 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> + 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> + 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS + 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> + 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> + 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS + 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS + 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS + 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> + 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> + 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6> + 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS + 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS + 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> + 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> + 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> + 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> + 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> + 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> + 
2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> + 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> + 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS + 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> + 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> + 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> + 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> + 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> + 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> + 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> + 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS + 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> + 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> + 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> + 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> + 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> + 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> + 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> + 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS + 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS + 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> + 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> + 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> + 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> + 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6> + 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> + 2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3> + 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> + 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0> + 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS + 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS + 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS + 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> + 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> + 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3> + 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS + 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> + 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> + 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS + 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS + 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> + 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> + 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> + 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> + 2725294981U, 
// <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> + 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> + 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2> + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3> + 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6> + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7> + 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS + 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS + 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> + 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> + 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> + 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> + 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> + 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> + 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> + 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> + 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> + 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> + 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> + 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> + 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> + 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS + 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> + 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> + 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> + 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS + 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> + 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> + 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3643679744U, // 
<2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> + 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> + 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS + 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> + 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> + 2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6> + 2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> + 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> + 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> + 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> + 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> + 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> + 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS + 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> + 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> + 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2> + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2> + 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3> + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS + 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 
3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> + 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> + 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> + 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> + 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS + 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> + 1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4> + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> + 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7> + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS + 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7> + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> + 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> + 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS + 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> + 3763576887U, // <3,0,0,6>: Cost 4 
vext3 LHS, <0,0,6,1> + 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> + 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> + 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> + 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> + 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> + 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> + 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> + 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> + 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> + 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> + 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6> + 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> + 2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3> + 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS + 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> + 2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5> + 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> + 2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7> + 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS + 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> + 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> + 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> + 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> + 3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1> + 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> + 2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6> + 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> + 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> + 2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2> + 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> + 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> + 3643844997U, // <3,0,7,3>: Cost 4 
vext1 <3,3,0,7>, <3,3,0,7> + 2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6> + 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> + 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> + 2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7> + 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7> + 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS + 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // 
<3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 
LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1> + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: 
Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> + 2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: 
Cost 3 vext1 <1,3,3,7>, RHS + 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> + 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> + 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> + 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS + 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3> + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> + 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> + 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> + 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> + 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> + 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> + 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> + 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> + 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> + 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS + 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS + 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> + 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> + 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS + 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 
2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> + 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> + 2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> + 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2> + 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> + 2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6> + 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> + 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> + 2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7> + 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1> + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> + 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS + 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> + 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> + 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> + 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> + 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> + 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> + 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> + 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> + 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> + 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> + 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> + 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> + 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 3692578746U, // 
<3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> + 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> + 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> + 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> + 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> + 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> + 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> + 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> + 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS + 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS + 2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS + 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> + 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> + 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> + 2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS + 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS + 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> + 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> + 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> + 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> + 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> + 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> + 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> + 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> + 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> + 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> + 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> + 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> + 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> + 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3> + 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS + 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> + 
3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> + 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> + 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> + 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> + 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> + 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> + 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> + 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> + 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> + 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> + 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> + 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> + 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> + 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS + 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> + 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> + 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS + 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> + 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> + 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> + 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> + 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS + 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> + 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6> + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 
1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> + 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> + 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> + 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS + 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> + 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> + 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> + 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> + 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> + 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> + 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> + 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS + 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> + 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> + 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> + 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS + 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> + 2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 
3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> + 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> + 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2> + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0> + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6> + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7> + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2> + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2> + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1> + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1> + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2> + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2> + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3> + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3> + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // 
<3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0> + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7> + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3> + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0> + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1> + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2> + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5> + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7> + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5> + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5> + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6> + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6> + 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6> + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6> + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7> + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7> + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1> + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7> + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7> + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0> + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7> + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2> + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1> + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2> + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5> + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6> + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0> + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, 
LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 
<0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 
3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 
vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> 
+ 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // 
<4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 
<2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: 
Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, 
// <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0> + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 
<0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1> + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 
<2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, 
<6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2> + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0> + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3> + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3> + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4> + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3> + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7> + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2> + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6> + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // 
<4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 
2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 
3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> + 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 
2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1> + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7> + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1> + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: 
Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> + 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: 
Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, 
<0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // 
<5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // <5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4> + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // 
<5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7> + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 
<1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 
1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 
vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0> + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7> + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2> + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1> + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1> + 2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0> + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0> + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3> + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3> + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1> + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 
<1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5> + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5> + 2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6> + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5> + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3> + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7> + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7> + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7> + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: 
Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 
2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 
vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, 
<2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2> + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 
2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6> + 
3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 
<6,6,4,4>, LHS + 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, 
<1,5,3,7> + 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS + 
2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7> + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, 
// <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> + 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // 
<6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1> + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2> + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0> + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> 
+ 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3> + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3> + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7> + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5> + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6> + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6> + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6> + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7> + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7> + 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS + 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> + 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> + 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS + 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 
<0,2,4,6>, LHS + 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> + 564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> + 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> + 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> + 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> + 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> + 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> + 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> + 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS + 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> + 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 
2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> + 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> + 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> + 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS + 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS + 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> + 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1> + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS + 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS + 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> + 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> + 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> + 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> + 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> + 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> + 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> + 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> + 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> + 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> + 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> + 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> + 1619002368U, // <7,1,3,5>: Cost 2 vext3 
<1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> + 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> + 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS + 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> + 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> + 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS + 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> + 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> + 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> + 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> + 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> + 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> + 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> + 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS + 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS + 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> + 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> + 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> + 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS + 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> + 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> + 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> + 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS + 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> + 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> + 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> + 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3785803276U, // <7,2,1,1>: Cost 4 vext3 
RHS, <2,1,1,0> + 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> + 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> + 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> + 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> + 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> + 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> + 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> + 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> + 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> + 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> + 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> + 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> + 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> + 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS + 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS + 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> + 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> + 
2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> + 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> + 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> + 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> + 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> + 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> + 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> + 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 
2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> + 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> + 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6> + 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> + 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3> + 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> + 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> + 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> + 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> + 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> + 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> + 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> + 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> + 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> + 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> + 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> + 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> + 
2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> + 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> + 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> + 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> + 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> + 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> + 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> + 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> + 1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> + 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> + 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> + 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> + 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> + 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> + 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> + 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> + 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> + 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> + 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> + 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> + 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> + 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> + 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS + 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> + 2735951476U, 
// <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> + 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> + 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> + 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> + 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> + 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> + 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> + 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> + 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> + 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> + 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> + 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> + 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS + 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> + 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> + 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> + 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS + 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> + 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> + 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> + 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> + 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> + 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> + 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS + 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS + 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> + 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> + 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> + 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> + 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> + 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> + 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> + 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> + 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> + 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> + 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> + 
2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> + 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> + 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> + 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> + 2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> + 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> + 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> + 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> + 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> + 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> + 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> + 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> + 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> + 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> + 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 
2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> + 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> + 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> + 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> + 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> + 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> + 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> + 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> + 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> + 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> + 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> + 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> + 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> + 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> + 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> + 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> + 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2735953107U, // <7,7,2,7>: Cost 3 
vext3 RHS, <7,2,7,3> + 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> + 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> + 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> + 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> + 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> + 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> + 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> + 2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> + 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> + 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> + 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> + 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS + 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> + 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> + 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> + 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> + 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> + 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> + 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> + 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7> + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2> + 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2> + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2> + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1> + 
1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1> + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2> + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3> + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3> + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0> + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0> + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4> + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3> + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0> + 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1> + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6> + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5> + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7> + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7> + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0> + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1> + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1> + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5> + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6> + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5> + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6> + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6> + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6> + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3> + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7> + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7> + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1> + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7> + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7> + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7> + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0> + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7> + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1> + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3> + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6> + 2573347601U, // <7,u,7,3>: 
Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5> + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2> + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1> + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2> + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0> + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5> + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6> + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS + 1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS + 1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS + 537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS + 2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2> + 537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS + 1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS + 2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // <u,0,3,2>: Cost 1 vrev LHS + 2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // <u,0,3,u>: Cost 1 vrev LHS + 2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS + 2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 
2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5> + 2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS + 2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS + 2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS + 1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS + 2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS + 1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0> + 1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS + 2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS + 1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS + 2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // <u,1,3,1>: 
Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS + 1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4> + 2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4> + 1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS + 2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7> + 2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS + 1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS + 2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, 
LHS + 1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2> + 1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1> + 1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS + 1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS + 408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS + 1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS + 408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS + 1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS + 1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5> + 2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5> + 1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS + 2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // 
<u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS + 408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS + 1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS + 1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS + 1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS + 1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS + 1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS + 1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0> + 2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0> + 471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS + 1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS + 1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2> + 1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS + 2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS + 1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3> + 2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS + 1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS + 1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4> + 471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS + 2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, 
RHS + 1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS + 2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6> + 1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6> + 1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2> + 471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS + 1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0> + 336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS + 1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6> + 471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS + 1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7> + 1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS + 471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS + 2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS + 2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS + 2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, 
<2,3,4,5> + 2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS + 1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS + 1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS + 537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS + 2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS + 1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS + 2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // <u,4,7,6>: Cost 1 vrev RHS + 2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // <u,4,7,u>: Cost 1 vrev RHS + 1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS + 1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS + 2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS + 2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0> + 1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // <u,5,1,4>: Cost 
2 vext1 <4,u,5,1>, RHS + 2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5> + 1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS + 2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS + 2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS + 229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS + 2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS + 1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS + 1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // 
<u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS + 2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0> + 1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0> + 2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1> + 2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1> + 1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3> + 2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3> + 2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS + 1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS + 2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5> + 
1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS + 1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS + 432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS + 1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS + 1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS + 432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS + 432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS + 1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS + 1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS + 1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS + 432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS + 1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS + 1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS + 1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2> + 1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 2640456465U, // <u,7,3,7>: Cost 3 vext2 
<3,7,u,7>, <3,7,u,7> + 1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS + 1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS + 2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS + 2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS + 1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6> + 1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS + 1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2> + 497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS + 1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0> + 1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1> + 1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6> + 497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS + 1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7> + 363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS + 497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS + 135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS + 471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS + 1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2> + 1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2> + 1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0> + 471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS + 1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS + 537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS + 1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3> + 537753444U, // 
<u,u,1,u>: Cost 1 vext3 LHS, LHS + 1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2> + 269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3> + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS + 1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // <u,u,3,2>: Cost 1 vrev LHS + 336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS + 408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS + 1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7> + 1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS + 408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS + 1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6> + 1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4> + 161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS + 471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS + 1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6> + 471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS + 1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7> + 1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS + 537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS + 1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS + 537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS + 1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7> + 1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6> + 296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS + 1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS + 432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS + 1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // <u,u,7,6>: Cost 1 vrev RHS + 363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS + 432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS + 408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS + 471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS + 537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS + 471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS + 537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0 +}; + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp new file mode 100644 index 0000000..79c09d9 --- /dev/null +++ 
b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -0,0 +1,557 @@ +//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64PromoteConstant pass which promotes constants +// to global variables when this is likely to be more efficient. Currently only +// types related to constant vector (i.e., constant vector, array of constant +// vectors, constant structure with a constant vector field, etc.) are promoted +// to global variables. Constant vectors are likely to be lowered in target +// constant pool during instruction selection already; therefore, the access +// will remain the same (memory load), but the structure types are not split +// into different constant pool accesses for each field. A bonus side effect is +// that created globals may be merged by the global merge pass. +// +// FIXME: This pass may be useful for other targets too. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-promote-const" + +// Stress testing mode - disable heuristics. +static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden, + cl::desc("Promote all vector constants")); + +STATISTIC(NumPromoted, "Number of promoted constants"); +STATISTIC(NumPromotedUses, "Number of promoted constants uses"); + +//===----------------------------------------------------------------------===// +// AArch64PromoteConstant +//===----------------------------------------------------------------------===// + +namespace { +/// Promotes interesting constant into global variables. +/// The motivating example is: +/// static const uint16_t TableA[32] = { +/// 41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768, +/// 31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215, +/// 25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846, +/// 21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725, +/// }; +/// +/// uint8x16x4_t LoadStatic(void) { +/// uint8x16x4_t ret; +/// ret.val[0] = vld1q_u16(TableA + 0); +/// ret.val[1] = vld1q_u16(TableA + 8); +/// ret.val[2] = vld1q_u16(TableA + 16); +/// ret.val[3] = vld1q_u16(TableA + 24); +/// return ret; +/// } +/// +/// The constants in this example are folded into the uses. Thus, 4 different +/// constants are created. +/// +/// As their type is vector the cheapest way to create them is to load them +/// for the memory. +/// +/// Therefore the final assembly final has 4 different loads. With this pass +/// enabled, only one load is issued for the constants. 
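+///
+/// At the IR level, the rewrite performed by insertDefinitions() below looks
+/// roughly like the following sketch (illustrative only; the value names and
+/// the aggregate type %T are invented for this comment):
+///
+///   ; before: the constant aggregate is folded directly into its user
+///   ret %T { <8 x i16> <i16 41944, i16 40330, ...>, ... }
+///
+///   ; after: one internal constant global per module, loaded once per
+///   ; dominating insertion point, with every dominated use rewritten
+///   @_PromotedConst = internal constant %T { <8 x i16> <i16 41944, ...>, ... }
+///   %c = load %T, %T* @_PromotedConst
+///   ret %T %c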
+class AArch64PromoteConstant : public ModulePass { + +public: + static char ID; + AArch64PromoteConstant() : ModulePass(ID) {} + + const char *getPassName() const override { return "AArch64 Promote Constant"; } + + /// Iterate over the functions and promote the interesting constants into + /// global variables with module scope. + bool runOnModule(Module &M) override { + DEBUG(dbgs() << getPassName() << '\n'); + bool Changed = false; + for (auto &MF : M) { + Changed |= runOnFunction(MF); + } + return Changed; + } + +private: + /// Look for interesting constants used within the given function. + /// Promote them into global variables, load these global variables within + /// the related function, so that the number of inserted load is minimal. + bool runOnFunction(Function &F); + + // This transformation requires dominator info + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + + /// Type to store a list of Uses. + typedef SmallVector<Use *, 4> Uses; + /// Map an insertion point to all the uses it dominates. + typedef DenseMap<Instruction *, Uses> InsertionPoints; + /// Map a function to the required insertion point of load for a + /// global variable. + typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc; + + /// Find the closest point that dominates the given Use. + Instruction *findInsertionPoint(Use &Use); + + /// Check if the given insertion point is dominated by an existing + /// insertion point. + /// If true, the given use is added to the list of dominated uses for + /// the related existing point. + /// \param NewPt the insertion point to be checked + /// \param Use the use to be added into the list of dominated uses + /// \param InsertPts existing insertion points + /// \pre NewPt and all instruction in InsertPts belong to the same function + /// \return true if one of the insertion point in InsertPts dominates NewPt, + /// false otherwise + bool isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); + + /// Check if the given insertion point can be merged with an existing + /// insertion point in a common dominator. + /// If true, the given use is added to the list of the created insertion + /// point. + /// \param NewPt the insertion point to be checked + /// \param Use the use to be added into the list of dominated uses + /// \param InsertPts existing insertion points + /// \pre NewPt and all instruction in InsertPts belong to the same function + /// \pre isDominated returns false for the exact same parameters. + /// \return true if it exists an insertion point in InsertPts that could + /// have been merged with NewPt in a common dominator, + /// false otherwise + bool tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); + + /// Compute the minimal insertion points to dominates all the interesting + /// uses of value. + /// Insertion points are group per function and each insertion point + /// contains a list of all the uses it dominates within the related function + /// \param Val constant to be examined + /// \param[out] InsPtsPerFunc output storage of the analysis + void computeInsertionPoints(Constant *Val, + InsertionPointsPerFunc &InsPtsPerFunc); + + /// Insert a definition of a new global variable at each point contained in + /// InsPtsPerFunc and update the related uses (also contained in + /// InsPtsPerFunc). 
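+  /// Only one global variable is created per module; each insertion point
+  /// then gets its own load of that global.
+  /// \return true if at least one definition was inserted.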
+ bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc); + + /// Compute the minimal insertion points to dominate all the interesting + /// uses of Val and insert a definition of a new global variable + /// at these points. + /// Also update the uses of Val accordingly. + /// Currently a use of Val is considered interesting if: + /// - Val is not UndefValue + /// - Val is not zeroinitialized + /// - Replacing Val per a load of a global variable is valid. + /// \see shouldConvert for more details + bool computeAndInsertDefinitions(Constant *Val); + + /// Promote the given constant into a global variable if it is expected to + /// be profitable. + /// \return true if Cst has been promoted + bool promoteConstant(Constant *Cst); + + /// Transfer the list of dominated uses of IPI to NewPt in InsertPts. + /// Append Use to this list and delete the entry of IPI in InsertPts. + static void appendAndTransferDominatedUses(Instruction *NewPt, Use &Use, + InsertionPoints::iterator &IPI, + InsertionPoints &InsertPts) { + // Record the dominated use. + IPI->second.push_back(&Use); + // Transfer the dominated uses of IPI to NewPt + // Inserting into the DenseMap may invalidate existing iterator. + // Keep a copy of the key to find the iterator to erase. Keep a copy of the + // value so that we don't have to dereference IPI->second. + Instruction *OldInstr = IPI->first; + Uses OldUses = std::move(IPI->second); + InsertPts[NewPt] = std::move(OldUses); + // Erase IPI. + InsertPts.erase(OldInstr); + } +}; +} // end anonymous namespace + +char AArch64PromoteConstant::ID = 0; + +namespace llvm { +void initializeAArch64PromoteConstantPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const", + "AArch64 Promote Constant Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const", + "AArch64 Promote Constant Pass", false, false) + +ModulePass *llvm::createAArch64PromoteConstantPass() { + return new AArch64PromoteConstant(); +} + +/// Check if the given type uses a vector type. +static bool isConstantUsingVectorTy(const Type *CstTy) { + if (CstTy->isVectorTy()) + return true; + if (CstTy->isStructTy()) { + for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements(); + EltIdx < EndEltIdx; ++EltIdx) + if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx))) + return true; + } else if (CstTy->isArrayTy()) + return isConstantUsingVectorTy(CstTy->getArrayElementType()); + return false; +} + +/// Check if the given use (Instruction + OpIdx) of Cst should be converted into +/// a load of a global variable initialized with Cst. +/// A use should be converted if it is legal to do so. +/// For instance, it is not legal to turn the mask operand of a shuffle vector +/// into a load of a global variable. +static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, + unsigned OpIdx) { + // shufflevector instruction expects a const for the mask argument, i.e., the + // third argument. Do not promote this use in that case. + if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2) + return false; + + // extractvalue instruction expects a const idx. + if (isa<const ExtractValueInst>(Instr) && OpIdx > 0) + return false; + + // extractvalue instruction expects a const idx. + if (isa<const InsertValueInst>(Instr) && OpIdx > 1) + return false; + + if (isa<const AllocaInst>(Instr) && OpIdx > 0) + return false; + + // Alignment argument must be constant. 
+ if (isa<const LoadInst>(Instr) && OpIdx > 0) + return false; + + // Alignment argument must be constant. + if (isa<const StoreInst>(Instr) && OpIdx > 1) + return false; + + // Index must be constant. + if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0) + return false; + + // Personality function and filters must be constant. + // Give up on that instruction. + if (isa<const LandingPadInst>(Instr)) + return false; + + // Switch instruction expects constants to compare to. + if (isa<const SwitchInst>(Instr)) + return false; + + // Expected address must be a constant. + if (isa<const IndirectBrInst>(Instr)) + return false; + + // Do not mess with intrinsics. + if (isa<const IntrinsicInst>(Instr)) + return false; + + // Do not mess with inline asm. + const CallInst *CI = dyn_cast<const CallInst>(Instr); + if (CI && isa<const InlineAsm>(CI->getCalledValue())) + return false; + + return true; +} + +/// Check if the given Cst should be converted into +/// a load of a global variable initialized with Cst. +/// A constant should be converted if it is likely that the materialization of +/// the constant will be tricky. Thus, we give up on zero or undef values. +/// +/// \todo Currently, accept only vector related types. +/// Also we give up on all simple vector type to keep the existing +/// behavior. Otherwise, we should push here all the check of the lowering of +/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging +/// constant via global merge and the fact that the same constant is stored +/// only once with this method (versus, as many function that uses the constant +/// for the regular approach, even for float). +/// Again, the simplest solution would be to promote every +/// constant and rematerialize them when they are actually cheap to create. +static bool shouldConvert(const Constant *Cst) { + if (isa<const UndefValue>(Cst)) + return false; + + // FIXME: In some cases, it may be interesting to promote in memory + // a zero initialized constant. + // E.g., when the type of Cst require more instructions than the + // adrp/add/load sequence or when this sequence can be shared by several + // instances of Cst. + // Ideally, we could promote this into a global and rematerialize the constant + // when it was a bad idea. + if (Cst->isZeroValue()) + return false; + + if (Stress) + return true; + + // FIXME: see function \todo + if (Cst->getType()->isVectorTy()) + return false; + return isConstantUsingVectorTy(Cst->getType()); +} + +Instruction *AArch64PromoteConstant::findInsertionPoint(Use &Use) { + Instruction *User = cast<Instruction>(Use.getUser()); + + // If this user is a phi, the insertion point is in the related + // incoming basic block. + if (PHINode *PhiInst = dyn_cast<PHINode>(User)) + return PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator(); + + return User; +} + +bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use, + InsertionPoints &InsertPts) { + + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( + *NewPt->getParent()->getParent()).getDomTree(); + + // Traverse all the existing insertion points and check if one is dominating + // NewPt. If it is, remember that. + for (auto &IPI : InsertPts) { + if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) || + // When IPI.first is a terminator instruction, DT may think that + // the result is defined on the edge. + // Here we are testing the insertion point, not the definition. 
+ (IPI.first->getParent() != NewPt->getParent() && + DT.dominates(IPI.first->getParent(), NewPt->getParent()))) { + // No need to insert this point. Just record the dominated use. + DEBUG(dbgs() << "Insertion point dominated by:\n"); + DEBUG(IPI.first->print(dbgs())); + DEBUG(dbgs() << '\n'); + IPI.second.push_back(&Use); + return true; + } + } + return false; +} + +bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, + InsertionPoints &InsertPts) { + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( + *NewPt->getParent()->getParent()).getDomTree(); + BasicBlock *NewBB = NewPt->getParent(); + + // Traverse all the existing insertion point and check if one is dominated by + // NewPt and thus useless or can be combined with NewPt into a common + // dominator. + for (InsertionPoints::iterator IPI = InsertPts.begin(), + EndIPI = InsertPts.end(); + IPI != EndIPI; ++IPI) { + BasicBlock *CurBB = IPI->first->getParent(); + if (NewBB == CurBB) { + // Instructions are in the same block. + // By construction, NewPt is dominating the other. + // Indeed, isDominated returned false with the exact same arguments. + DEBUG(dbgs() << "Merge insertion point with:\n"); + DEBUG(IPI->first->print(dbgs())); + DEBUG(dbgs() << "\nat considered insertion point.\n"); + appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); + return true; + } + + // Look for a common dominator + BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB); + // If none exists, we cannot merge these two points. + if (!CommonDominator) + continue; + + if (CommonDominator != NewBB) { + // By construction, the CommonDominator cannot be CurBB. + assert(CommonDominator != CurBB && + "Instruction has not been rejected during isDominated check!"); + // Take the last instruction of the CommonDominator as insertion point + NewPt = CommonDominator->getTerminator(); + } + // else, CommonDominator is the block of NewBB, hence NewBB is the last + // possible insertion point in that block. + DEBUG(dbgs() << "Merge insertion point with:\n"); + DEBUG(IPI->first->print(dbgs())); + DEBUG(dbgs() << '\n'); + DEBUG(NewPt->print(dbgs())); + DEBUG(dbgs() << '\n'); + appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); + return true; + } + return false; +} + +void AArch64PromoteConstant::computeInsertionPoints( + Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { + DEBUG(dbgs() << "** Compute insertion points **\n"); + for (Use &Use : Val->uses()) { + Instruction *User = dyn_cast<Instruction>(Use.getUser()); + + // If the user is not an Instruction, we cannot modify it. + if (!User) + continue; + + // Filter out uses that should not be converted. + if (!shouldConvertUse(Val, User, Use.getOperandNo())) + continue; + + DEBUG(dbgs() << "Considered use, opidx " << Use.getOperandNo() << ":\n"); + DEBUG(User->print(dbgs())); + DEBUG(dbgs() << '\n'); + + Instruction *InsertionPoint = findInsertionPoint(Use); + + DEBUG(dbgs() << "Considered insertion point:\n"); + DEBUG(InsertionPoint->print(dbgs())); + DEBUG(dbgs() << '\n'); + + // Check if the current insertion point is useless, i.e., it is dominated + // by another one. + InsertionPoints &InsertPts = + InsPtsPerFunc[InsertionPoint->getParent()->getParent()]; + if (isDominated(InsertionPoint, Use, InsertPts)) + continue; + // This insertion point is useful, check if we can merge some insertion + // point in a common dominator or if NewPt dominates an existing one. 
+ if (tryAndMerge(InsertionPoint, Use, InsertPts)) + continue; + + DEBUG(dbgs() << "Keep considered insertion point\n"); + + // It is definitely useful by its own + InsertPts[InsertionPoint].push_back(&Use); + } +} + +bool AArch64PromoteConstant::insertDefinitions( + Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) { + // We will create one global variable per Module. + DenseMap<Module *, GlobalVariable *> ModuleToMergedGV; + bool HasChanged = false; + + // Traverse all insertion points in all the function. + for (const auto &FctToInstPtsIt : InsPtsPerFunc) { + const InsertionPoints &InsertPts = FctToInstPtsIt.second; +// Do more checking for debug purposes. +#ifndef NDEBUG + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( + *FctToInstPtsIt.first).getDomTree(); +#endif + assert(!InsertPts.empty() && "Empty uses does not need a definition"); + + Module *M = FctToInstPtsIt.first->getParent(); + GlobalVariable *&PromotedGV = ModuleToMergedGV[M]; + if (!PromotedGV) { + PromotedGV = new GlobalVariable( + *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr, + "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); + PromotedGV->setInitializer(Cst); + DEBUG(dbgs() << "Global replacement: "); + DEBUG(PromotedGV->print(dbgs())); + DEBUG(dbgs() << '\n'); + ++NumPromoted; + HasChanged = true; + } + + for (const auto &IPI : InsertPts) { + // Create the load of the global variable. + IRBuilder<> Builder(IPI.first); + LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); + DEBUG(dbgs() << "**********\n"); + DEBUG(dbgs() << "New def: "); + DEBUG(LoadedCst->print(dbgs())); + DEBUG(dbgs() << '\n'); + + // Update the dominated uses. + for (Use *Use : IPI.second) { +#ifndef NDEBUG + assert(DT.dominates(LoadedCst, findInsertionPoint(*Use)) && + "Inserted definition does not dominate all its uses!"); +#endif + DEBUG(dbgs() << "Use to update " << Use->getOperandNo() << ":"); + DEBUG(Use->getUser()->print(dbgs())); + DEBUG(dbgs() << '\n'); + Use->set(LoadedCst); + ++NumPromotedUses; + } + } + } + return HasChanged; +} + +bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) { + InsertionPointsPerFunc InsertPtsPerFunc; + computeInsertionPoints(Val, InsertPtsPerFunc); + return insertDefinitions(Val, InsertPtsPerFunc); +} + +bool AArch64PromoteConstant::promoteConstant(Constant *Cst) { + assert(Cst && "Given variable is not a valid constant."); + + if (!shouldConvert(Cst)) + return false; + + DEBUG(dbgs() << "******************************\n"); + DEBUG(dbgs() << "Candidate constant: "); + DEBUG(Cst->print(dbgs())); + DEBUG(dbgs() << '\n'); + + return computeAndInsertDefinitions(Cst); +} + +bool AArch64PromoteConstant::runOnFunction(Function &F) { + // Look for instructions using constant vector. Promote that constant to a + // global variable. Create as few loads of this variable as possible and + // update the uses accordingly. + bool LocalChange = false; + SmallPtrSet<Constant *, 8> AlreadyChecked; + + for (Instruction &I : instructions(&F)) { + // Traverse the operand, looking for constant vectors. Replace them by a + // load of a global variable of constant vector type. + for (Value *Op : I.operand_values()) { + Constant *Cst = dyn_cast<Constant>(Op); + // There is no point in promoting global values as they are already + // global. Do not promote constant expressions either, as they may + // require some code expansion. 
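+      // Each candidate constant is visited at most once per function
+      // (tracked in AlreadyChecked), so a constant shared by several
+      // instructions is only considered for promotion once here.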
+ if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) && + AlreadyChecked.insert(Cst).second) + LocalChange |= promoteConstant(Cst); + } + } + return LocalChange; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp new file mode 100644 index 0000000..32b4888 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -0,0 +1,441 @@ +//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetRegisterInfo +// class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64RegisterInfo.h" +#include "AArch64FrameLowering.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +#define GET_REGINFO_TARGET_DESC +#include "AArch64GenRegisterInfo.inc" + +AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) + : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {} + +const MCPhysReg * +AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::GHC) + // GHC set of callee saved regs is empty as all those regs are + // used for passing STG regs around + return CSR_AArch64_NoRegs_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) + return CSR_AArch64_AllRegs_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? 
+ CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : + CSR_AArch64_CXX_TLS_Darwin_SaveList; + else + return CSR_AArch64_AAPCS_SaveList; +} + +const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()) + return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + +const uint32_t * +AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + if (CC == CallingConv::GHC) + // This is academic becase all GHC calls are (supposed to be) tail calls + return CSR_AArch64_NoRegs_RegMask; + if (CC == CallingConv::AnyReg) + return CSR_AArch64_AllRegs_RegMask; + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_RegMask; + else + return CSR_AArch64_AAPCS_RegMask; +} + +const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { + if (TT.isOSDarwin()) + return CSR_AArch64_TLS_Darwin_RegMask; + + assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS"); + return CSR_AArch64_TLS_ELF_RegMask; +} + +const uint32_t * +AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + // This should return a register mask that is the same as that returned by + // getCallPreservedMask but that additionally preserves the register used for + // the first i64 argument (which must also be the register used to return a + // single i64 return value) + // + // In case that the calling convention does not use the same register for + // both, the function should return NULL (does not currently apply) + assert(CC != CallingConv::GHC && "should not be GHC calling convention."); + return CSR_AArch64_AAPCS_ThisReturn_RegMask; +} + +BitVector +AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + const AArch64FrameLowering *TFI = getFrameLowering(MF); + + // FIXME: avoid re-calculating this every time. + BitVector Reserved(getNumRegs()); + Reserved.set(AArch64::SP); + Reserved.set(AArch64::XZR); + Reserved.set(AArch64::WSP); + Reserved.set(AArch64::WZR); + + if (TFI->hasFP(MF) || TT.isOSDarwin()) { + Reserved.set(AArch64::FP); + Reserved.set(AArch64::W29); + } + + if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) { + Reserved.set(AArch64::X18); // Platform register + Reserved.set(AArch64::W18); + } + + if (hasBasePointer(MF)) { + Reserved.set(AArch64::X19); + Reserved.set(AArch64::W19); + } + + return Reserved; +} + +bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, + unsigned Reg) const { + const AArch64FrameLowering *TFI = getFrameLowering(MF); + + switch (Reg) { + default: + break; + case AArch64::SP: + case AArch64::XZR: + case AArch64::WSP: + case AArch64::WZR: + return true; + case AArch64::X18: + case AArch64::W18: + return MF.getSubtarget<AArch64Subtarget>().isX18Reserved(); + case AArch64::FP: + case AArch64::W29: + return TFI->hasFP(MF) || TT.isOSDarwin(); + case AArch64::W19: + case AArch64::X19: + return hasBasePointer(MF); + } + + return false; +} + +const TargetRegisterClass * +AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + return &AArch64::GPR64RegClass; +} + +const TargetRegisterClass * +AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &AArch64::CCRRegClass) + return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV. 
+ return RC; +} + +unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } + +bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // In the presence of variable sized objects, if the fixed stack size is + // large enough that referencing from the FP won't result in things being + // in range relatively often, we can use a base pointer to allow access + // from the other direction like the SP normally works. + // Furthermore, if both variable sized objects are present, and the + // stack needs to be dynamically re-aligned, the base pointer is the only + // reliable way to reference the locals. + if (MFI->hasVarSizedObjects()) { + if (needsStackRealignment(MF)) + return true; + // Conservatively estimate whether the negative offset from the frame + // pointer will be sufficient to reach. If a function has a smallish + // frame, it's less likely to have lots of spills and callee saved + // space, so it's all more likely to be within range of the frame pointer. + // If it's wrong, we'll materialize the constant and still get to the + // object; it's just suboptimal. Negative offsets use the unscaled + // load/store instructions, which have a 9-bit signed immediate. + if (MFI->getLocalFrameSize() < 256) + return false; + return true; + } + + return false; +} + +unsigned +AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const AArch64FrameLowering *TFI = getFrameLowering(MF); + return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP; +} + +bool AArch64RegisterInfo::requiresRegisterScavenging( + const MachineFunction &MF) const { + return true; +} + +bool AArch64RegisterInfo::requiresVirtualBaseRegisters( + const MachineFunction &MF) const { + return true; +} + +bool +AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // AArch64FrameLowering::resolveFrameIndexReference() can always fall back + // to the stack pointer, so only put the emergency spill slot next to the + // FP when there's no better way to access it (SP or base pointer). + return MFI->hasVarSizedObjects() && !hasBasePointer(MF); +} + +bool AArch64RegisterInfo::requiresFrameIndexScavenging( + const MachineFunction &MF) const { + return true; +} + +bool +AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Only consider eliminating leaf frames. + if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && + MFI->adjustsStack())) + return true; + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); +} + +/// needsFrameBaseReg - Returns true if the instruction's frame index +/// reference would be better served by a base register other than FP +/// or SP. Used by LocalStackFrameAllocation to determine which frame index +/// references it should create new base registers for. +bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, + int64_t Offset) const { + for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) + assert(i < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + + // It's the load/store FI references that cause issues, as it can be difficult + // to materialize the offset if it won't fit in the literal field. 
Estimate + // based on the size of the local frame and some conservative assumptions + // about the rest of the stack frame (note, this is pre-regalloc, so + // we don't know everything for certain yet) whether this offset is likely + // to be out of range of the immediate. Return true if so. + + // We only generate virtual base registers for loads and stores, so + // return false for everything else. + if (!MI->mayLoad() && !MI->mayStore()) + return false; + + // Without a virtual base register, if the function has variable sized + // objects, all fixed-size local references will be via the frame pointer, + // Approximate the offset and see if it's legal for the instruction. + // Note that the incoming offset is based on the SP value at function entry, + // so it'll be negative. + MachineFunction &MF = *MI->getParent()->getParent(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Estimate an offset from the frame pointer. + // Conservatively assume all GPR callee-saved registers get pushed. + // FP, LR, X19-X28, D8-D15. 64-bits each. + int64_t FPOffset = Offset - 16 * 20; + // Estimate an offset from the stack pointer. + // The incoming offset is relating to the SP at the start of the function, + // but when we access the local it'll be relative to the SP after local + // allocation, so adjust our SP-relative offset by that allocation size. + Offset += MFI->getLocalFrameSize(); + // Assume that we'll have at least some spill slots allocated. + // FIXME: This is a total SWAG number. We should run some statistics + // and pick a real one. + Offset += 128; // 128 bytes of spill slots + + // If there is a frame pointer, try using it. + // The FP is only available if there is no dynamic realignment. We + // don't know for sure yet whether we'll need that, so we guess based + // on whether there are any local variables that would trigger it. + if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, AArch64::FP, FPOffset)) + return false; + + // If we can reference via the stack pointer or base pointer, try that. + // FIXME: This (and the code that resolves the references) can be improved + // to only disallow SP relative references in the live range of + // the VLA(s). In practice, it's unclear how much difference that + // would make, but it may be worth doing. + if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) + return false; + + // The offset likely isn't legal; we want to allocate a virtual base register. + return true; +} + +bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + unsigned BaseReg, + int64_t Offset) const { + assert(Offset <= INT_MAX && "Offset too big to fit in int."); + assert(MI && "Unable to get the legal offset for nil instruction."); + int SaveOffset = Offset; + return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; +} + +/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx +/// at the beginning of the basic block. 
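+/// Concretely, the implementation below builds a single
+///   BaseReg = ADDXri <FrameIdx>, Offset, lsl #0
+/// and the frame-index operand is turned into a real SP/FP-relative immediate
+/// later, when frame indices are eliminated (see eliminateFrameIndex()).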
+void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, + int FrameIdx, + int64_t Offset) const { + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + const MachineFunction &MF = *MBB->getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); + const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + + BuildMI(*MBB, Ins, DL, MCID, BaseReg) + .addFrameIndex(FrameIdx) + .addImm(Offset) + .addImm(Shifter); +} + +void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const { + int Off = Offset; // ARM doesn't need the general 64-bit offsets + unsigned i = 0; + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + const MachineFunction *MF = MI.getParent()->getParent(); + const AArch64InstrInfo *TII = + MF->getSubtarget<AArch64Subtarget>().getInstrInfo(); + bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII); + assert(Done && "Unable to resolve frame index!"); + (void)Done; +} + +void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); + const AArch64FrameLowering *TFI = getFrameLowering(MF); + + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + unsigned FrameReg; + int Offset; + + // Special handling of dbg_value, stackmap and patchpoint instructions. + if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || + MI.getOpcode() == TargetOpcode::PATCHPOINT) { + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, + /*PreferFP=*/true); + Offset += MI.getOperand(FIOperandNum + 1).getImm(); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + return; + } + + // Modify MI as necessary to handle as much of 'Offset' as possible + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); + if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) + return; + + assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) && + "Emergency spill slot is out of reach"); + + // If we get here, the immediate doesn't fit into the instruction. We folded + // as much as possible above. Handle the rest, providing a register that is + // SP+LargeImm. 
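+  // A virtual GPR64 scratch register is created below; emitFrameOffset
+  // materializes FrameReg + Offset into it, and the frame-index operand of MI
+  // is then rewritten to use that scratch register (marked as killed).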
+ unsigned ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); + MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); +} + +namespace llvm { + +unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const AArch64FrameLowering *TFI = getFrameLowering(MF); + + switch (RC->getID()) { + default: + return 0; + case AArch64::GPR32RegClassID: + case AArch64::GPR32spRegClassID: + case AArch64::GPR32allRegClassID: + case AArch64::GPR64spRegClassID: + case AArch64::GPR64allRegClassID: + case AArch64::GPR64RegClassID: + case AArch64::GPR32commonRegClassID: + case AArch64::GPR64commonRegClassID: + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP + - MF.getSubtarget<AArch64Subtarget>() + .isX18Reserved() // X18 reserved as platform register + - hasBasePointer(MF); // X19 + case AArch64::FPR8RegClassID: + case AArch64::FPR16RegClassID: + case AArch64::FPR32RegClassID: + case AArch64::FPR64RegClassID: + case AArch64::FPR128RegClassID: + return 32; + + case AArch64::DDRegClassID: + case AArch64::DDDRegClassID: + case AArch64::DDDDRegClassID: + case AArch64::QQRegClassID: + case AArch64::QQQRegClassID: + case AArch64::QQQQRegClassID: + return 32; + + case AArch64::FPR128_loRegClassID: + return 16; + } +} + +} // namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h new file mode 100644 index 0000000..f33f788 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -0,0 +1,102 @@ +//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H + +#define GET_REGINFO_HEADER +#include "AArch64GenRegisterInfo.inc" + +namespace llvm { + +class MachineFunction; +class RegScavenger; +class TargetRegisterClass; +class Triple; + +struct AArch64RegisterInfo : public AArch64GenRegisterInfo { +private: + const Triple &TT; + +public: + AArch64RegisterInfo(const Triple &TT); + + bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + + /// Code Generation virtual methods... + const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + + unsigned getCSRFirstUseCost() const override { + // The cost will be compared against BlockFrequency where entry has the + // value of 1 << 14. A value of 5 will choose to spill or split really + // cold path instead of using a callee-saved register. + return 5; + } + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. 
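+  // Keeping most registers marked as preserved across the TLS lookup lets the
+  // register allocator keep values live over the call instead of spilling and
+  // reloading around it.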
+ const uint32_t *getTLSCallPreservedMask() const; + + /// getThisReturnPreservedMask - Returns a call preserved mask specific to the + /// case that 'returned' is on an i64 first argument if the calling convention + /// is one that can (partially) model this attribute with a preserved mask + /// (i.e. it is a calling convention that uses the same register for the first + /// i64 argument and an i64 return value) + /// + /// Should return NULL in the case that the calling convention does not have + /// this property + const uint32_t *getThisReturnPreservedMask(const MachineFunction &MF, + CallingConv::ID) const; + + BitVector getReservedRegs(const MachineFunction &MF) const override; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + + bool requiresRegisterScavenging(const MachineFunction &MF) const override; + bool useFPForScavengingIndex(const MachineFunction &MF) const override; + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; + + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + int64_t Offset) const override; + void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, + int FrameIdx, + int64_t Offset) const override; + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + bool cannotEliminateFrame(const MachineFunction &MF) const; + + bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; + bool hasBasePointer(const MachineFunction &MF) const; + unsigned getBaseRegister() const; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td new file mode 100644 index 0000000..a8c8b17 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -0,0 +1,635 @@ +//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + + +class AArch64Reg<bits<16> enc, string n, list<Register> subregs = [], + list<string> altNames = []> + : Register<n, altNames> { + let HWEncoding = enc; + let Namespace = "AArch64"; + let SubRegs = subregs; +} + +let Namespace = "AArch64" in { + def sub_32 : SubRegIndex<32>; + + def bsub : SubRegIndex<8>; + def hsub : SubRegIndex<16>; + def ssub : SubRegIndex<32>; + def dsub : SubRegIndex<32>; + def sube32 : SubRegIndex<32>; + def subo32 : SubRegIndex<32>; + def qhisub : SubRegIndex<64>; + def qsub : SubRegIndex<64>; + def sube64 : SubRegIndex<64>; + def subo64 : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def dsub0 : SubRegIndex<64>; + def dsub1 : SubRegIndex<64>; + def dsub2 : SubRegIndex<64>; + def dsub3 : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def qsub0 : SubRegIndex<128>; + def qsub1 : SubRegIndex<128>; + def qsub2 : SubRegIndex<128>; + def qsub3 : SubRegIndex<128>; +} + +let Namespace = "AArch64" in { + def vreg : RegAltNameIndex; + def vlist1 : RegAltNameIndex; +} + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// +def W0 : AArch64Reg<0, "w0" >, DwarfRegNum<[0]>; +def W1 : AArch64Reg<1, "w1" >, DwarfRegNum<[1]>; +def W2 : AArch64Reg<2, "w2" >, DwarfRegNum<[2]>; +def W3 : AArch64Reg<3, "w3" >, DwarfRegNum<[3]>; +def W4 : AArch64Reg<4, "w4" >, DwarfRegNum<[4]>; +def W5 : AArch64Reg<5, "w5" >, DwarfRegNum<[5]>; +def W6 : AArch64Reg<6, "w6" >, DwarfRegNum<[6]>; +def W7 : AArch64Reg<7, "w7" >, DwarfRegNum<[7]>; +def W8 : AArch64Reg<8, "w8" >, DwarfRegNum<[8]>; +def W9 : AArch64Reg<9, "w9" >, DwarfRegNum<[9]>; +def W10 : AArch64Reg<10, "w10">, DwarfRegNum<[10]>; +def W11 : AArch64Reg<11, "w11">, DwarfRegNum<[11]>; +def W12 : AArch64Reg<12, "w12">, DwarfRegNum<[12]>; +def W13 : AArch64Reg<13, "w13">, DwarfRegNum<[13]>; +def W14 : AArch64Reg<14, "w14">, DwarfRegNum<[14]>; +def W15 : AArch64Reg<15, "w15">, DwarfRegNum<[15]>; +def W16 : AArch64Reg<16, "w16">, DwarfRegNum<[16]>; +def W17 : AArch64Reg<17, "w17">, DwarfRegNum<[17]>; +def W18 : AArch64Reg<18, "w18">, DwarfRegNum<[18]>; +def W19 : AArch64Reg<19, "w19">, DwarfRegNum<[19]>; +def W20 : AArch64Reg<20, "w20">, DwarfRegNum<[20]>; +def W21 : AArch64Reg<21, "w21">, DwarfRegNum<[21]>; +def W22 : AArch64Reg<22, "w22">, DwarfRegNum<[22]>; +def W23 : AArch64Reg<23, "w23">, DwarfRegNum<[23]>; +def W24 : AArch64Reg<24, "w24">, DwarfRegNum<[24]>; +def W25 : AArch64Reg<25, "w25">, DwarfRegNum<[25]>; +def W26 : AArch64Reg<26, "w26">, DwarfRegNum<[26]>; +def W27 : AArch64Reg<27, "w27">, DwarfRegNum<[27]>; +def W28 : AArch64Reg<28, "w28">, DwarfRegNum<[28]>; +def W29 : AArch64Reg<29, "w29">, DwarfRegNum<[29]>; +def W30 : AArch64Reg<30, "w30">, DwarfRegNum<[30]>; +def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>; +def WZR : AArch64Reg<31, "wzr">, DwarfRegAlias<WSP>; + +let SubRegIndices = [sub_32] in { +def X0 : AArch64Reg<0, "x0", [W0]>, DwarfRegAlias<W0>; +def X1 : AArch64Reg<1, "x1", [W1]>, DwarfRegAlias<W1>; +def X2 : AArch64Reg<2, "x2", [W2]>, DwarfRegAlias<W2>; +def X3 : AArch64Reg<3, "x3", [W3]>, DwarfRegAlias<W3>; +def X4 : AArch64Reg<4, "x4", [W4]>, DwarfRegAlias<W4>; +def X5 : AArch64Reg<5, "x5", [W5]>, DwarfRegAlias<W5>; +def X6 : AArch64Reg<6, "x6", [W6]>, 
DwarfRegAlias<W6>; +def X7 : AArch64Reg<7, "x7", [W7]>, DwarfRegAlias<W7>; +def X8 : AArch64Reg<8, "x8", [W8]>, DwarfRegAlias<W8>; +def X9 : AArch64Reg<9, "x9", [W9]>, DwarfRegAlias<W9>; +def X10 : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>; +def X11 : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>; +def X12 : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>; +def X13 : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>; +def X14 : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>; +def X15 : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>; +def X16 : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>; +def X17 : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>; +def X18 : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>; +def X19 : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>; +def X20 : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>; +def X21 : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>; +def X22 : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>; +def X23 : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>; +def X24 : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>; +def X25 : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>; +def X26 : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>; +def X27 : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>; +def X28 : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>; +def FP : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias<W29>; +def LR : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias<W30>; +def SP : AArch64Reg<31, "sp", [WSP]>, DwarfRegAlias<WSP>; +def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>; +} + +// Condition code register. +def NZCV : AArch64Reg<0, "nzcv">; + +// GPR register classes with the intersections of GPR32/GPR32sp and +// GPR64/GPR64sp for use by the coalescer. +def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { + let AltOrders = [(rotl GPR32common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64common : RegisterClass<"AArch64", [i64], 64, + (add (sequence "X%u", 0, 28), FP, LR)> { + let AltOrders = [(rotl GPR64common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +// GPR register classes which exclude SP/WSP. +def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> { + let AltOrders = [(rotl GPR32, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> { + let AltOrders = [(rotl GPR64, 8)]; + let AltOrderSelect = [{ return 1; }]; +} + +// GPR register classes which include SP/WSP. +def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> { + let AltOrders = [(rotl GPR32sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> { + let AltOrders = [(rotl GPR64sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} + +def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>; +def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>; + +def GPR64spPlus0Operand : AsmOperandClass { + let Name = "GPR64sp0"; + let RenderMethod = "addRegOperands"; + let ParserMethod = "tryParseGPR64sp0Operand"; +} + +def GPR64sp0 : RegisterOperand<GPR64sp> { + let ParserMatchClass = GPR64spPlus0Operand; +} + +// GPR register classes which include WZR/XZR AND SP/WSP. This is not a +// constraint used by any instructions, it is used as a common super-class. 
+def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>; +def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>; + +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// This is for indirect tail calls to store the address of the destination. +def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21, + X22, X23, X24, X25, X26, + X27, X28, FP, LR)>; + +// GPR register classes for post increment amount of vector load/store that +// has alternate printing when Rm=31 and prints a constant immediate value +// equal to the total number of bytes transferred. + +// FIXME: TableGen *should* be able to do these itself now. There appears to be +// a bug in counting how many operands a Post-indexed MCInst should have which +// means the aliases don't trigger. +def GPR64pi1 : RegisterOperand<GPR64, "printPostIncOperand<1>">; +def GPR64pi2 : RegisterOperand<GPR64, "printPostIncOperand<2>">; +def GPR64pi3 : RegisterOperand<GPR64, "printPostIncOperand<3>">; +def GPR64pi4 : RegisterOperand<GPR64, "printPostIncOperand<4>">; +def GPR64pi6 : RegisterOperand<GPR64, "printPostIncOperand<6>">; +def GPR64pi8 : RegisterOperand<GPR64, "printPostIncOperand<8>">; +def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand<12>">; +def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand<16>">; +def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand<24>">; +def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand<32>">; +def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">; +def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">; + +// Condition code regclass. +def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { + let CopyCost = -1; // Don't allow copying of status registers. + + // CCR is not allocatable. 
+ let isAllocatable = 0; +} + +//===----------------------------------------------------------------------===// +// Floating Point Scalar Registers +//===----------------------------------------------------------------------===// + +def B0 : AArch64Reg<0, "b0">, DwarfRegNum<[64]>; +def B1 : AArch64Reg<1, "b1">, DwarfRegNum<[65]>; +def B2 : AArch64Reg<2, "b2">, DwarfRegNum<[66]>; +def B3 : AArch64Reg<3, "b3">, DwarfRegNum<[67]>; +def B4 : AArch64Reg<4, "b4">, DwarfRegNum<[68]>; +def B5 : AArch64Reg<5, "b5">, DwarfRegNum<[69]>; +def B6 : AArch64Reg<6, "b6">, DwarfRegNum<[70]>; +def B7 : AArch64Reg<7, "b7">, DwarfRegNum<[71]>; +def B8 : AArch64Reg<8, "b8">, DwarfRegNum<[72]>; +def B9 : AArch64Reg<9, "b9">, DwarfRegNum<[73]>; +def B10 : AArch64Reg<10, "b10">, DwarfRegNum<[74]>; +def B11 : AArch64Reg<11, "b11">, DwarfRegNum<[75]>; +def B12 : AArch64Reg<12, "b12">, DwarfRegNum<[76]>; +def B13 : AArch64Reg<13, "b13">, DwarfRegNum<[77]>; +def B14 : AArch64Reg<14, "b14">, DwarfRegNum<[78]>; +def B15 : AArch64Reg<15, "b15">, DwarfRegNum<[79]>; +def B16 : AArch64Reg<16, "b16">, DwarfRegNum<[80]>; +def B17 : AArch64Reg<17, "b17">, DwarfRegNum<[81]>; +def B18 : AArch64Reg<18, "b18">, DwarfRegNum<[82]>; +def B19 : AArch64Reg<19, "b19">, DwarfRegNum<[83]>; +def B20 : AArch64Reg<20, "b20">, DwarfRegNum<[84]>; +def B21 : AArch64Reg<21, "b21">, DwarfRegNum<[85]>; +def B22 : AArch64Reg<22, "b22">, DwarfRegNum<[86]>; +def B23 : AArch64Reg<23, "b23">, DwarfRegNum<[87]>; +def B24 : AArch64Reg<24, "b24">, DwarfRegNum<[88]>; +def B25 : AArch64Reg<25, "b25">, DwarfRegNum<[89]>; +def B26 : AArch64Reg<26, "b26">, DwarfRegNum<[90]>; +def B27 : AArch64Reg<27, "b27">, DwarfRegNum<[91]>; +def B28 : AArch64Reg<28, "b28">, DwarfRegNum<[92]>; +def B29 : AArch64Reg<29, "b29">, DwarfRegNum<[93]>; +def B30 : AArch64Reg<30, "b30">, DwarfRegNum<[94]>; +def B31 : AArch64Reg<31, "b31">, DwarfRegNum<[95]>; + +let SubRegIndices = [bsub] in { +def H0 : AArch64Reg<0, "h0", [B0]>, DwarfRegAlias<B0>; +def H1 : AArch64Reg<1, "h1", [B1]>, DwarfRegAlias<B1>; +def H2 : AArch64Reg<2, "h2", [B2]>, DwarfRegAlias<B2>; +def H3 : AArch64Reg<3, "h3", [B3]>, DwarfRegAlias<B3>; +def H4 : AArch64Reg<4, "h4", [B4]>, DwarfRegAlias<B4>; +def H5 : AArch64Reg<5, "h5", [B5]>, DwarfRegAlias<B5>; +def H6 : AArch64Reg<6, "h6", [B6]>, DwarfRegAlias<B6>; +def H7 : AArch64Reg<7, "h7", [B7]>, DwarfRegAlias<B7>; +def H8 : AArch64Reg<8, "h8", [B8]>, DwarfRegAlias<B8>; +def H9 : AArch64Reg<9, "h9", [B9]>, DwarfRegAlias<B9>; +def H10 : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>; +def H11 : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>; +def H12 : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>; +def H13 : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>; +def H14 : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>; +def H15 : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>; +def H16 : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>; +def H17 : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>; +def H18 : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>; +def H19 : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>; +def H20 : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>; +def H21 : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>; +def H22 : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>; +def H23 : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>; +def H24 : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>; +def H25 : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>; +def H26 : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>; +def H27 : 
AArch64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>; +def H28 : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>; +def H29 : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>; +def H30 : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>; +def H31 : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>; +} + +let SubRegIndices = [hsub] in { +def S0 : AArch64Reg<0, "s0", [H0]>, DwarfRegAlias<B0>; +def S1 : AArch64Reg<1, "s1", [H1]>, DwarfRegAlias<B1>; +def S2 : AArch64Reg<2, "s2", [H2]>, DwarfRegAlias<B2>; +def S3 : AArch64Reg<3, "s3", [H3]>, DwarfRegAlias<B3>; +def S4 : AArch64Reg<4, "s4", [H4]>, DwarfRegAlias<B4>; +def S5 : AArch64Reg<5, "s5", [H5]>, DwarfRegAlias<B5>; +def S6 : AArch64Reg<6, "s6", [H6]>, DwarfRegAlias<B6>; +def S7 : AArch64Reg<7, "s7", [H7]>, DwarfRegAlias<B7>; +def S8 : AArch64Reg<8, "s8", [H8]>, DwarfRegAlias<B8>; +def S9 : AArch64Reg<9, "s9", [H9]>, DwarfRegAlias<B9>; +def S10 : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>; +def S11 : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>; +def S12 : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>; +def S13 : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>; +def S14 : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>; +def S15 : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>; +def S16 : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>; +def S17 : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>; +def S18 : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>; +def S19 : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>; +def S20 : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>; +def S21 : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>; +def S22 : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>; +def S23 : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>; +def S24 : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>; +def S25 : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>; +def S26 : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>; +def S27 : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>; +def S28 : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>; +def S29 : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>; +def S30 : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>; +def S31 : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>; +} + +let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { +def D0 : AArch64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>; +def D1 : AArch64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>; +def D2 : AArch64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>; +def D3 : AArch64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>; +def D4 : AArch64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>; +def D5 : AArch64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>; +def D6 : AArch64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>; +def D7 : AArch64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>; +def D8 : AArch64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>; +def D9 : AArch64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>; +def D10 : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>; +def D11 : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>; +def D12 : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>; +def D13 : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>; +def D14 : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>; +def D15 : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>; +def D16 : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>; +def D17 : AArch64Reg<17, "d17", [S17], ["v17", 
""]>, DwarfRegAlias<B17>; +def D18 : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>; +def D19 : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>; +def D20 : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>; +def D21 : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>; +def D22 : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>; +def D23 : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>; +def D24 : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>; +def D25 : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>; +def D26 : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>; +def D27 : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>; +def D28 : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>; +def D29 : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>; +def D30 : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>; +def D31 : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>; +} + +let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { +def Q0 : AArch64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>; +def Q1 : AArch64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>; +def Q2 : AArch64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>; +def Q3 : AArch64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>; +def Q4 : AArch64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>; +def Q5 : AArch64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>; +def Q6 : AArch64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>; +def Q7 : AArch64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>; +def Q8 : AArch64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>; +def Q9 : AArch64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>; +def Q10 : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>; +def Q11 : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>; +def Q12 : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>; +def Q13 : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>; +def Q14 : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>; +def Q15 : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>; +def Q16 : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>; +def Q17 : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>; +def Q18 : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>; +def Q19 : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>; +def Q20 : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>; +def Q21 : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>; +def Q22 : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>; +def Q23 : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>; +def Q24 : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>; +def Q25 : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>; +def Q26 : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>; +def Q27 : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>; +def Q28 : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>; +def Q29 : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>; +def Q30 : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>; +def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>; +} + +def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> { + let Size = 8; +} +def 
FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { + let Size = 16; +} +def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; +def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, + v1i64, v4f16], + 64, (sequence "D%u", 0, 31)>; +// We don't (yet) have an f128 legal type, so don't use that here. We +// normalize 128-bit vectors to v2f64 for arg passing and such, so use +// that here. +def FPR128 : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128, + v8f16], + 128, (sequence "Q%u", 0, 31)>; + +// The lower 16 vector registers. Some instructions can only take registers +// in this range. +def FPR128_lo : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], + 128, (trunc FPR128, 16)>; + +// Pairs, triples, and quads of 64-bit vector registers. +def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; +def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2)]>; +def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2), (rotl FPR64, 3)]>; +def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> { + let Size = 128; +} +def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> { + let Size = 196; +} +def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> { + let Size = 256; +} + +// Pairs, triples, and quads of 128-bit vector registers. +def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; +def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2)]>; +def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2), (rotl FPR128, 3)]>; +def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> { + let Size = 256; +} +def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> { + let Size = 384; +} +def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { + let Size = 512; +} + + +// Vector operand versions of the FP registers. Alternate name printing and +// assmebler matching. 
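+// V64/V128 below print using the "v" alternate register names
+// (printVRegOperand) and are matched by the assembler parser via isVectorReg.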
+def VectorReg64AsmOperand : AsmOperandClass { + let Name = "VectorReg64"; + let PredicateMethod = "isVectorReg"; +} +def VectorReg128AsmOperand : AsmOperandClass { + let Name = "VectorReg128"; + let PredicateMethod = "isVectorReg"; +} + +def V64 : RegisterOperand<FPR64, "printVRegOperand"> { + let ParserMatchClass = VectorReg64AsmOperand; +} + +def V128 : RegisterOperand<FPR128, "printVRegOperand"> { + let ParserMatchClass = VectorReg128AsmOperand; +} + +def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } +def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> { + let ParserMatchClass = VectorRegLoAsmOperand; +} + +class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind> + : AsmOperandClass { + let Name = "TypedVectorList" # count # "_" # lanes # kind; + + let PredicateMethod + = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; + let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; +} + +class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind> + : RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '" + # kind # "'>">; + +multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> { + // With implicit types (probably on instruction instead). E.g. { v0, v1 } + def _64AsmOperand : AsmOperandClass { + let Name = NAME # "64"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList64Operands<" # count # ">"; + } + + def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand"); + } + + def _128AsmOperand : AsmOperandClass { + let Name = NAME # "128"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList128Operands<" # count # ">"; + } + + def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand"); + } + + // 64-bit register lists with explicit type. 
+ + // { v0.8b, v1.8b } + def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">; + def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand"); + } + + // { v0.4h, v1.4h } + def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">; + def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand"); + } + + // { v0.2s, v1.2s } + def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">; + def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand"); + } + + // { v0.1d, v1.1d } + def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">; + def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand"); + } + + // 128-bit register lists with explicit type + + // { v0.16b, v1.16b } + def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">; + def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand"); + } + + // { v0.8h, v1.8h } + def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">; + def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand"); + } + + // { v0.4s, v1.4s } + def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">; + def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand"); + } + + // { v0.2d, v1.2d } + def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">; + def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand"); + } + + // { v0.b, v1.b } + def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">; + def "b" : TypedVecListRegOperand<Reg128, 0, "b"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand"); + } + + // { v0.h, v1.h } + def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">; + def "h" : TypedVecListRegOperand<Reg128, 0, "h"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand"); + } + + // { v0.s, v1.s } + def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">; + def "s" : TypedVecListRegOperand<Reg128, 0, "s"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand"); + } + + // { v0.d, v1.d } + def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">; + def "d" : TypedVecListRegOperand<Reg128, 0, "d"> { + let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand"); + } + + +} + +defm VecListOne : VectorList<1, FPR64, FPR128>; +defm VecListTwo : VectorList<2, DD, QQ>; +defm VecListThree : VectorList<3, DDD, QQQ>; +defm VecListFour : VectorList<4, DDDD, QQQQ>; + + +// Register operand versions of the scalar FP registers. 
+def FPR16Op : RegisterOperand<FPR16, "printOperand">; +def FPR32Op : RegisterOperand<FPR32, "printOperand">; +def FPR64Op : RegisterOperand<FPR64, "printOperand">; +def FPR128Op : RegisterOperand<FPR128, "printOperand">; + + +//===----------------------------------------------------------------------===// +// ARMv8.1a atomic CASP register operands + + +def WSeqPairs : RegisterTuples<[sube32, subo32], + [(rotl GPR32, 0), (rotl GPR32, 1)]>; +def XSeqPairs : RegisterTuples<[sube64, subo64], + [(rotl GPR64, 0), (rotl GPR64, 1)]>; + +def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, + (add WSeqPairs)>{ + let Size = 64; +} +def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64, + (add XSeqPairs)>{ + let Size = 128; +} + + +let RenderMethod = "addRegOperands", ParserMethod="tryParseGPRSeqPair" in { + def WSeqPairsAsmOperandClass : AsmOperandClass { let Name = "WSeqPair"; } + def XSeqPairsAsmOperandClass : AsmOperandClass { let Name = "XSeqPair"; } +} + +def WSeqPairClassOperand : + RegisterOperand<WSeqPairsClass, "printGPRSeqPairsClassOperand<32>"> { + let ParserMatchClass = WSeqPairsAsmOperandClass; +} +def XSeqPairClassOperand : + RegisterOperand<XSeqPairsClass, "printGPRSeqPairsClassOperand<64>"> { + let ParserMatchClass = XSeqPairsAsmOperandClass; +} + + +//===----- END: v8.1a atomic CASP register operands -----------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td new file mode 100644 index 0000000..d709bee --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td @@ -0,0 +1,291 @@ +//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM Cortex A53 processors. +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler. See MCSchedModel.h for details. + +// Cortex-A53 machine model for scheduling and other instruction cost heuristics. +def CortexA53Model : SchedMachineModel { + let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation + // Specification - Instruction Timings" + // v 1.0 Spreadsheet +} + + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. + +// Modeling each pipeline as a ProcResource using the BufferSize = 0 since +// Cortex-A53 is in-order. 
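+// A BufferSize of 0 marks a resource as unbuffered: micro-ops cannot queue
+// behind a busy unit, which is how the in-order pipelines are approximated.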
+ +def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU +def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC +def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division +def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store +def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch +def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU +def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt + + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types which both map the ProcResources and +// set the latency. + +let SchedModel = CortexA53Model in { + +// ALU - Despite having a full latency of 4, most of the ALU instructions can +// forward a cycle earlier and then two cycles earlier in the case of a +// shift-only instruction. These latencies will be incorrect when the +// result cannot be forwarded, but modeling isn't rocket surgery. +def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 3; } +def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; } +def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 3; } +def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 3; } +def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 2; } +def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 3; } + +// MAC +def : WriteRes<WriteIM32, [A53UnitMAC]> { let Latency = 4; } +def : WriteRes<WriteIM64, [A53UnitMAC]> { let Latency = 4; } + +// Div +def : WriteRes<WriteID32, [A53UnitDiv]> { let Latency = 4; } +def : WriteRes<WriteID64, [A53UnitDiv]> { let Latency = 4; } + +// Load +def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; } +def : WriteRes<WriteLDIdx, [A53UnitLdSt]> { let Latency = 4; } +def : WriteRes<WriteLDHi, [A53UnitLdSt]> { let Latency = 4; } + +// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd +// below, choosing the median of 3 which makes the latency 6. +// May model this more carefully in the future. The remaining +// A53WriteVLD# types represent the 1-5 cycle issues explicitly. +def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } +def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } +def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } +def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7; + let ResourceCycles = [4]; } +def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8; + let ResourceCycles = [5]; } + +// Pre/Post Indexing - Performed as part of address generation which is already +// accounted for in the WriteST* latencies below +def : WriteRes<WriteAdr, []> { let Latency = 0; } + +// Store +def : WriteRes<WriteST, [A53UnitLdSt]> { let Latency = 4; } +def : WriteRes<WriteSTP, [A53UnitLdSt]> { let Latency = 4; } +def : WriteRes<WriteSTIdx, [A53UnitLdSt]> { let Latency = 4; } +def : WriteRes<WriteSTX, [A53UnitLdSt]> { let Latency = 4; } + +// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. 
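+// The default WriteVST below models the middle (2-cycle issue) case; the
+// A53WriteVST# types cover the 1-3 cycle issue counts explicitly, mirroring
+// the A53WriteVLD# types above.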
+def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2];} +def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } +def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } + +// Branch +def : WriteRes<WriteBr, [A53UnitB]>; +def : WriteRes<WriteBrReg, [A53UnitB]>; +def : WriteRes<WriteSys, [A53UnitB]>; +def : WriteRes<WriteBarrier, [A53UnitB]>; +def : WriteRes<WriteHint, [A53UnitB]>; + +// FP ALU +def : WriteRes<WriteF, [A53UnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCmp, [A53UnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCvt, [A53UnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFCopy, [A53UnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteFImm, [A53UnitFPALU]> { let Latency = 6; } +def : WriteRes<WriteV, [A53UnitFPALU]> { let Latency = 6; } + +// FP Mul, Div, Sqrt +def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; } +def : WriteRes<WriteFDiv, [A53UnitFPMDS]> { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; } +def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18; + let ResourceCycles = [14]; } +def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17; + let ResourceCycles = [13]; } +def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32; + let ResourceCycles = [28]; } + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedRead types. + +// No forwarding for these reads. +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable +// operands are needed one cycle later if and only if they are to be +// shifted. Otherwise, they too are needed two cycles later. This same +// ReadAdvance applies to Extended registers as well, even though there is +// a separate SchedPredicate for them. +def : ReadAdvance<ReadI, 2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadISReg : SchedReadVariant<[ + SchedVar<RegShiftedPred, [A53ReadShifted]>, + SchedVar<NoSchedPred, [A53ReadNotShifted]>]>; +def : SchedAlias<ReadISReg, A53ReadISReg>; + +def A53ReadIEReg : SchedReadVariant<[ + SchedVar<RegExtendedPred, [A53ReadShifted]>, + SchedVar<NoSchedPred, [A53ReadNotShifted]>]>; +def : SchedAlias<ReadIEReg, A53ReadIEReg>; + +// MAC - Operands are generally needed one cycle later in the MAC pipe. +// Accumulator operands are needed two cycles later. 
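+// A ReadAdvance of N subtracts N cycles from the producing write's latency
+// for that operand, which is how forwarding into the MAC pipe is modeled.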
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; + +// Div +def : ReadAdvance<ReadID, 1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; + +//===----------------------------------------------------------------------===// +// Subtarget-specific InstRWs. + +//--- +// Miscellaneous +//--- +def : InstRW<[WriteI], (instrs COPY)>; + +//--- +// Vector Loads +//--- +def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +//--- +// 
Vector Stores +//--- +def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +//--- +// Floating Point MAC, DIV, SQRT +//--- +def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td new file mode 100644 index 0000000..ca4457a --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td @@ -0,0 +1,661 @@ +//=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for ARM Cortex-A57 to support +// instruction scheduling and other instruction cost heuristics. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The Cortex-A57 is a traditional superscaler microprocessor with a +// conservative 3-wide in-order stage for decode and dispatch. Combined with the +// much wider out-of-order issue stage, this produced a need to carefully +// schedule micro-ops so that all three decoded each cycle are successfully +// issued as the reservation station(s) simply don't stay occupied for long. +// Therefore, IssueWidth is set to the narrower of the two at three, while still +// modeling the machine as out-of-order. + +def CortexA57Model : SchedMachineModel { + let IssueWidth = 3; // 3-way decode and dispatch + let MicroOpBufferSize = 128; // 128 micro-op re-order buffer + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch + + // Enable partial & runtime unrolling. The magic number is chosen based on + // experiments and benchmarking data. + let LoopMicroOpBufferSize = 16; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Cortex-A57. +// Cortex A-57 has 8 pipelines that each has its own 8-entry queue where +// micro-ops wait for their operands and then issue out-of-order. + +def A57UnitB : ProcResource<1>; // Type B micro-ops +def A57UnitI : ProcResource<2>; // Type I micro-ops +def A57UnitM : ProcResource<1>; // Type M micro-ops +def A57UnitL : ProcResource<1>; // Type L micro-ops +def A57UnitS : ProcResource<1>; // Type S micro-ops +def A57UnitX : ProcResource<1>; // Type X micro-ops +def A57UnitW : ProcResource<1>; // Type W micro-ops +let SchedModel = CortexA57Model in { + def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops +} + +let SchedModel = CortexA57Model in { + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Cortex-A57. + +include "AArch64SchedA57WriteRes.td" + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latency for +// Cortex-A57. The Cortex-A57 types are directly associated with resources, so +// defining the aliases precludes the need for mapping them using WriteRes. The +// aliases are sufficient for creating a coarse, working model. As the model +// evolves, InstRWs will be used to override some of these SchedAliases. +// +// WARNING: Using SchedAliases is convenient and works well for latency and +// resource lookup for instructions. However, this creates an entry in +// AArch64WriteLatencyTable with a WriteResourceID of 0, breaking +// any SchedReadAdvance since the lookup will fail. 
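+// That is why WriteIM32/WriteIM64 below get real WriteRes entries rather than
+// SchedAliases: the ReadAdvance on ReadIMA needs valid write resource IDs for
+// those writes in order to model the accumulator forwarding.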
+ +def : SchedAlias<WriteImm, A57Write_1cyc_1I>; +def : SchedAlias<WriteI, A57Write_1cyc_1I>; +def : SchedAlias<WriteISReg, A57Write_2cyc_1M>; +def : SchedAlias<WriteIEReg, A57Write_2cyc_1M>; +def : SchedAlias<WriteExtr, A57Write_1cyc_1I>; +def : SchedAlias<WriteIS, A57Write_1cyc_1I>; +def : SchedAlias<WriteID32, A57Write_19cyc_1M>; +def : SchedAlias<WriteID64, A57Write_35cyc_1M>; +def : WriteRes<WriteIM32, [A57UnitM]> { let Latency = 3; } +def : WriteRes<WriteIM64, [A57UnitM]> { let Latency = 5; } +def : SchedAlias<WriteBr, A57Write_1cyc_1B>; +def : SchedAlias<WriteBrReg, A57Write_1cyc_1B>; +def : SchedAlias<WriteLD, A57Write_4cyc_1L>; +def : SchedAlias<WriteST, A57Write_1cyc_1S>; +def : SchedAlias<WriteSTP, A57Write_1cyc_1S>; +def : SchedAlias<WriteAdr, A57Write_1cyc_1I>; +def : SchedAlias<WriteLDIdx, A57Write_4cyc_1I_1L>; +def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>; +def : SchedAlias<WriteF, A57Write_3cyc_1V>; +def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>; +def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>; +def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>; +def : SchedAlias<WriteFImm, A57Write_3cyc_1V>; +def : SchedAlias<WriteFMul, A57Write_5cyc_1V>; +def : SchedAlias<WriteFDiv, A57Write_18cyc_1X>; +def : SchedAlias<WriteV, A57Write_3cyc_1V>; +def : SchedAlias<WriteVLD, A57Write_5cyc_1L>; +def : SchedAlias<WriteVST, A57Write_1cyc_1S>; + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteLDHi, []> { let Latency = 4; } + +// Forwarding logic is only modeled for multiply and accumulate +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + + +//===----------------------------------------------------------------------===// +// Specialize the coarse model by associating instruction groups with the +// subtarget-defined types. As the modeled is refined, this will override most +// of the above ShchedAlias mappings. 
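+// An InstRW only overrides the mapping for the instructions its regex
+// matches; everything else keeps the alias defaults above.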
+ +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], (instrs COPY)>; + + +// Branch Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1B_1I], (instrs BL)>; +def : InstRW<[A57Write_2cyc_1B_1I], (instrs BLR)>; + + +// Shifted Register with Shift == 0 +// ---------------------------------------------------------------------------- + +def A57WriteISReg : SchedWriteVariant<[ + SchedVar<RegShiftedPred, [WriteISReg]>, + SchedVar<NoSchedPred, [WriteI]>]>; +def : InstRW<[A57WriteISReg], (instregex ".*rs$")>; + + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- + +// Multiply high +def : InstRW<[A57Write_6cyc_1M], (instrs SMULHrr, UMULHrr)>; + + +// Miscellaneous Data-Processing Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1I], (instrs EXTRWrri)>; +def : InstRW<[A57Write_3cyc_1I_1M], (instrs EXTRXrri)>; +def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>; + + +// Cryptography Extensions +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>; +def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>; +def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^CRC32")>; + + +// Vector Load +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1i(8|16|32)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1i(64)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Rv(1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Rv(1d)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_7cyc_3L], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_7cyc_3L, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], 
(instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2i(8|16)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD2i(32)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2i(32)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2i(64)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_5cyc_1L], (instregex "LD2Rv(1d)$")>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instregex "LD2Rv(1d)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD2Twov(2d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD2Twov(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3i(8|16)$")>; +def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3i(8|16)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3i(32)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3i(32)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD3i(64)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD3Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD3Rv(1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD3Rv(1d)_POST$")>; +def : InstRW<[A57Write_9cyc_1L_3V], (instregex "LD3Rv(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD3Rv(2d)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD3Rv(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_10cyc_3L_4V], (instregex "LD3Threev(16b|8h|4s)$")>; +def : InstRW<[A57Write_10cyc_3L_4V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD3Threev(2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(8|16)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(8|16)_POST$")>; +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4i(32)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4i(32)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4i(64)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex 
"LD4i(64)_POST$")>; + +def : InstRW<[A57Write_8cyc_1L_2V], (instregex "LD4Rv(8b|4h|2s)$")>; +def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_2L], (instregex "LD4Rv(1d)$")>; +def : InstRW<[A57Write_6cyc_2L, WriteAdr], (instregex "LD4Rv(1d)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_3V], (instregex "LD4Rv(16b|8h|4s)$")>; +def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_9cyc_2L_4V], (instregex "LD4Rv(2d)$")>; +def : InstRW<[A57Write_9cyc_2L_4V, WriteAdr], (instregex "LD4Rv(2d)_POST$")>; + +def : InstRW<[A57Write_9cyc_2L_2V], (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_11cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[A57Write_11cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_8cyc_4L], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +// Vector Store +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_1cyc_1S], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1i(8|16|32)_POST$")>; +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST1i(64)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST1i(64)_POST$")>; + +def : InstRW<[A57Write_1cyc_1S], (instregex "ST1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_1cyc_1S, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_3cyc_3S], (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_6cyc_6S], (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[A57Write_8cyc_8S], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST2i(8|16|32)_POST$")>; +def : InstRW<[A57Write_2cyc_2S], (instregex "ST2i(64)$")>; +def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST2i(64)_POST$")>; + +def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST2Twov(16b|8h|4s)$")>; +def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST2Twov(2d)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST2Twov(2d)_POST$")>; + +def : 
InstRW<[A57Write_3cyc_1S_1V], (instregex "ST3i(8|16)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST3i(8|16)_POST$")>; +def : InstRW<[A57Write_3cyc_3S], (instregex "ST3i(32)$")>; +def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST3i(32)_POST$")>; +def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST3i(64)$")>; +def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST3i(64)_POST$")>; + +def : InstRW<[A57Write_3cyc_3S_2V], (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[A57Write_3cyc_3S_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_6cyc_6S_4V], (instregex "ST3Threev(16b|8h|4s)$")>; +def : InstRW<[A57Write_6cyc_6S_4V, WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_6cyc_6S], (instregex "ST3Threev(2d)$")>; +def : InstRW<[A57Write_6cyc_6S, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[A57Write_3cyc_1S_1V], (instregex "ST4i(8|16)$")>; +def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST4i(8|16)_POST$")>; +def : InstRW<[A57Write_4cyc_4S], (instregex "ST4i(32)$")>; +def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST4i(32)_POST$")>; +def : InstRW<[A57Write_3cyc_2S_1V], (instregex "ST4i(64)$")>; +def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST4i(64)_POST$")>; + +def : InstRW<[A57Write_4cyc_4S_2V], (instregex "ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[A57Write_8cyc_8S_4V], (instregex "ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[A57Write_8cyc_8S], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +// Vector - Integer +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v8i8, v4i16, v2i32 +// Q form - v16i8, v8i16, v4i32 +// D form - v1i8, v1i16, v1i32, v1i64 +// Q form - v16i8, v8i16, v4i32, v2i64 +// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64 +// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64 + +// ASIMD absolute diff accum, D-form +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>; + +// ASIMD arith, reduce, 4H/4S +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B/8H +def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B +def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU]?ADDL?Vv16i8v$")>; + +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B +def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>; + +// ASIMD multiply, D-form +def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; +// ASIMD multiply, Q-form +def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; + +// ASIMD multiply accumulate, D-form +def : InstRW<[A57Write_5cyc_1W], (instregex 
"^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; + +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>; +def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>; + +// ASIMD multiply long +def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>; +def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>; +def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; + +// ASIMD shift by immed, complex +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>; + + +// ASIMD shift by register, basic, Q-form +def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD shift by register, complex, D-form +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; + +// ASIMD shift by register, complex, Q-form +def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + + +// Vector - Floating Point +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v2f32 +// Q form - v4f32, v2f64 +// D form - 32, 64 +// D form - v1i32, v1i64 +// D form - v2i32 +// Q form - v4i32, v2i64 + +// ASIMD FP arith, normal, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^(FABD|FADD|FSUB)(v2f32|32|64|v2i32p)")>; +// ASIMD FP arith, normal, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^(FABD|FADD|FSUB)(v4f32|v2f64|v2i64p)")>; + +// ASIMD FP arith, pairwise, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^FADDP(v2f32|32|64|v2i32)")>; +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[A57Write_9cyc_3V], (instregex "^FADDP(v4f32|v2f64|v2i64)")>; + +// ASIMD FP compare, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v2f32|32|64|v1i32|v2i32|v1i64)")>; +// ASIMD FP compare, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP convert, long and narrow +def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[A57Write_18cyc_1X], (instregex "FDIVv2f32")>; +// ASIMD FP divide, Q-form, F32 +def : InstRW<[A57Write_36cyc_2X], (instregex "FDIVv4f32")>; +// ASIMD FP divide, Q-form, F64 +def : InstRW<[A57Write_64cyc_2X], (instregex "FDIVv2f64")>; + +// Note: These were simply duplicated from ASIMD FDIV because of missing documentation +// ASIMD FP square root, D-form, F32 +def : InstRW<[A57Write_18cyc_1X], (instregex "FSQRTv2f32")>; 
+// ASIMD FP square root, Q-form, F32 +def : InstRW<[A57Write_36cyc_2X], (instregex "FSQRTv4f32")>; +// ASIMD FP square root, Q-form, F64 +def : InstRW<[A57Write_64cyc_2X], (instregex "FSQRTv2f64")>; + +// ASIMD FP max/min, normal, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?(v2f32)")>; +// ASIMD FP max/min, normal, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^(FMAX|FMIN)(NM)?(v4f32|v2f64)")>; +// ASIMD FP max/min, pairwise, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?P(v2f32|v2i32)")>; +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i64)")>; +// ASIMD FP max/min, reduce +def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>; + +// ASIMD FP multiply, D-form, FZ +def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +// ASIMD FP multiply, Q-form, FZ +def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP multiply accumulate, D-form, FZ +// ASIMD FP multiply accumulate, Q-form, FZ +def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; } +def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>; +def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP round, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; + + +// Vector - Miscellaneous +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v8i8, v4i16, v2i32 +// Q form - v16i8, v8i16, v4i32 +// D form - v1i8, v1i16, v1i32, v1i64 +// Q form - v16i8, v8i16, v4i32, v2i64 + +// ASIMD bitwise insert, Q-form +def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>; + +// ASIMD duplicate, gen reg, D-form and Q-form +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>; + +// ASIMD move, saturating +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]QXTU?N")>; + +// ASIMD reciprocal estimate, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD reciprocal estimate, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f64|v4f32|v4i32)")>; + +// ASIMD reciprocal step, D-form, FZ +def : InstRW<[A57Write_9cyc_1V], (instregex "^F(RECP|RSQRT)S(v2f32|v1i32|v2i32|v1i64|32|64)")>; +// ASIMD reciprocal step, Q-form, FZ +def : InstRW<[A57Write_9cyc_2V], (instregex "^F(RECP|RSQRT)S(v2f64|v4f32|v4i32)")>; + +// ASIMD table lookup, D-form +def : InstRW<[A57Write_3cyc_1V], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[A57Write_6cyc_2V], (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[A57Write_9cyc_3V], (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[A57Write_12cyc_4V], (instregex "^TB[LX]v8i8Four")>; +// ASIMD table lookup, Q-form +def : InstRW<[A57Write_6cyc_3V], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[A57Write_9cyc_5V], (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[A57Write_12cyc_7V], (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[A57Write_15cyc_9V], (instregex "^TB[LX]v16i8Four")>; + +// ASIMD transfer, 
element to gen reg +def : InstRW<[A57Write_6cyc_1I_1L], (instregex "^[SU]MOVv")>; + +// ASIMD transfer, gen reg to element +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^INSv")>; + +// ASIMD unzip/zip, Q-form +def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>; + + +// Remainder +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>; + +def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>; +def A57ReadFPM : SchedReadAdvance<0>; +def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; + +def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; +def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[SU]CVTF")>; + +def : InstRW<[A57Write_32cyc_1X], (instrs FDIVDrr)>; +def : InstRW<[A57Write_18cyc_1X], (instrs FDIVSrr)>; + +def : InstRW<[A57Write_5cyc_1V], (instregex "^F(MAX|MIN).+rr")>; + +def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT.+r")>; + +def : InstRW<[A57Write_32cyc_1X], (instrs FSQRTDr)>; +def : InstRW<[A57Write_18cyc_1X], (instrs FSQRTSr)>; + +def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpre)>; +def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpost)>; +def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpre)>; +def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpost)>; +def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpre)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpre)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRBpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRBui)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRDl)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRDpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRDui)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroW)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRHpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRHui)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRQl)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRQpost)>; +def : 
InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroW)>; +def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRQui)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroW)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRSl)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRSpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRSui)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURBi)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURDi)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURHi)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURQi)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURSi)>; + +def : InstRW<[A57Write_2cyc_2S], (instrs STNPDi)>; +def : InstRW<[A57Write_4cyc_1I_4S], (instrs STNPQi)>; +def : InstRW<[A57Write_2cyc_2S], (instrs STNPXi)>; +def : InstRW<[A57Write_2cyc_2S], (instrs STPDi)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpost)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpre)>; +def : InstRW<[A57Write_4cyc_1I_4S], (instrs STPQi)>; +def : InstRW<[WriteAdr, A57Write_4cyc_1I_4S], (instrs STPQpost)>; +def : InstRW<[WriteAdr, A57Write_4cyc_2I_4S], (instrs STPQpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpre)>; +def : InstRW<[A57Write_2cyc_2S], (instrs STPXi)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpost)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRBpre)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroW)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroX)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRDpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRDpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpre)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroW)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroX)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRHpre)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroW)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroX)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQpost)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STRQpre)>; +def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroW)>; +def : 
InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroX)>; +def : InstRW<[A57Write_2cyc_1I_2S], (instrs STRQui)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRSpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRSpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpre)>; +def : InstRW<[A57Write_2cyc_2S], (instrs STURQi)>; + +} // SchedModel = CortexA57Model diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td new file mode 100644 index 0000000..6f30108 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57WriteRes.td @@ -0,0 +1,544 @@ +//=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach +// below is to define a generic SchedWriteRes for every combination of +// latency and microOps. The naming conventions is to use a prefix, one field +// for latency, and one or more microOp count/type designators. +// Prefix: A57Write +// Latency: #cyc +// MicroOp Count/Types: #(B|I|M|L|S|X|W|V) +// +// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are +// 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes. 
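+//
+// As a concrete illustration (mirroring a def that appears below), the type
+// A57Write_5cyc_1I_1L is declared as:
+//   def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> {
+//     let Latency = 5;
+//     let NumMicroOps = 2;
+//   }
+// i.e. a 5-cycle result built from one I micro-op and one L micro-op.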
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define Generic 1 micro-op types + +def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; } +def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } +def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } +def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; + let ResourceCycles = [18]; } +def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; + let ResourceCycles = [19]; } +def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; } +def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; } +def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; } +def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; } +def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; + let ResourceCycles = [32]; } +def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; + let ResourceCycles = [35]; } +def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; } +def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; } +def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; } +def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; } +def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; } +def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; } +def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } + + +//===----------------------------------------------------------------------===// +// Define Generic 2 micro-op types + +def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 64; + let NumMicroOps = 2; + let ResourceCycles = [32, 32]; +} +def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV, + A57UnitX]> { + let Latency = 7; + let NumMicroOps = 2; +} +def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 8; + let NumMicroOps = 2; +} +def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 2; +} +def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 8; + let NumMicroOps = 2; +} +def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> { + let Latency = 6; + let NumMicroOps = 2; +} +def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 5; + let NumMicroOps = 2; +} +def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, + A57UnitV]> { + let Latency = 10; + let NumMicroOps = 2; +} +def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 
10; + let NumMicroOps = 2; +} +def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 1; + let NumMicroOps = 2; +} +def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} +def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB, + A57UnitI]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 2; + let NumMicroOps = 2; +} +def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 36; + let NumMicroOps = 2; + let ResourceCycles = [18, 18]; +} +def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI, + A57UnitM]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI, + A57UnitS]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} +def A57Write_4cyc_1I_1L : SchedWriteRes<[A57UnitI, + A57UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} +def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 4; + let NumMicroOps = 2; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 3 micro-op types + +def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 3; +} +def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} +def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI, + A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_1M_2S : SchedWriteRes<[A57UnitM, + A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_3S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_3cyc_2S_1V : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 3; +} +def A57Write_5cyc_1I_2L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 5; + let NumMicroOps = 3; +} +def A57Write_6cyc_1I_2L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} +def A57Write_6cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 3; +} +def A57Write_7cyc_3L : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL]> { + let Latency = 7; + let NumMicroOps = 3; +} +def A57Write_8cyc_1I_1L_1V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_8cyc_1L_2V : SchedWriteRes<[A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_8cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 3; +} +def A57Write_9cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 3; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 4 micro-op types + +def A57Write_2cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS]> { + let Latency = 2; + let NumMicroOps = 4; +} +def A57Write_3cyc_2I_2S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS]> { + let Latency 
= 3; + let NumMicroOps = 4; +} +def A57Write_3cyc_1I_3S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 3; + let NumMicroOps = 4; +} +def A57Write_3cyc_1I_2S_1V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitV]> { + let Latency = 3; + let NumMicroOps = 4; +} +def A57Write_4cyc_4S : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 4; +} +def A57Write_7cyc_1I_3L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, A57UnitL]> { + let Latency = 7; + let NumMicroOps = 4; +} +def A57Write_5cyc_2I_2L : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitL, A57UnitL]> { + let Latency = 5; + let NumMicroOps = 4; +} +def A57Write_8cyc_1I_1L_2V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 4; +} +def A57Write_8cyc_4L : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitL, A57UnitL]> { + let Latency = 8; + let NumMicroOps = 4; +} +def A57Write_9cyc_2L_2V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} +def A57Write_9cyc_1L_3V : SchedWriteRes<[A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 4; +} +def A57Write_12cyc_4V : SchedWriteRes<[A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 12; + let NumMicroOps = 4; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 5 micro-op types + +def A57Write_3cyc_3S_2V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 5; +} +def A57Write_8cyc_1I_4L : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitL, A57UnitL]> { + let Latency = 8; + let NumMicroOps = 5; +} +def A57Write_4cyc_1I_4S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 5; +} +def A57Write_9cyc_1I_2L_2V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} +def A57Write_9cyc_1I_1L_3V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} +def A57Write_9cyc_2L_3V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} +def A57Write_9cyc_5V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 6 micro-op types + +def A57Write_3cyc_1I_3S_2V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 6; +} +def A57Write_4cyc_2I_4S : SchedWriteRes<[A57UnitI, A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 4; + let NumMicroOps = 6; +} +def A57Write_4cyc_4S_2V : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 4; + let NumMicroOps = 6; +} +def A57Write_6cyc_6S : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 6; + let NumMicroOps = 6; +} +def A57Write_9cyc_1I_2L_3V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} +def A57Write_9cyc_1I_1L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} +def 
A57Write_9cyc_2L_4V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 6; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 7 micro-op types + +def A57Write_10cyc_3L_4V : SchedWriteRes<[A57UnitL, A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 7; +} +def A57Write_4cyc_1I_4S_2V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV]> { + let Latency = 4; + let NumMicroOps = 7; +} +def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS]> { + let Latency = 6; + let NumMicroOps = 7; +} +def A57Write_9cyc_1I_2L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 7; +} +def A57Write_12cyc_7V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 12; + let NumMicroOps = 7; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 8 micro-op types + +def A57Write_10cyc_1I_3L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 10; + let NumMicroOps = 8; +} +def A57Write_11cyc_4L_4V : SchedWriteRes<[A57UnitL, A57UnitL, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 11; + let NumMicroOps = 8; +} +def A57Write_8cyc_8S : SchedWriteRes<[A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 8; + let NumMicroOps = 8; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 9 micro-op types + +def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { + let Latency = 8; + let NumMicroOps = 9; +} +def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI, + A57UnitL, A57UnitL, + A57UnitL, A57UnitL, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 11; + let NumMicroOps = 9; +} +def A57Write_15cyc_9V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV, + A57UnitV, A57UnitV, A57UnitV, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 15; + let NumMicroOps = 9; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 10 micro-op types + +def A57Write_6cyc_6S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 10; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 11 micro-op types + +def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 6; + let NumMicroOps = 11; +} + + +//===----------------------------------------------------------------------===// +// Define Generic 12 micro-op types + +def A57Write_8cyc_8S_4V : SchedWriteRes<[A57UnitS, A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 12; +} + +//===----------------------------------------------------------------------===// +// Define Generic 13 
micro-op types + +def A57Write_8cyc_1I_8S_4V : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 8; + let NumMicroOps = 13; +} + diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td new file mode 100644 index 0000000..a2a1802 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td @@ -0,0 +1,865 @@ +//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AArch64 Cyclone to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def CycloneModel : SchedMachineModel { + let IssueWidth = 6; // 6 micro-ops are dispatched per cycle. + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 16; // 14-19 cycles are typical. +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Cyclone. + +// 4 integer pipes +def CyUnitI : ProcResource<4> { + let BufferSize = 48; +} + +// 2 branch units: I[0..1] +def CyUnitB : ProcResource<2> { + let Super = CyUnitI; + let BufferSize = 24; +} + +// 1 indirect-branch unit: I[0] +def CyUnitBR : ProcResource<1> { + let Super = CyUnitB; +} + +// 2 shifter pipes: I[2..3] +// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI +def CyUnitIS : ProcResource<2> { + let Super = CyUnitI; + let BufferSize = 24; +} + +// 1 mul pipe: I[0] +def CyUnitIM : ProcResource<1> { + let Super = CyUnitBR; + let BufferSize = 32; +} + +// 1 div pipe: I[1] +def CyUnitID : ProcResource<1> { + let Super = CyUnitB; + let BufferSize = 16; +} + +// 1 integer division unit. This is driven by the ID pipe, but only +// consumes the pipe for one cycle at issue and another cycle at writeback. +def CyUnitIntDiv : ProcResource<1>; + +// 2 ld/st pipes. +def CyUnitLS : ProcResource<2> { + let BufferSize = 28; +} + +// 3 fp/vector pipes. +def CyUnitV : ProcResource<3> { + let BufferSize = 48; +} +// 2 fp/vector arithmetic and multiply pipes: V[0-1] +def CyUnitVM : ProcResource<2> { + let Super = CyUnitV; + let BufferSize = 32; +} +// 1 fp/vector division/sqrt pipe: V[2] +def CyUnitVD : ProcResource<1> { + let Super = CyUnitV; + let BufferSize = 16; +} +// 1 fp compare pipe: V[0] +def CyUnitVC : ProcResource<1> { + let Super = CyUnitVM; + let BufferSize = 16; +} + +// 2 fp division/square-root units. These are driven by the VD pipe, +// but only consume the pipe for one cycle at issue and a cycle at writeback. +def CyUnitFloatDiv : ProcResource<2>; + +//===----------------------------------------------------------------------===// +// Define scheduler read/write resources and latency on Cyclone. +// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1. + +let SchedModel = CycloneModel in { + +//--- +// 7.8.1. Moves +//--- + +// A single nop micro-op (uX). +def WriteX : SchedWriteRes<[]> { let Latency = 0; } + +// Move zero is a register rename (to machine register zero). 
+// The move is replaced by a single nop micro-op. +// MOVZ Rd, #0 +// AND Rd, Rzr, #imm +def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; +def WriteImmZ : SchedWriteVariant<[ + SchedVar<WriteZPred, [WriteX]>, + SchedVar<NoSchedPred, [WriteImm]>]>; +def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; + +// Move GPR is a register rename and single nop micro-op. +// ORR Xd, XZR, Xm +// ADD Xd, Xn, #0 +def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; +def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; +def WriteMov : SchedWriteVariant<[ + SchedVar<WriteIMovPred, [WriteX]>, + SchedVar<WriteVMovPred, [WriteX]>, + SchedVar<NoSchedPred, [WriteI]>]>; +def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>; + +// Move non-zero immediate is an integer ALU op. +// MOVN,MOVZ,MOVK +def : WriteRes<WriteImm, [CyUnitI]>; + +//--- +// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional, +// Shifts and Bitfield Operations +//--- + +// ADR,ADRP +// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri +// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr +// ADC(S),SBC(S) +// Aliases: CMN, CMP, TST +// +// Conditional operations. +// CCMNi,CCMPi,CCMNr,CCMPr, +// CSEL,CSINC,CSINV,CSNEG +// +// Bit counting and reversal operations. +// CLS,CLZ,RBIT,REV,REV16,REV32 +def : WriteRes<WriteI, [CyUnitI]>; + +// ADD with shifted register operand is a single micro-op that +// consumes a shift pipeline for two cycles. +// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs +// EXAMPLE: ADDrs Xn, Xm LSL #imm +def : WriteRes<WriteISReg, [CyUnitIS]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +// ADD with extended register operand is the same as shifted reg operand. +// ADD(S)re,SUB(S)re +// EXAMPLE: ADDXre Xn, Xm, UXTB #1 +def : WriteRes<WriteIEReg, [CyUnitIS]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +// Variable shift and bitfield operations. +// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM +def : WriteRes<WriteIS, [CyUnitIS]>; + +// EXTR Shifts a pair of registers and requires two micro-ops. +// The second micro-op is delayed, as modeled by ReadExtrHi. +// EXTR Xn, Xm, #imm +def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// EXTR's first register read is delayed by one cycle, effectively +// shortening its writer's latency. +// EXTR Xn, Xm, #imm +def : ReadAdvance<ReadExtrHi, 1>; + +//--- +// 7.8.6. Multiplies +//--- + +// MUL/MNEG are aliases for MADD/MSUB. +// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL +def : WriteRes<WriteIM32, [CyUnitIM]> { + let Latency = 4; +} +// MADDX,MSUBX,SMULH,UMULH +def : WriteRes<WriteIM64, [CyUnitIM]> { + let Latency = 5; +} + +//--- +// 7.8.7. Divide +//--- + +// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient. +// The ID pipe is consumed for 2 cycles: issue and writeback. +// SDIVW,UDIVW +def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> { + let Latency = 10; + let ResourceCycles = [2, 10]; +} +// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient. +// The ID pipe is consumed for 2 cycles: issue and writeback. +// SDIVX,UDIVX +def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> { + let Latency = 13; + let ResourceCycles = [2, 13]; +} + +//--- +// 7.8.8,7.8.10. Load/Store, single element +//--- + +// Integer loads take 4 cycles and use one LS unit for one cycle. +def : WriteRes<WriteLD, [CyUnitLS]> { + let Latency = 4; +} + +// Store-load forwarding is 4 cycles. +// +// Note: The store-exclusive sequence incorporates this +// latency. 
However, general heuristics should not model the +// dependence between a store and subsequent may-alias load because +// hardware speculation works. +def : WriteRes<WriteST, [CyUnitLS]> { + let Latency = 4; +} + +// Load from base address plus an optionally scaled register offset. +// Rt latency is latency WriteIS + WriteLD. +// EXAMPLE: LDR Xn, Xm [, lsl 3] +def CyWriteLDIdx : SchedWriteVariant<[ + SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register. + SchedVar<NoSchedPred, [WriteLD]>]>; // Load from register offset. +def : SchedAlias<WriteLDIdx, CyWriteLDIdx>; // Map AArch64->Cyclone type. + +// EXAMPLE: STR Xn, Xm [, lsl 3] +def CyWriteSTIdx : SchedWriteVariant<[ + SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register. + SchedVar<NoSchedPred, [WriteST]>]>; // Store to register offset. +def : SchedAlias<WriteSTIdx, CyWriteSTIdx>; // Map AArch64->Cyclone type. + +// Read the (unshifted) base register Xn in the second micro-op one cycle later. +// EXAMPLE: LDR Xn, Xm [, lsl 3] +def ReadBaseRS : SchedReadAdvance<1>; +def CyReadAdrBase : SchedReadVariant<[ + SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset. + SchedVar<NoSchedPred, [ReadDefault]>]>; // Read base reg with no shift. +def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type. + +//--- +// 7.8.9,7.8.11. Load/Store, paired +//--- + +// Address pre/post increment is a simple ALU op with one cycle latency. +def : WriteRes<WriteAdr, [CyUnitI]>; + +// LDP high register write is fused with the load, but a nop micro-op remains. +def : WriteRes<WriteLDHi, []> { + let Latency = 4; +} + +// STP is a vector op and store, except for QQ, which is just two stores. +def : SchedAlias<WriteSTP, WriteVSTShuffle>; +def : InstRW<[WriteST, WriteST], (instrs STPQi)>; + +//--- +// 7.8.13. Branches +//--- + +// Branches take a single micro-op. +// The misprediction penalty is defined as a SchedMachineModel property. +def : WriteRes<WriteBr, [CyUnitB]> {let Latency = 0;} +def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;} + +//--- +// 7.8.14. Never-issued Instructions, Barrier and Hint Operations +//--- + +// NOP,SEV,SEVL,WFE,WFI,YIELD +def : WriteRes<WriteHint, []> {let Latency = 0;} +// ISB +def : InstRW<[WriteI], (instrs ISB)>; +// SLREX,DMB,DSB +def : WriteRes<WriteBarrier, [CyUnitLS]>; + +// System instructions get an invalid latency because the latency of +// other operations across them is meaningless. +def : WriteRes<WriteSys, []> {let Latency = -1;} + +//===----------------------------------------------------------------------===// +// 7.9 Vector Unit Instructions + +// Simple vector operations take 2 cycles. +def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;} + +// Define some longer latency vector op types for Cyclone. +def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;} +def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;} +def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;} +def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;} + +// Simple floating-point operations take 2 cycles. +def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;} + +//--- +// 7.9.1 Vector Moves +//--- + +// TODO: Add Cyclone-specific zero-cycle zeros. 
LLVM currently +// generates expensive int-float conversion instead: +// FMOVDi Dd, #0.0 +// FMOVv2f64ns Vd.2d, #0.0 + +// FMOVSi,FMOVDi +def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;} + +// MOVI,MVNI are WriteV +// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV + +// Move FPR is a register rename and single nop micro-op. +// ORR.16b Vd,Vn,Vn +// COPY is handled above in the WriteMov Variant. +def WriteVMov : SchedWriteVariant<[ + SchedVar<WriteVMovPred, [WriteX]>, + SchedVar<NoSchedPred, [WriteV]>]>; +def : InstRW<[WriteVMov], (instrs ORRv16i8)>; + +// FMOVSr,FMOVDr are WriteF. + +// MOV V,V is a WriteV. + +// CPY D,V[x] is a WriteV + +// INS V[x],V[y] is a WriteV. + +// FMOVWSr,FMOVXDr,FMOVXDHighr +def : WriteRes<WriteFCopy, [CyUnitLS]> { + let Latency = 5; +} + +// FMOVSWr,FMOVDXr +def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; + +// INS V[x],R +def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>; +def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>; + +// SMOV,UMOV R,V[x] +def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>; +def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>; + +// DUP V,R +def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>; + +// DUP V,V[x] is a WriteV. + +//--- +// 7.9.2 Integer Arithmetic, Logical, and Comparisons +//--- + +// BIC,ORR V,#imm are WriteV + +def : InstRW<[CyWriteV3], (instregex "ABSv")>; + +// MVN,NEG,NOT are WriteV + +def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>; + +// ADDP is a WriteV. +def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;} +def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>; + +def : InstRW<[CyWriteV3], + (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>; + +def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>; + +// ADD,SUB are WriteV + +// Forward declare. +def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;} + +// Add/Diff and accumulate uses the vector multiply unit. 
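+// (Illustrative gloss: the SchedReadAdvance defined just below lets the
+// accumulator operand read a CyWriteVAccum/CyWriteVADDLP/CyWriteVABD result
+// one cycle early, so back-to-back accumulates see an effective latency of
+// 2 cycles instead of the full 3.)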
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} +def CyReadVAccum : SchedReadAdvance<1, + [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>; + +def : InstRW<[CyWriteVAccum, CyReadVAccum], + (instregex "SADALP","UADALP")>; + +def : InstRW<[CyWriteVAccum, CyReadVAccum], + (instregex "SABAv","UABAv","SABALv","UABALv")>; + +def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>; + +def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>; + +def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>; + +// WriteV includes: +// AND,BIC,CMTST,EOR,ORN,ORR +// ADDP +// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD +// SADDL,SSUBL,UADDL,USUBL +// SADDW,SSUBW,UADDW,USUBW + +def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv", + "CMLEv","CMLTv", + "CMHIv","CMHSv")>; + +def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv", + "SMAXPv","SMINPv","UMAXPv","UMINPv")>; + +def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv", + "SABDLv","UABDLv")>; + +//--- +// 7.9.3 Floating Point Arithmetic and Comparisons +//--- + +// FABS,FNEG are WriteF + +def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>; +def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>; + +def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i", + "FMINPv2i","FMINNMPv2i")>; + +def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>; + +def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32, + FSUBSrr,FSUBv2f32,FSUBv4f32, + FADDPv2f32,FADDPv4f32, + FABD32,FABDv2f32,FABDv4f32)>; +def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64, + FSUBDrr,FSUBv2f64, + FADDPv2f64, + FABD64,FABDv2f64)>; + +def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>; + +def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT", + "FMAXS","FMAXD","FMAXv", + "FMINS","FMIND","FMINv", + "FMAXNMS","FMAXNMD","FMAXNMv", + "FMINNMS","FMINNMD","FMINNMv", + "FMAXPv2f","FMAXPv4f", + "FMINPv2f","FMINPv4f", + "FMAXNMPv2f","FMAXNMPv4f", + "FMINNMPv2f","FMINNMPv4f")>; + +// FCMP,FCMPE,FCCMP,FCCMPE +def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;} + +// FCSEL is a WriteF. + +//--- +// 7.9.4 Shifts and Bitfield Operations +//--- + +// SHL is a WriteV + +def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;} +def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>; + +def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;} +def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>; + +// Shift and accumulate uses the vector multiply unit. +def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} +def CyReadVShiftAcc : SchedReadAdvance<1, + [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>; +def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// SSHL,USHL are WriteV. + +def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>; + +// SQSHL,SQSHLU,UQSHL are WriteV. + +def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; + +// WriteV includes: +// SHLL,SSHLL,USHLL +// SLI,SRI +// BIF,BIT,BSL +// EXT +// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN +// XTN2 + +def : InstRW<[CyWriteV4], + (instregex "RSHRNv","SHRNv", + "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv", + "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; + +//--- +// 7.9.5 Multiplication +//--- + +def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;} +def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv", + "SQDMULLv","SQDMULHv","SQRDMULHv")>; + +// FMUL,FMULX,FNMUL default to WriteFMul. 
+def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;} + +def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;} +def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed, + FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>; + +def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>; +def : InstRW<[CyWriteVMul, CyReadVMulAcc], + (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL", + "SQDMLAL","SQDMLSL")>; + +def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;} +def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;} +def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>; +def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>; + +def : InstRW<[CyWriteSMul, CyReadSMul], + (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr, + FMLAv2f32,FMLAv4f32, + FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>; +def : InstRW<[CyWriteDMul, CyReadDMul], + (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr, + FMLAv2f64,FMLAv2i64_indexed, + FMLSv2f64,FMLSv2i64_indexed)>; + +def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; } +def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>; + +//--- +// 7.9.6 Divide and Square Root +//--- + +// FDIV,FSQRT +// TODO: Add 64-bit variant with 19 cycle latency. +// TODO: Specialize FSQRT for longer latency. +def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> { + let Latency = 17; + let ResourceCycles = [2, 17]; +} + +def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>; + +def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; } +def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>; + +def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; } +def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; } +def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>; +def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>; + +//--- +// 7.9.7 Integer-FP Conversions +//--- + +// FCVT lengthen f16/s32 +def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; + +// FCVT,FCVTN,FCVTXN +// SCVTF,UCVTF V,V +// FRINT(AIMNPXZ) V,V +def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;} + +// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles. +def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>; +def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>; + +// FCVT Rd, S/D = V6+LD4: 10 cycles +def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>; +def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>; + +// FCVTL is a WriteV + +//--- +// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup +//--- + +def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;} +def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr, + AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr, + SHA1SU0rrr)>; + +def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;} +def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>; + +def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;} +def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr, + SHA256Hrrr,SHA256H2rrr)>; + +// TRN,UZP,ZUP are WriteV. + +// TBL,TBX are WriteV. + +//--- +// 7.9.11-7.9.14 Load/Store, single element and paired +//--- + +// Loading into the vector unit takes 5 cycles vs 4 for integer loads. +def : WriteRes<WriteVLD, [CyUnitLS]> { + let Latency = 5; +} + +// Store-load forwarding is 4 cycles. +def : WriteRes<WriteVST, [CyUnitLS]> { + let Latency = 4; +} + +// WriteVLDPair/VSTPair sequences are expanded by the target description. 
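+
+// Illustrative gloss on the resource modeling above: in a WriteRes such as
+//   def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
+//     let Latency = 17;
+//     let ResourceCycles = [2, 17];
+//   }
+// each ResourceCycles entry gives the number of cycles the corresponding
+// listed resource is held, so the VD pipe is busy for only 2 cycles (issue
+// and writeback) while the unpipelined divide unit is occupied for all 17.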
+ +//--- +// 7.9.15 Load, element operations +//--- + +// Only the first WriteVLD and WriteAdr for writeback matches def operands. +// Subsequent WriteVLDs consume resources. Since all loaded values have the +// same latency, this is acceptable. + +// Vd is read 5 cycles after issuing the vector load. +def : ReadAdvance<ReadVLD, 5>; + +def : InstRW<[WriteVLD], + (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr], + (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +// Register writes from the load's high half are fused micro-ops. +def : InstRW<[WriteVLD], + (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr], + (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD], + (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD], + (instregex "LD1i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr], + (instregex "LD1i(8|16|32)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>; + +def : InstRW<[WriteVLDShuffle], + (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr], + (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[WriteVLDShuffle, WriteV], + (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], + (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle], + (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], + (instregex "LD2i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], + (instregex "LD2i(8|16|32)_POST")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], + (instregex "LD2i64$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], + (instregex "LD2i64_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteV], + (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], + (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], + (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], + (instregex "LD3Threev(8b|4h|2s)_POST")>; +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], + 
(instregex "LD3i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV], + (instregex "LD3i(8|16|32)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV], + (instregex "LD3i64$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], + (instregex "LD3i64_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteV, WriteV], + (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], + (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], + (instrs LD3Rv1d,LD3Rv2d)>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], + (instrs LD3Rv2d_POST,LD3Rv2d_POST)>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], + (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], + (instregex "LD4Fourv(8b|4h|2s)_POST")>; +def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle, + WriteVLDPairShuffle, WriteVLDPairShuffle], + (instregex "LD4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle, + WriteVLDPairShuffle, WriteVLDPairShuffle], + (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV], + (instregex "LD4i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV], + (instregex "LD4i(8|16|32)_POST")>; + + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4i64)>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], + (instrs LD4i64_POST)>; + +def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV], + (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV], + (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4Rv1d,LD4Rv2d)>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4Rv1d_POST,LD4Rv2d_POST)>; + +//--- +// 7.9.16 Store, element operations +//--- + +// Only the WriteAdr for writeback matches a def operands. +// Subsequent WriteVLDs only consume resources. 
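The note above says the extra store writes in these lists only consume resources rather than describing result latencies; a small standalone tally (illustrative only, not LLVM code, with the instrWrites table transcribed from a few LD1/LD2/ST1 entries in this file) shows how many CyUnitLS micro-ops that implies, assuming each WriteVLD/WriteVST, including the one inside a WriteVLDShuffle or WriteVSTShuffle sequence, occupies the load/store unit once per the WriteRes definitions above with default ResourceCycles:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Per-instruction write lists, mirroring InstRW entries in this model.
  const std::map<std::string, std::vector<std::string>> instrWrites = {
      {"LD1Twov16b", {"WriteVLD", "WriteVLD"}},
      {"LD2Twov16b", {"WriteVLDShuffle", "WriteVLDShuffle"}},
      {"ST1Twov16b", {"WriteVST", "WriteVST"}},
      {"ST1Fourv16b", {"WriteVST", "WriteVST", "WriteVST", "WriteVST"}},
  };

  for (const auto &Entry : instrWrites) {
    int LSOps = 0;
    // Count every write that contains a CyUnitLS access (VLD or VST).
    for (const std::string &W : Entry.second)
      if (W.find("VLD") != std::string::npos ||
          W.find("VST") != std::string::npos)
        ++LSOps;
    std::printf("%-12s -> %d load/store micro-op(s)\n", Entry.first.c_str(),
                LSOps);
  }
  return 0;
}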
+ +def : InstRW<[WriteVST], + (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST], + (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], + (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], + (instregex "ST1Twov(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST], + (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST], + (instregex "ST1Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVST], + (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST], + (instregex "ST1Threev(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST, WriteVST], + (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST], + (instregex "ST1Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], + (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST], + (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>; + +def : InstRW<[WriteVSTShuffle], + (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], + (instregex "ST2Twov(8b|4h|2s)_POST")>; +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST2Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>; +def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(8b|4h|2s)_POST")>; +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>; + +def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>; +def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>; + +def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle, + WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle, + WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex 
"ST4i(8|16|32)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; + +//--- +// Unused SchedRead types +//--- + +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; + +} // SchedModel = CycloneModel diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td new file mode 100644 index 0000000..eaa9110 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td @@ -0,0 +1,104 @@ +//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Define TII for use in SchedVariant Predicates. +// const MachineInstr *MI and const TargetSchedModel *SchedModel +// are defined by default. +def : PredicateProlog<[{ + const AArch64InstrInfo *TII = + static_cast<const AArch64InstrInfo*>(SchedModel->getInstrInfo()); + (void)TII; +}]>; + +// AArch64 Scheduler Definitions + +def WriteImm : SchedWrite; // MOVN, MOVZ +// TODO: Provide variants for MOV32/64imm Pseudos that dynamically +// select the correct sequence of WriteImms. + +def WriteI : SchedWrite; // ALU +def WriteISReg : SchedWrite; // ALU of Shifted-Reg +def WriteIEReg : SchedWrite; // ALU of Extended-Reg +def ReadI : SchedRead; // ALU +def ReadISReg : SchedRead; // ALU of Shifted-Reg +def ReadIEReg : SchedRead; // ALU of Extended-Reg +def WriteExtr : SchedWrite; // EXTR shifts a reg pair +def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair +def WriteIS : SchedWrite; // Shift/Scale +def WriteID32 : SchedWrite; // 32-bit Divide +def WriteID64 : SchedWrite; // 64-bit Divide +def ReadID : SchedRead; // 32/64-bit Divide +def WriteIM32 : SchedWrite; // 32-bit Multiply +def WriteIM64 : SchedWrite; // 64-bit Multiply +def ReadIM : SchedRead; // 32/64-bit Multiply +def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate +def WriteBr : SchedWrite; // Branch +def WriteBrReg : SchedWrite; // Indirect Branch + +def WriteLD : SchedWrite; // Load from base addr plus immediate offset +def WriteST : SchedWrite; // Store to base addr plus immediate offset +def WriteSTP : SchedWrite; // Store a register pair. +def WriteAdr : SchedWrite; // Address pre/post increment. + +def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). +def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). +def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. + +// Predicate for determining when a shiftable register is shifted. +def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; + +// Predicate for determining when a extendedable register is extended. +def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; + +// ScaledIdxPred is true if a WriteLDIdx operand will be +// scaled. Subtargets can use this to dynamically select resources and +// latency for WriteLDIdx and ReadAdrBase. +def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; + +// Serialized two-level address load. +// EXAMPLE: LOADGot +def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; + +// Serialized two-level address lookup. 
+// EXAMPLE: MOVaddr... +def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>; + +// The second register of a load-pair. +// LDP,LDPSW,LDNP,LDXP,LDAXP +def WriteLDHi : SchedWrite; + +// Store-exclusive is a store followed by a dependent load. +def WriteSTX : WriteSequence<[WriteST, WriteLD]>; + +def WriteSys : SchedWrite; // Long, variable latency system ops. +def WriteBarrier : SchedWrite; // Memory barrier. +def WriteHint : SchedWrite; // Hint instruction. + +def WriteF : SchedWrite; // General floating-point ops. +def WriteFCmp : SchedWrite; // Floating-point compare. +def WriteFCvt : SchedWrite; // Float conversion. +def WriteFCopy : SchedWrite; // Float-int register copy. +def WriteFImm : SchedWrite; // Floating-point immediate. +def WriteFMul : SchedWrite; // Floating-point multiply. +def WriteFDiv : SchedWrite; // Floating-point division. + +def WriteV : SchedWrite; // Vector ops. +def WriteVLD : SchedWrite; // Vector loads. +def WriteVST : SchedWrite; // Vector stores. + +// Read the unwritten lanes of the VLD's destination registers. +def ReadVLD : SchedRead; + +// Sequential vector load and shuffle. +def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; +def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; + +// Store a shuffled vector. +def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; +def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp new file mode 100644 index 0000000..f402930 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -0,0 +1,53 @@ +//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64SelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-selectiondag-info" + +SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + // Check to see if there is a specialized entry-point for memory zeroing. + ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); + ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size); + const AArch64Subtarget &STI = + DAG.getMachineFunction().getSubtarget<AArch64Subtarget>(); + const char *bzeroEntry = + (V && V->isNullValue()) ? STI.getBZeroEntry() : nullptr; + // For small size (< 256), it is not beneficial to use bzero + // instead of memset. 
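  // As a rough illustration of that cutoff: a constant-size memset(p, 0, 64)
  // keeps the generic memset lowering, while memset(p, 0, 4096), or a zeroing
  // memset whose size is not a compile-time constant, is emitted as a call to
  // bzero whenever the subtarget reports such an entry point (Darwin only, per
  // getBZeroEntry).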
+ if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { + const AArch64TargetLowering &TLI = *STI.getTargetLowering(); + + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0) + .setDiscardResult(); + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; + } + return SDValue(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h new file mode 100644 index 0000000..97421b4 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -0,0 +1,31 @@ +//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AArch64 subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { +public: + + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, + SDValue Dst, SDValue Src, SDValue Size, + unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const override; +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp new file mode 100644 index 0000000..1c6b157 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -0,0 +1,163 @@ +//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies floating point stores that should not be combined into +// store pairs. Later we may do the same for floating point loads. 
+// ===---------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-stp-suppress" + +namespace { +class AArch64StorePairSuppress : public MachineFunctionPass { + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + TargetSchedModel SchedModel; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + +public: + static char ID; + AArch64StorePairSuppress() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "AArch64 Store Pair Suppression"; + } + + bool runOnMachineFunction(MachineFunction &F) override; + +private: + bool shouldAddSTPToBlock(const MachineBasicBlock *BB); + + bool isNarrowFPStore(const MachineInstr &MI); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineTraceMetrics>(); + AU.addPreserved<MachineTraceMetrics>(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64StorePairSuppress::ID = 0; +} // anonymous + +FunctionPass *llvm::createAArch64StorePairSuppressPass() { + return new AArch64StorePairSuppress(); +} + +/// Return true if an STP can be added to this block without increasing the +/// critical resource height. STP is good to form in Ld/St limited blocks and +/// bad to form in float-point limited blocks. This is true independent of the +/// critical path. If the critical path is longer than the resource height, the +/// extra vector ops can limit physreg renaming. Otherwise, it could simply +/// oversaturate the vector units. +bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB); + unsigned ResLength = BBTrace.getResourceLength(); + + // Get the machine model's scheduling class for STPQi. + // Bypass TargetSchedule's SchedClass resolution since we only have an opcode. + unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass(); + const MCSchedClassDesc *SCDesc = + SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); + + // If a subtarget does not define resources for STPQi, bail here. + if (SCDesc->isValid() && !SCDesc->isVariant()) { + unsigned ResLenWithSTP = BBTrace.getResourceLength(None, SCDesc); + if (ResLenWithSTP > ResLength) { + DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber() + << " resources " << ResLength << " -> " << ResLenWithSTP + << "\n"); + return false; + } + } + return true; +} + +/// Return true if this is a floating-point store smaller than the V reg. On +/// cyclone, these require a vector shuffle before storing a pair. +/// Ideally we would call getMatchingPairOpcode() and have the machine model +/// tell us if it's profitable with no cpu knowledge here. +/// +/// FIXME: We plan to develop a decent Target abstraction for simple loads and +/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer. 
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STURSi: + case AArch64::STURDi: + return true; + } +} + +bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { + const TargetSubtargetInfo &ST = MF.getSubtarget(); + TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + MRI = &MF.getRegInfo(); + SchedModel.init(ST.getSchedModel(), &ST, TII); + Traces = &getAnalysis<MachineTraceMetrics>(); + MinInstr = nullptr; + + DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n'); + + if (!SchedModel.hasInstrSchedModel()) { + DEBUG(dbgs() << " Skipping pass: no machine model present.\n"); + return false; + } + + // Check for a sequence of stores to the same base address. We don't need to + // precisely determine whether a store pair can be formed. But we do want to + // filter out most situations where we can't form store pairs to avoid + // computing trace metrics in those cases. + for (auto &MBB : MF) { + bool SuppressSTP = false; + unsigned PrevBaseReg = 0; + for (auto &MI : MBB) { + if (!isNarrowFPStore(MI)) + continue; + unsigned BaseReg; + unsigned Offset; + if (TII->getMemOpBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { + if (PrevBaseReg == BaseReg) { + // If this block can take STPs, skip ahead to the next block. + if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) + break; + // Otherwise, continue unpairing the stores in this block. + DEBUG(dbgs() << "Unpairing store " << MI << "\n"); + SuppressSTP = true; + TII->suppressLdStPair(&MI); + } + PrevBaseReg = BaseReg; + } else + PrevBaseReg = 0; + } + } + // This pass just sets some internal MachineMemOperand flags. It can't really + // invalidate anything. + return false; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp new file mode 100644 index 0000000..f6ee8cf --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -0,0 +1,153 @@ +//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64 specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64PBQPRegAlloc.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-subtarget" + +#define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_TARGET_DESC +#include "AArch64GenSubtargetInfo.inc" + +static cl::opt<bool> +EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " + "converter pass"), cl::init(true), cl::Hidden); + +// If OS supports TBI, use this flag to enable it. 
+static cl::opt<bool> +UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " + "an address is ignored"), cl::init(false), cl::Hidden); + +AArch64Subtarget & +AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { + // Determine default and user-specified characteristics + + if (CPUString.empty()) + CPUString = "generic"; + + ParseSubtargetFeatures(CPUString, FS); + return *this; +} + +AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, + const std::string &FS, + const TargetMachine &TM, bool LittleEndian) + : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), + HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), + HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), + CPUString(CPU), TargetTriple(TT), FrameLowering(), + InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), + TLInfo(TM, *this) {} + +/// ClassifyGlobalReference - Find the target operand flags that describe +/// how a global value should be referenced for the current subtarget. +unsigned char +AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const { + bool isDef = GV->isStrongDefinitionForLinker(); + + // MachO large model always goes via a GOT, simply to get a single 8-byte + // absolute relocation on all global addresses. + if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) + return AArch64II::MO_GOT; + + // The small code mode's direct accesses use ADRP, which cannot necessarily + // produce the value 0 (if the code is above 4GB). + if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) { + // In PIC mode use the GOT, but in absolute mode use a constant pool load. + if (TM.getRelocationModel() == Reloc::Static) + return AArch64II::MO_CONSTPOOL; + else + return AArch64II::MO_GOT; + } + + // If symbol visibility is hidden, the extra load is not needed if + // the symbol is definitely defined in the current translation unit. + + // The handling of non-hidden symbols in PIC mode is rather target-dependent: + // + On MachO, if the symbol is defined in this module the GOT can be + // skipped. + // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually + // defined could end up in unexpected places. Use a GOT. + if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { + if (isTargetMachO()) + return isDef ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; + else + // No need to go through the GOT for local symbols on ELF. + return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; + } + + return AArch64II::MO_NO_FLAG; +} + +/// This function returns the name of a function which has an interface +/// like the non-standard bzero function, if such a function exists on +/// the current subtarget and it is considered prefereable over +/// memset with zero passed as the second argument. Otherwise it +/// returns null. +const char *AArch64Subtarget::getBZeroEntry() const { + // Prefer bzero on Darwin only. + if(isTargetDarwin()) + return "bzero"; + + return nullptr; +} + +void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const { + // LNT run (at least on Cyclone) showed reasonably significant gains for + // bi-directional scheduling. 253.perlbmk. 
+ Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + // Enabling or Disabling the latency heuristic is a close call: It seems to + // help nearly no benchmark on out-of-order architectures, on the other hand + // it regresses register pressure on a few benchmarking. + if (isCyclone()) + Policy.DisableLatencyHeuristic = true; +} + +bool AArch64Subtarget::enableEarlyIfConversion() const { + return EnableEarlyIfConvert; +} + +bool AArch64Subtarget::supportsAddressTopByteIgnored() const { + if (!UseAddressTopByteIgnored) + return false; + + if (TargetTriple.isiOS()) { + unsigned Major, Minor, Micro; + TargetTriple.getiOSVersion(Major, Minor, Micro); + return Major >= 8; + } + + return false; +} + +std::unique_ptr<PBQPRAConstraint> +AArch64Subtarget::getCustomPBQPConstraints() const { + if (!isCortexA57()) + return nullptr; + + return llvm::make_unique<A57ChainingConstraint>(); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h new file mode 100644 index 0000000..1b8b9b2 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -0,0 +1,179 @@ +//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the AArch64 specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H + +#include "AArch64FrameLowering.h" +#include "AArch64ISelLowering.h" +#include "AArch64InstrInfo.h" +#include "AArch64RegisterInfo.h" +#include "AArch64SelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "AArch64GenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class StringRef; +class Triple; + +class AArch64Subtarget : public AArch64GenSubtargetInfo { +protected: + enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone}; + + /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. + ARMProcFamilyEnum ARMProcFamily; + + bool HasV8_1aOps; + bool HasV8_2aOps; + + bool HasFPARMv8; + bool HasNEON; + bool HasCrypto; + bool HasCRC; + bool HasPerfMon; + bool HasFullFP16; + bool HasSPE; + + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. + bool HasZeroCycleRegMove; + + // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. + bool HasZeroCycleZeroing; + + // StrictAlign - Disallow unaligned memory accesses. + bool StrictAlign; + + // ReserveX18 - X18 is not available as a general purpose register. + bool ReserveX18; + + bool IsLittle; + + /// CPUString - String name of used CPU. + std::string CPUString; + + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + AArch64FrameLowering FrameLowering; + AArch64InstrInfo InstrInfo; + AArch64SelectionDAGInfo TSInfo; + AArch64TargetLowering TLInfo; +private: + /// initializeSubtargetDependencies - Initializes using CPUString and the + /// passed in feature string so that we can use initializer lists for + /// subtarget initialization. 
+ AArch64Subtarget &initializeSubtargetDependencies(StringRef FS); + +public: + /// This constructor initializes the data members to match that + /// of the specified triple. + AArch64Subtarget(const Triple &TT, const std::string &CPU, + const std::string &FS, const TargetMachine &TM, + bool LittleEndian); + + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + const AArch64FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const AArch64TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const AArch64RegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } + const Triple &getTargetTriple() const { return TargetTriple; } + bool enableMachineScheduler() const override { return true; } + bool enablePostRAScheduler() const override { + return isGeneric() || isCortexA53() || isCortexA57(); + } + + bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } + + bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } + + bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + + bool requiresStrictAlign() const { return StrictAlign; } + + bool isX18Reserved() const { return ReserveX18; } + bool hasFPARMv8() const { return HasFPARMv8; } + bool hasNEON() const { return HasNEON; } + bool hasCrypto() const { return HasCrypto; } + bool hasCRC() const { return HasCRC; } + /// CPU has TBI (top byte of addresses is ignored during HW address + /// translation) and OS enables it. + bool supportsAddressTopByteIgnored() const; + + bool hasPerfMon() const { return HasPerfMon; } + bool hasFullFP16() const { return HasFullFP16; } + bool hasSPE() const { return HasSPE; } + + bool isLittleEndian() const { return IsLittle; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetIOS() const { return TargetTriple.isiOS(); } + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetWindows() const { return TargetTriple.isOSWindows(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } + + bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + bool isGeneric() const { return CPUString == "generic"; } + bool isCyclone() const { return CPUString == "cyclone"; } + bool isCortexA57() const { return CPUString == "cortex-a57"; } + bool isCortexA53() const { return CPUString == "cortex-a53"; } + + bool useAA() const override { return isCortexA53(); } + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return 64; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// ClassifyGlobalReference - Find the target operand flags that describe + /// how a global value should be referenced for the current subtarget. 
+ unsigned char ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; + + /// This function returns the name of a function which has an interface + /// like the non-standard bzero function, if such a function exists on + /// the current subtarget and it is considered prefereable over + /// memset with zero passed as the second argument. Otherwise it + /// returns null. + const char *getBZeroEntry() const; + + void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const override; + + bool enableEarlyIfConversion() const override; + + std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override; +}; +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp new file mode 100644 index 0000000..c52c554 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -0,0 +1,331 @@ +//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "AArch64TargetObjectFile.h" +#include "AArch64TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar.h" +using namespace llvm; + +static cl::opt<bool> +EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"), + cl::init(true), cl::Hidden); + +static cl::opt<bool> EnableMCR("aarch64-mcr", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + +static cl::opt<bool> +EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"), + cl::init(true), cl::Hidden); + +static cl::opt<bool> +EnableAdvSIMDScalar("aarch64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar" + " integer instructions"), cl::init(false), cl::Hidden); + +static cl::opt<bool> +EnablePromoteConstant("aarch64-promote-const", cl::desc("Enable the promote " + "constant pass"), cl::init(true), cl::Hidden); + +static cl::opt<bool> +EnableCollectLOH("aarch64-collect-loh", cl::desc("Enable the pass that emits the" + " linker optimization hints (LOH)"), cl::init(true), + cl::Hidden); + +static cl::opt<bool> +EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden, + cl::desc("Enable the pass that removes dead" + " definitons and replaces stores to" + " them with stores to the zero" + " register"), + cl::init(true)); + +static cl::opt<bool> +EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" + " optimization pass"), cl::init(true), cl::Hidden); + +static cl::opt<bool> +EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden, + cl::desc("Run SimplifyCFG after expanding atomic operations" + " to make use of cmpxchg flow-based information"), + cl::init(true)); + +static cl::opt<bool> +EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, + cl::desc("Run early 
if-conversion"), + cl::init(true)); + +static cl::opt<bool> +EnableCondOpt("aarch64-condopt", + cl::desc("Enable the condition optimizer pass"), + cl::init(true), cl::Hidden); + +static cl::opt<bool> +EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden, + cl::desc("Work around Cortex-A53 erratum 835769"), + cl::init(false)); + +static cl::opt<bool> +EnableGEPOpt("aarch64-gep-opt", cl::Hidden, + cl::desc("Enable optimizations on complex GEPs"), + cl::init(false)); + +// FIXME: Unify control over GlobalMerge. +static cl::opt<cl::boolOrDefault> +EnableGlobalMerge("aarch64-global-merge", cl::Hidden, + cl::desc("Enable the global merge pass")); + +extern "C" void LLVMInitializeAArch64Target() { + // Register the target. + RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget); + RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget); + RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target); +} + +//===----------------------------------------------------------------------===// +// AArch64 Lowering public interface. +//===----------------------------------------------------------------------===// +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) + return make_unique<AArch64_MachoTargetObjectFile>(); + + return make_unique<AArch64_ELFTargetObjectFile>(); +} + +// Helper function to build a DataLayout string +static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { + if (TT.isOSBinFormatMachO()) + return "e-m:o-i64:64-i128:128-n32:64-S128"; + if (LittleEndian) + return "e-m:e-i64:64-i128:128-n32:64-S128"; + return "E-m:e-i64:64-i128:128-n32:64-S128"; +} + +/// TargetMachine ctor - Create an AArch64 architecture model. +/// +AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, + bool LittleEndian) + // This nested ternary is horrible, but DL needs to be properly + // initialized before TLInfo is constructed. + : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS, + Options, RM, CM, OL), + TLOF(createTLOF(getTargetTriple())), + isLittle(LittleEndian) { + initAsmInfo(); +} + +AArch64TargetMachine::~AArch64TargetMachine() {} + +const AArch64Subtarget * +AArch64TargetMachine::getSubtargetImpl(const Function &F) const { + Attribute CPUAttr = F.getFnAttribute("target-cpu"); + Attribute FSAttr = F.getFnAttribute("target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? FSAttr.getValueAsString().str() + : TargetFS; + + auto &I = SubtargetMap[CPU + FS]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. 
+ resetTargetOptions(F); + I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, + isLittle); + } + return I.get(); +} + +void AArch64leTargetMachine::anchor() { } + +AArch64leTargetMachine::AArch64leTargetMachine( + const Target &T, const Triple &TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + +void AArch64beTargetMachine::anchor() { } + +AArch64beTargetMachine::AArch64beTargetMachine( + const Target &T, const Triple &TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + +namespace { +/// AArch64 Code Generator Pass Configuration Options. +class AArch64PassConfig : public TargetPassConfig { +public: + AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) { + if (TM->getOptLevel() != CodeGenOpt::None) + substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); + } + + AArch64TargetMachine &getAArch64TargetMachine() const { + return getTM<AArch64TargetMachine>(); + } + + void addIRPasses() override; + bool addPreISel() override; + bool addInstSelector() override; + bool addILPOpts() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; +} // namespace + +TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(AArch64TTIImpl(this, F)); + }); +} + +TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { + return new AArch64PassConfig(this, PM); +} + +void AArch64PassConfig::addIRPasses() { + // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg + // ourselves. + addPass(createAtomicExpandPass(TM)); + + // Cmpxchg instructions are often used with a subsequent comparison to + // determine whether it succeeded. We can exploit existing control-flow in + // ldrex/strex loops to simplify this, but it needs tidying up. + if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) + addPass(createCFGSimplificationPass()); + + TargetPassConfig::addIRPasses(); + + // Match interleaved memory accesses to ldN/stN intrinsics. + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createInterleavedAccessPass(TM)); + + if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { + // Call SeparateConstOffsetFromGEP pass to extract constants within indices + // and lower a GEP with multiple indices to either arithmetic operations or + // multiple GEPs with single index. + addPass(createSeparateConstOffsetFromGEPPass(TM, true)); + // Call EarlyCSE pass to find and remove subexpressions in the lowered + // result. + addPass(createEarlyCSEPass()); + // Do loop invariant code motion in case part of the lowered result is + // invariant. + addPass(createLICMPass()); + } +} + +// Pass Pipeline Configuration +bool AArch64PassConfig::addPreISel() { + // Run promote constant before global merge, so that the promoted constants + // get a chance to be merged + if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) + addPass(createAArch64PromoteConstantPass()); + // FIXME: On AArch64, this depends on the type. + // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). 
+ // and the offset has to be a multiple of the related size in bytes. + if ((TM->getOptLevel() != CodeGenOpt::None && + EnableGlobalMerge == cl::BOU_UNSET) || + EnableGlobalMerge == cl::BOU_TRUE) { + bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && + (EnableGlobalMerge == cl::BOU_UNSET); + addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize)); + } + + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64AddressTypePromotionPass()); + + return false; +} + +bool AArch64PassConfig::addInstSelector() { + addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many + // references to _TLS_MODULE_BASE_ as possible. + if (TM->getTargetTriple().isOSBinFormatELF() && + getOptLevel() != CodeGenOpt::None) + addPass(createAArch64CleanupLocalDynamicTLSPass()); + + return false; +} + +bool AArch64PassConfig::addILPOpts() { + if (EnableCondOpt) + addPass(createAArch64ConditionOptimizerPass()); + if (EnableCCMP) + addPass(createAArch64ConditionalCompares()); + if (EnableMCR) + addPass(&MachineCombinerID); + if (EnableEarlyIfConversion) + addPass(&EarlyIfConverterID); + if (EnableStPairSuppress) + addPass(createAArch64StorePairSuppressPass()); + return true; +} + +void AArch64PassConfig::addPreRegAlloc() { + // Use AdvSIMD scalar instructions whenever profitable. + if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) { + addPass(createAArch64AdvSIMDScalar()); + // The AdvSIMD pass may produce copies that can be rewritten to + // be register coaleascer friendly. + addPass(&PeepholeOptimizerID); + } +} + +void AArch64PassConfig::addPostRegAlloc() { + // Change dead register definitions to refer to the zero register. + if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) + addPass(createAArch64DeadRegisterDefinitions()); + if (TM->getOptLevel() != CodeGenOpt::None && usingDefaultRegAlloc()) + // Improve performance for some FP/SIMD code for A57. + addPass(createAArch64A57FPLoadBalancing()); +} + +void AArch64PassConfig::addPreSched2() { + // Expand some pseudo instructions to allow proper scheduling. + addPass(createAArch64ExpandPseudoPass()); + // Use load/store pair instructions when possible. + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); +} + +void AArch64PassConfig::addPreEmitPass() { + if (EnableA53Fix835769) + addPass(createAArch64A53Fix835769()); + // Relax conditional branch instructions if they're otherwise out of + // range of their destination. + addPass(createAArch64BranchRelaxation()); + if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && + TM->getTargetTriple().isOSBinFormatMachO()) + addPass(createAArch64CollectLOHPass()); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h new file mode 100644 index 0000000..8d49a29 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -0,0 +1,76 @@ +//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the AArch64 specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AArch64TargetMachine : public LLVMTargetMachine { +protected: + std::unique_ptr<TargetLoweringObjectFile> TLOF; + mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap; + +public: + AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool IsLittleEndian); + + ~AArch64TargetMachine() override; + const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; + + // Pass Pipeline Configuration + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + + /// \brief Get the TargetIRAnalysis for this target. + TargetIRAnalysis getTargetIRAnalysis() override; + + TargetLoweringObjectFile* getObjFileLowering() const override { + return TLOF.get(); + } + +private: + bool isLittle; +}; + +// AArch64leTargetMachine - AArch64 little endian target machine. +// +class AArch64leTargetMachine : public AArch64TargetMachine { + virtual void anchor(); +public: + AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +// AArch64beTargetMachine - AArch64 big endian target machine. +// +class AArch64beTargetMachine : public AArch64TargetMachine { + virtual void anchor(); +public: + AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp new file mode 100644 index 0000000..18ee4a9 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -0,0 +1,73 @@ +//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetObjectFile.h" +#include "AArch64TargetMachine.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/Dwarf.h" +using namespace llvm; +using namespace dwarf; + +void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +AArch64_MachoTargetObjectFile::AArch64_MachoTargetObjectFile() + : TargetLoweringObjectFileMachO() { + SupportGOTPCRelWithOffset = false; +} + +const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( + const GlobalValue *GV, unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, MachineModuleInfo *MMI, + MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. 
The default implementation + // won't reference using the GOT, so we need this target-specific + // version. + if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + const MCSymbol *Sym = TM.getSymbol(GV, Mang); + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbol *PCSym = getContext().createTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); + return MCBinaryExpr::createSub(Res, PC, getContext()); + } + + return TargetLoweringObjectFileMachO::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); +} + +MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( + const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI) const { + return TM.getSymbol(GV, Mang); +} + +const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, MCStreamer &Streamer) const { + assert((Offset+MV.getConstant() == 0) && + "Arch64 does not support GOT PC rel with extra offset"); + // On ARM64 Darwin, we can reference symbols with foo@GOT-., which + // is an indirect pc-relative reference. + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbol *PCSym = getContext().createTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); + return MCBinaryExpr::createSub(Res, PC, getContext()); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h new file mode 100644 index 0000000..d41f445 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -0,0 +1,47 @@ +//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { +class AArch64TargetMachine; + +/// This implementation is used for AArch64 ELF targets (Linux in particular). +class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; +}; + +/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. 
+class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { +public: + AArch64_MachoTargetObjectFile(); + + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, + unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + + MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI) const override; + + const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp new file mode 100644 index 0000000..9af0e64 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -0,0 +1,573 @@ +//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetTransformInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +#include <algorithm> +using namespace llvm; + +#define DEBUG_TYPE "aarch64tti" + +/// \brief Calculate the cost of materializing a 64-bit value. This helper +/// method might only calculate a fraction of a larger immediate. Therefore it +/// is valid to return a cost of ZERO. +int AArch64TTIImpl::getIntImmCost(int64_t Val) { + // Check if the immediate can be encoded within an instruction. + if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) + return 0; + + if (Val < 0) + Val = ~Val; + + // Calculate how many moves we will need to materialize this constant. + unsigned LZ = countLeadingZeros((uint64_t)Val); + return (64 - LZ + 15) / 16; +} + +/// \brief Calculate the cost of materializing the given constant. +int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + // Sign-extend all constants to a multiple of 64-bit. + APInt ImmVal = Imm; + if (BitSize & 0x3f) + ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); + + // Split the constant into 64-bit chunks and calculate the cost for each + // chunk. + int Cost = 0; + for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { + APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); + int64_t Val = Tmp.getSExtValue(); + Cost += getIntImmCost(Val); + } + // We need at least one instruction to materialze the constant. + return std::max(1, Cost); +} + +int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. 
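  // Worked example of the per-chunk cost computed by getIntImmCost(int64_t)
  // above: 0x00000000ABCD1234 has 32 leading zeros, so (64 - 32 + 15) / 16
  // gives 2, i.e. a MOVZ plus one MOVK. By contrast, 0x00FF00FF00FF00FF is a
  // valid 64-bit logical immediate and therefore costs 0, since it can be
  // encoded directly (for example as an ORR with the zero register).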
+ if (BitSize == 0) + return TTI::TCC_Free; + + unsigned ImmIdx = ~0U; + switch (Opcode) { + default: + return TTI::TCC_Free; + case Instruction::GetElementPtr: + // Always hoist the base address of a GetElementPtr. + if (Idx == 0) + return 2 * TTI::TCC_Basic; + return TTI::TCC_Free; + case Instruction::Store: + ImmIdx = 0; + break; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + ImmIdx = 1; + break; + // Always return TCC_Free for the shift value of a shift instruction. + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + if (Idx == 1) + return TTI::TCC_Free; + break; + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::IntToPtr: + case Instruction::PtrToInt: + case Instruction::BitCast: + case Instruction::PHI: + case Instruction::Call: + case Instruction::Select: + case Instruction::Ret: + case Instruction::Load: + break; + } + + if (Idx == ImmIdx) { + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TTI::TCC_Basic) + ? static_cast<int>(TTI::TCC_Free) + : Cost; + } + return AArch64TTIImpl::getIntImmCost(Imm, Ty); +} + +int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TTI::TCC_Free; + + switch (IID) { + default: + return TTI::TCC_Free; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + if (Idx == 1) { + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TTI::TCC_Basic) + ? static_cast<int>(TTI::TCC_Free) + : Cost; + } + break; + case Intrinsic::experimental_stackmap: + if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; + case Intrinsic::experimental_patchpoint_void: + case Intrinsic::experimental_patchpoint_i64: + if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; + } + return AArch64TTIImpl::getIntImmCost(Imm, Ty); +} + +TargetTransformInfo::PopcntSupportKind +AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + if (TyWidth == 32 || TyWidth == 64) + return TTI::PSK_FastHardware; + // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. 
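+  // Illustrative note (added, hedged): widths other than 32/64 (e.g. an i16 or
+  // i128 ctpop) fall through to the software answer here, so passes that form
+  // popcount idioms treat those widths as expensive until the 128-bit lowering
+  // mentioned above is modelled.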
+ return TTI::PSK_Software; +} + +int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + + static const TypeConversionCostTblEntry + ConversionTbl[] = { + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + + // The number of shll instructions for the extension. + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + + // LowerVectorINT_TO_FP: + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + + // Complex: to v2f32 + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, + + // Complex: to v4f32 + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + + // Complex: to v8f32 + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + + // Complex: to v16f32 + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + + // Complex: to v2f64 + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + + + // LowerVectorFP_TO_INT + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + + // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). 
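+    // Illustrative note (added, hedged): e.g. an fptosi <2 x float> -> <2 x i64>
+    // is expected to lower to fcvtl (widen to v2f64) followed by fcvtzs, which
+    // matches the cost of 2 recorded below.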
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, + { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, + { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, + + // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 + { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, + + // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, + }; + + if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + return BaseT::getCastInstrCost(Opcode, Dst, Src); +} + +int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { + assert(Val->isVectorTy() && "This must be a vector type"); + + if (Index != -1U) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + + // This type is legalized to a scalar type. + if (!LT.second.isVector()) + return 0; + + // The type may be split. Normalize the index to the new type. + unsigned Width = LT.second.getVectorNumElements(); + Index = Index % Width; + + // The element at index zero is already inside the vector. + if (Index == 0) + return 0; + } + + // All other insert/extracts cost this much. + return 3; +} + +int AArch64TTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, + TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + + if (ISD == ISD::SDIV && + Opd2Info == TargetTransformInfo::OK_UniformConstantValue && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + // On AArch64, scalar signed division by constants power-of-two are + // normally expanded to the sequence ADD + CMP + SELECT + SRA. + // The OperandValue properties many not be same as that of previous + // operation; conservatively assume OP_None. + int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + return Cost; + } + + switch (ISD) { + default: + return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); + case ISD::ADD: + case ISD::MUL: + case ISD::XOR: + case ISD::OR: + case ISD::AND: + // These nodes are marked as 'custom' for combining purposes only. + // We know that they are legal. See LowerAdd in ISelLowering. 
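+    // Illustrative note (added, hedged): LT.first counts how many legal-width
+    // pieces the type splits into, so e.g. an add on v4i32 costs 1 while an add
+    // on v8i32 (split into two v4i32 halves on the 128-bit NEON register file)
+    // costs 2.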
+ return 1 * LT.first; + } +} + +int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. + unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + // In many cases the address computation is not merged into the instruction + // addressing mode. + return 1; +} + +int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) { + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + // We don't lower some vector selects well that are wider than the register + // width. + if (ValTy->isVectorTy() && ISD == ISD::SELECT) { + // We would need this many instructions to hide the scalarization happening. + const int AmortizationCost = 20; + static const TypeConversionCostTblEntry + VectorSelectTbl[] = { + { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, + { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, + { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, + { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, + { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } + }; + + EVT SelCondTy = TLI->getValueType(DL, CondTy); + EVT SelValTy = TLI->getValueType(DL, ValTy); + if (SelCondTy.isSimple() && SelValTy.isSimple()) { + if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; + } + } + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + + if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && + Src->getVectorElementType()->isIntegerTy(64)) { + // Unaligned stores are extremely inefficient. We don't split + // unaligned v2i64 stores because the negative impact that has shown in + // practice on inlined memcpy code. + // We make v2i64 stores expensive so that we will only vectorize if there + // are 6 other instructions getting vectorized. + int AmortizationCost = 6; + + return LT.first * 2 * AmortizationCost; + } + + if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && + Src->getVectorNumElements() < 8) { + // We scalarize the loads/stores because there is not v.4b register and we + // have to promote the elements to v.4h. + unsigned NumVecElts = Src->getVectorNumElements(); + unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; + // We generate 2 instructions per vector element. 
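+    // Illustrative worked example (added, hedged): for a v4i8 access,
+    // NumVecElts = 4 and NumVectorizableInstsToAmortize = 8, so the returned
+    // cost is 8 * 4 * 2 = 64, which strongly discourages vectorizing such
+    // narrow loads/stores unless plenty of other work is vectorized alongside.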
+ return NumVectorizableInstsToAmortize * NumVecElts * 2; + } + + return LT.first; +} + +int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + assert(Factor >= 2 && "Invalid interleave factor"); + assert(isa<VectorType>(VecTy) && "Expect a vector type"); + + if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { + unsigned NumElts = VecTy->getVectorNumElements(); + Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + + // ldN/stN only support legal vector types of size 64 or 128 in bits. + if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) + return Factor; + } + + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} + +int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { + int Cost = 0; + for (auto *I : Tys) { + if (!I->isVectorTy()) + continue; + if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128) + Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) + + getMemoryOpCost(Instruction::Load, I, 128, 0); + } + return Cost; +} + +unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { + if (ST->isCortexA57()) + return 4; + return 2; +} + +void AArch64TTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + // Enable partial unrolling and runtime unrolling. + BaseT::getUnrollingPreferences(L, UP); + + // For inner loop, it is more likely to be a hot one, and the runtime check + // can be promoted out from LICM pass, so the overhead is less, let's try + // a larger threshold to unroll more loops. + if (L->getLoopDepth() > 1) + UP.PartialThreshold *= 2; + + // Disable partial & runtime unrolling on -Os. 
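+  // Illustrative note (added, hedged): with default thresholds, a loop at depth
+  // >= 2 gets twice the usual partial-unrolling budget from the doubling above,
+  // while the zero opt-size threshold below keeps -Os builds from partially
+  // unrolling at all.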
+ UP.PartialOptSizeThreshold = 0; +} + +Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, + Type *ExpectedType) { + switch (Inst->getIntrinsicID()) { + default: + return nullptr; + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: { + // Create a struct type + StructType *ST = dyn_cast<StructType>(ExpectedType); + if (!ST) + return nullptr; + unsigned NumElts = Inst->getNumArgOperands() - 1; + if (ST->getNumElements() != NumElts) + return nullptr; + for (unsigned i = 0, e = NumElts; i != e; ++i) { + if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) + return nullptr; + } + Value *Res = UndefValue::get(ExpectedType); + IRBuilder<> Builder(Inst); + for (unsigned i = 0, e = NumElts; i != e; ++i) { + Value *L = Inst->getArgOperand(i); + Res = Builder.CreateInsertValue(Res, L, i); + } + return Res; + } + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + if (Inst->getType() == ExpectedType) + return Inst; + return nullptr; + } +} + +bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, + MemIntrinsicInfo &Info) { + switch (Inst->getIntrinsicID()) { + default: + break; + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + Info.ReadMem = true; + Info.WriteMem = false; + Info.IsSimple = true; + Info.NumMemRefs = 1; + Info.PtrVal = Inst->getArgOperand(0); + break; + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + Info.ReadMem = false; + Info.WriteMem = true; + Info.IsSimple = true; + Info.NumMemRefs = 1; + Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); + break; + } + + switch (Inst->getIntrinsicID()) { + default: + return false; + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_st2: + Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; + break; + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_st3: + Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; + break; + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_st4: + Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; + break; + } + return true; +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h new file mode 100644 index 0000000..ec58c4f --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -0,0 +1,135 @@ +//===-- AArch64TargetTransformInfo.h - AArch64 specific TTI -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// AArch64 target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" +#include <algorithm> + +namespace llvm { + +class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { + typedef BasicTTIImplBase<AArch64TTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const AArch64Subtarget *ST; + const AArch64TargetLowering *TLI; + + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + + const AArch64Subtarget *getST() const { return ST; } + const AArch64TargetLowering *getTLI() const { return TLI; } + + enum MemIntrinsicType { + VECTOR_LDST_TWO_ELEMENTS, + VECTOR_LDST_THREE_ELEMENTS, + VECTOR_LDST_FOUR_ELEMENTS + }; + +public: + explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + AArch64TTIImpl(const AArch64TTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + AArch64TTIImpl(AArch64TTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + + /// \name Scalar TTI Implementations + /// @{ + + using BaseT::getIntImmCost; + int getIntImmCost(int64_t Val); + int getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + bool enableInterleavedAccessVectorization() { return true; } + + unsigned getNumberOfRegisters(bool Vector) { + if (Vector) { + if (ST->hasNEON()) + return 32; + return 0; + } + return 31; + } + + unsigned getRegisterBitWidth(bool Vector) { + if (Vector) { + if (ST->hasNEON()) + return 128; + return 0; + } + return 64; + } + + unsigned getMaxInterleaveFactor(unsigned VF); + + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + + int getAddressComputationCost(Type *Ty, bool IsComplex); + + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + + int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, + Type *ExpectedType); + + bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); + + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + 
ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp new file mode 100644 index 0000000..394c8e7 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -0,0 +1,4615 @@ +//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64TargetStreamer.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdio> +using namespace llvm; + +namespace { + +class AArch64Operand; + +class AArch64AsmParser : public MCTargetAsmParser { +private: + StringRef Mnemonic; ///< Instruction mnemonic. + + // Map of register aliases registers via the .req directive. 
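+  // Illustrative note (added, hedged): an assembler alias such as
+  // "foo .req x4" is recorded here roughly as "foo" -> {false, X4}, with the
+  // bool marking vector-register aliases (e.g. "bar .req v0").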
+ StringMap<std::pair<bool, unsigned> > RegisterReqs; + + AArch64TargetStreamer &getTargetStreamer() { + MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); + return static_cast<AArch64TargetStreamer &>(TS); + } + + SMLoc getLoc() const { return getParser().getTok().getLoc(); } + + bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); + AArch64CC::CondCode parseCondCodeString(StringRef Cond); + bool parseCondCode(OperandVector &Operands, bool invertCondCode); + unsigned matchRegisterNameAlias(StringRef Name, bool isVector); + int tryParseRegister(); + int tryMatchVectorRegister(StringRef &Kind, bool expected); + bool parseRegister(OperandVector &Operands); + bool parseSymbolicImmVal(const MCExpr *&ImmVal); + bool parseVectorList(OperandVector &Operands); + bool parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode); + + void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); } + bool showMatchError(SMLoc Loc, unsigned ErrCode); + + bool parseDirectiveWord(unsigned Size, SMLoc L); + bool parseDirectiveInst(SMLoc L); + + bool parseDirectiveTLSDescCall(SMLoc L); + + bool parseDirectiveLOH(StringRef LOH, SMLoc L); + bool parseDirectiveLtorg(SMLoc L); + + bool parseDirectiveReq(StringRef Name, SMLoc L); + bool parseDirectiveUnreq(SMLoc L); + + bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc); + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; +/// @name Auto-generated Match Functions +/// { + +#define GET_ASSEMBLER_HEADER +#include "AArch64GenAsmMatcher.inc" + + /// } + + OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); + OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); + OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); + OperandMatchResultTy tryParseSysReg(OperandVector &Operands); + OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); + OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParsePSBHint(OperandVector &Operands); + OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); + OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); + OperandMatchResultTy tryParseFPImm(OperandVector &Operands); + OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); + OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); + bool tryParseVectorRegister(OperandVector &Operands); + OperandMatchResultTy tryParseGPRSeqPair(OperandVector &Operands); + +public: + enum AArch64MatchResultTy { + Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "AArch64GenAsmMatcher.inc" + }; + AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI) { + MCAsmParserExtension::Initialize(Parser); + MCStreamer &S = getParser().getStreamer(); + if (S.getTargetStreamer() == nullptr) + new AArch64TargetStreamer(S); + + // Initialize the set of available features. 
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + } + + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + bool ParseDirective(AsmToken DirectiveID) override; + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) override; + + static bool classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::VariantKind &ELFRefKind, + MCSymbolRefExpr::VariantKind &DarwinRefKind, + int64_t &Addend); +}; +} // end anonymous namespace + +namespace { + +/// AArch64Operand - Instances of this class represent a parsed AArch64 machine +/// instruction. +class AArch64Operand : public MCParsedAsmOperand { +private: + enum KindTy { + k_Immediate, + k_ShiftedImm, + k_CondCode, + k_Register, + k_VectorList, + k_VectorIndex, + k_Token, + k_SysReg, + k_SysCR, + k_Prefetch, + k_ShiftExtend, + k_FPImm, + k_Barrier, + k_PSBHint, + } Kind; + + SMLoc StartLoc, EndLoc; + + struct TokOp { + const char *Data; + unsigned Length; + bool IsSuffix; // Is the operand actually a suffix on the mnemonic. + }; + + struct RegOp { + unsigned RegNum; + bool isVector; + }; + + struct VectorListOp { + unsigned RegNum; + unsigned Count; + unsigned NumElements; + unsigned ElementKind; + }; + + struct VectorIndexOp { + unsigned Val; + }; + + struct ImmOp { + const MCExpr *Val; + }; + + struct ShiftedImmOp { + const MCExpr *Val; + unsigned ShiftAmount; + }; + + struct CondCodeOp { + AArch64CC::CondCode Code; + }; + + struct FPImmOp { + unsigned Val; // Encoded 8-bit representation. + }; + + struct BarrierOp { + unsigned Val; // Not the enum since not all values have names. + const char *Data; + unsigned Length; + }; + + struct SysRegOp { + const char *Data; + unsigned Length; + uint32_t MRSReg; + uint32_t MSRReg; + uint32_t PStateField; + }; + + struct SysCRImmOp { + unsigned Val; + }; + + struct PrefetchOp { + unsigned Val; + const char *Data; + unsigned Length; + }; + + struct PSBHintOp { + unsigned Val; + const char *Data; + unsigned Length; + }; + + struct ShiftExtendOp { + AArch64_AM::ShiftExtendType Type; + unsigned Amount; + bool HasExplicitAmount; + }; + + struct ExtendOp { + unsigned Val; + }; + + union { + struct TokOp Tok; + struct RegOp Reg; + struct VectorListOp VectorList; + struct VectorIndexOp VectorIndex; + struct ImmOp Imm; + struct ShiftedImmOp ShiftedImm; + struct CondCodeOp CondCode; + struct FPImmOp FPImm; + struct BarrierOp Barrier; + struct SysRegOp SysReg; + struct SysCRImmOp SysCRImm; + struct PrefetchOp Prefetch; + struct PSBHintOp PSBHint; + struct ShiftExtendOp ShiftExtend; + }; + + // Keep the MCContext around as the MCExprs may need manipulated during + // the add<>Operands() calls. 
+ MCContext &Ctx; + +public: + AArch64Operand(KindTy K, MCContext &Ctx) : Kind(K), Ctx(Ctx) {} + + AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case k_Token: + Tok = o.Tok; + break; + case k_Immediate: + Imm = o.Imm; + break; + case k_ShiftedImm: + ShiftedImm = o.ShiftedImm; + break; + case k_CondCode: + CondCode = o.CondCode; + break; + case k_FPImm: + FPImm = o.FPImm; + break; + case k_Barrier: + Barrier = o.Barrier; + break; + case k_Register: + Reg = o.Reg; + break; + case k_VectorList: + VectorList = o.VectorList; + break; + case k_VectorIndex: + VectorIndex = o.VectorIndex; + break; + case k_SysReg: + SysReg = o.SysReg; + break; + case k_SysCR: + SysCRImm = o.SysCRImm; + break; + case k_Prefetch: + Prefetch = o.Prefetch; + break; + case k_PSBHint: + PSBHint = o.PSBHint; + break; + case k_ShiftExtend: + ShiftExtend = o.ShiftExtend; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const override { return EndLoc; } + + StringRef getToken() const { + assert(Kind == k_Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + bool isTokenSuffix() const { + assert(Kind == k_Token && "Invalid access!"); + return Tok.IsSuffix; + } + + const MCExpr *getImm() const { + assert(Kind == k_Immediate && "Invalid access!"); + return Imm.Val; + } + + const MCExpr *getShiftedImmVal() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.Val; + } + + unsigned getShiftedImmShift() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.ShiftAmount; + } + + AArch64CC::CondCode getCondCode() const { + assert(Kind == k_CondCode && "Invalid access!"); + return CondCode.Code; + } + + unsigned getFPImm() const { + assert(Kind == k_FPImm && "Invalid access!"); + return FPImm.Val; + } + + unsigned getBarrier() const { + assert(Kind == k_Barrier && "Invalid access!"); + return Barrier.Val; + } + + StringRef getBarrierName() const { + assert(Kind == k_Barrier && "Invalid access!"); + return StringRef(Barrier.Data, Barrier.Length); + } + + unsigned getReg() const override { + assert(Kind == k_Register && "Invalid access!"); + return Reg.RegNum; + } + + unsigned getVectorListStart() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.RegNum; + } + + unsigned getVectorListCount() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.Count; + } + + unsigned getVectorIndex() const { + assert(Kind == k_VectorIndex && "Invalid access!"); + return VectorIndex.Val; + } + + StringRef getSysReg() const { + assert(Kind == k_SysReg && "Invalid access!"); + return StringRef(SysReg.Data, SysReg.Length); + } + + unsigned getSysCR() const { + assert(Kind == k_SysCR && "Invalid access!"); + return SysCRImm.Val; + } + + unsigned getPrefetch() const { + assert(Kind == k_Prefetch && "Invalid access!"); + return Prefetch.Val; + } + + unsigned getPSBHint() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return PSBHint.Val; + } + + StringRef getPSBHintName() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return StringRef(PSBHint.Data, PSBHint.Length); + } + + StringRef getPrefetchName() const { + assert(Kind == k_Prefetch && "Invalid access!"); + return StringRef(Prefetch.Data, 
Prefetch.Length); + } + + AArch64_AM::ShiftExtendType getShiftExtendType() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Type; + } + + unsigned getShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Amount; + } + + bool hasShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.HasExplicitAmount; + } + + bool isImm() const override { return Kind == k_Immediate; } + bool isMem() const override { return false; } + bool isSImm9() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val < 256); + } + bool isSImm7s4() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val <= 252 && (Val & 3) == 0); + } + bool isSImm7s8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -512 && Val <= 504 && (Val & 7) == 0); + } + bool isSImm7s16() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); + } + + bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const { + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, + Addend)) { + // If we don't understand the expression, assume the best and + // let the fixup and relocation code deal with it. + return true; + } + + if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) { + // Note that we don't range-check the addend. It's adjusted modulo page + // size when converted, so there is no "out of range" condition when using + // @pageoff. + return Addend >= 0 && (Addend % Scale) == 0; + } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { + // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. 
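+      // Illustrative note (added, hedged): e.g. "ldr x0, [x8, _var@GOTPAGEOFF]"
+      // is accepted (addend 0), while "_var@GOTPAGEOFF + 4" is rejected here;
+      // "_var@PAGEOFF + 4" remains legal through the branch above, since page
+      // offsets are only checked modulo the page size.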
+ return Addend == 0; + } + + return false; + } + + template <int Scale> bool isUImm12Offset() const { + if (!isImm()) + return false; + + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return isSymbolicUImm12Offset(getImm(), Scale); + + int64_t Val = MCE->getValue(); + return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; + } + + bool isImm0_1() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 2); + } + bool isImm0_7() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 8); + } + bool isImm1_8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 9); + } + bool isImm0_15() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 16); + } + bool isImm1_16() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 17); + } + bool isImm0_31() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 32); + } + bool isImm1_31() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 32); + } + bool isImm1_32() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 33); + } + bool isImm0_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 64); + } + bool isImm1_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 64); + } + bool isImm1_64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 65); + } + bool isImm0_127() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 128); + } + bool isImm0_255() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 256); + } + bool isImm0_65535() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 65536); + } + bool isImm32_63() const { + if (!isImm()) + return 
false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 32 && Val < 64); + } + bool isLogicalImm32() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + if (Val >> 32 != 0 && Val >> 32 != ~0LL) + return false; + Val &= 0xFFFFFFFF; + return AArch64_AM::isLogicalImmediate(Val, 32); + } + bool isLogicalImm64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); + } + bool isLogicalImm32Not() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = ~MCE->getValue() & 0xFFFFFFFF; + return AArch64_AM::isLogicalImmediate(Val, 32); + } + bool isLogicalImm64Not() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(~MCE->getValue(), 64); + } + bool isShiftedImm() const { return Kind == k_ShiftedImm; } + bool isAddSubImm() const { + if (!isShiftedImm() && !isImm()) + return false; + + const MCExpr *Expr; + + // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. + if (isShiftedImm()) { + unsigned Shift = ShiftedImm.ShiftAmount; + Expr = ShiftedImm.Val; + if (Shift != 0 && Shift != 12) + return false; + } else { + Expr = getImm(); + } + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, + DarwinRefKind, Addend)) { + return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF + || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF + || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) + || ELFRefKind == AArch64MCExpr::VK_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12; + } + + // Otherwise it should be a real immediate in range: + const MCConstantExpr *CE = cast<MCConstantExpr>(Expr); + return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + } + bool isAddSubImmNeg() const { + if (!isShiftedImm() && !isImm()) + return false; + + const MCExpr *Expr; + + // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. 
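+    // Illustrative note (added, hedged): a plain negative immediate such as
+    // "add x0, x1, #-16" reaches this path (the matcher can flip it to a
+    // subtract), and a shifted form must use lsl #0 or lsl #12, e.g.
+    // "#-1, lsl #12"; any other shift amount is rejected just below.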
+ if (isShiftedImm()) { + unsigned Shift = ShiftedImm.ShiftAmount; + Expr = ShiftedImm.Val; + if (Shift != 0 && Shift != 12) + return false; + } else + Expr = getImm(); + + // Otherwise it should be a real negative immediate in range: + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr); + return CE != nullptr && CE->getValue() < 0 && -CE->getValue() <= 0xfff; + } + bool isCondCode() const { return Kind == k_CondCode; } + bool isSIMDImmType10() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); + } + bool isBranchTarget26() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); + } + bool isPCRelLabel19() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); + } + bool isBranchTarget14() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); + } + + bool + isMovWSymbol(ArrayRef<AArch64MCExpr::VariantKind> AllowedModifiers) const { + if (!isImm()) + return false; + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind, + DarwinRefKind, Addend)) { + return false; + } + if (DarwinRefKind != MCSymbolRefExpr::VK_None) + return false; + + for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { + if (ELFRefKind == AllowedModifiers[i]) + return Addend == 0; + } + + return false; + } + + bool isMovZSymbolG3() const { + return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); + } + + bool isMovZSymbolG2() const { + return isMovWSymbol({AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_TPREL_G2, + AArch64MCExpr::VK_DTPREL_G2}); + } + + bool isMovZSymbolG1() const { + return isMovWSymbol({ + AArch64MCExpr::VK_ABS_G1, AArch64MCExpr::VK_ABS_G1_S, + AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, + AArch64MCExpr::VK_DTPREL_G1, + }); + } + + bool isMovZSymbolG0() const { + return isMovWSymbol({AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_TPREL_G0, + AArch64MCExpr::VK_DTPREL_G0}); + } + + bool isMovKSymbolG3() const { + return isMovWSymbol(AArch64MCExpr::VK_ABS_G3); + } + + bool isMovKSymbolG2() const { + return isMovWSymbol(AArch64MCExpr::VK_ABS_G2_NC); + } + + bool isMovKSymbolG1() const { + return isMovWSymbol({AArch64MCExpr::VK_ABS_G1_NC, + AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1_NC}); + } + + bool isMovKSymbolG0() const { + return isMovWSymbol( + {AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC}); + } + + template<int RegWidth, int Shift> + bool isMOVZMovAlias() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + + if (RegWidth == 32) + Value &= 0xffffffffULL; + + // "lsl 
#0" takes precedence: in practice this only affects "#0, lsl #0". + if (Value == 0 && Shift != 0) + return false; + + return (Value & ~(0xffffULL << Shift)) == 0; + } + + template<int RegWidth, int Shift> + bool isMOVNMovAlias() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + + // MOVZ takes precedence over MOVN. + for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) + if ((Value & ~(0xffffULL << MOVZShift)) == 0) + return false; + + Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return (Value & ~(0xffffULL << Shift)) == 0; + } + + bool isFPImm() const { return Kind == k_FPImm; } + bool isBarrier() const { return Kind == k_Barrier; } + bool isSysReg() const { return Kind == k_SysReg; } + bool isMRSSystemRegister() const { + if (!isSysReg()) return false; + + return SysReg.MRSReg != -1U; + } + bool isMSRSystemRegister() const { + if (!isSysReg()) return false; + return SysReg.MSRReg != -1U; + } + bool isSystemPStateFieldWithImm0_1() const { + if (!isSysReg()) return false; + return (SysReg.PStateField == AArch64PState::PAN || + SysReg.PStateField == AArch64PState::UAO); + } + bool isSystemPStateFieldWithImm0_15() const { + if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false; + return SysReg.PStateField != -1U; + } + bool isReg() const override { return Kind == k_Register && !Reg.isVector; } + bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isVectorRegLo() const { + return Kind == k_Register && Reg.isVector && + AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( + Reg.RegNum); + } + bool isGPR32as64() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); + } + bool isWSeqPair() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID].contains( + Reg.RegNum); + } + bool isXSeqPair() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID].contains( + Reg.RegNum); + } + + bool isGPR64sp0() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum); + } + + /// Is this a vector list with the type implicit (presumably attached to the + /// instruction itself)? 
+ template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const { + return Kind == k_VectorList && VectorList.Count == NumRegs && + !VectorList.ElementKind; + } + + template <unsigned NumRegs, unsigned NumElements, char ElementKind> + bool isTypedVectorList() const { + if (Kind != k_VectorList) + return false; + if (VectorList.Count != NumRegs) + return false; + if (VectorList.ElementKind != ElementKind) + return false; + return VectorList.NumElements == NumElements; + } + + bool isVectorIndex1() const { + return Kind == k_VectorIndex && VectorIndex.Val == 1; + } + bool isVectorIndexB() const { + return Kind == k_VectorIndex && VectorIndex.Val < 16; + } + bool isVectorIndexH() const { + return Kind == k_VectorIndex && VectorIndex.Val < 8; + } + bool isVectorIndexS() const { + return Kind == k_VectorIndex && VectorIndex.Val < 4; + } + bool isVectorIndexD() const { + return Kind == k_VectorIndex && VectorIndex.Val < 2; + } + bool isToken() const override { return Kind == k_Token; } + bool isTokenEqual(StringRef Str) const { + return Kind == k_Token && getToken() == Str; + } + bool isSysCR() const { return Kind == k_SysCR; } + bool isPrefetch() const { return Kind == k_Prefetch; } + bool isPSBHint() const { return Kind == k_PSBHint; } + bool isShiftExtend() const { return Kind == k_ShiftExtend; } + bool isShifter() const { + if (!isShiftExtend()) + return false; + + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR || + ST == AArch64_AM::MSL); + } + bool isExtend() const { + if (!isShiftExtend()) + return false; + + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB || + ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH || + ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW || + ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; + } + + bool isExtend64() const { + if (!isExtend()) + return false; + // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX; + } + bool isExtendLSL64() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; + } + + template<int Width> bool isMemXExtend() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) && + (getShiftExtendAmount() == Log2_32(Width / 8) || + getShiftExtendAmount() == 0); + } + + template<int Width> bool isMemWExtend() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) && + (getShiftExtendAmount() == Log2_32(Width / 8) || + getShiftExtendAmount() == 0); + } + + template <unsigned width> + bool isArithmeticShifter() const { + if (!isShifter()) + return false; + + // An arithmetic shifter is LSL, LSR, or ASR. 
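+    // Illustrative note (added, hedged): e.g. "add w0, w1, w2, asr #3" uses an
+    // arithmetic shifter; "ror" is only accepted by the logical-shifter
+    // predicate below.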
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR) && getShiftExtendAmount() < width; + } + + template <unsigned width> + bool isLogicalShifter() const { + if (!isShifter()) + return false; + + // A logical shifter is LSL, LSR, ASR or ROR. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) && + getShiftExtendAmount() < width; + } + + bool isMovImm32Shifter() const { + if (!isShifter()) + return false; + + // A MOVi shifter is LSL of 0, 16, 32, or 48. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + if (ST != AArch64_AM::LSL) + return false; + uint64_t Val = getShiftExtendAmount(); + return (Val == 0 || Val == 16); + } + + bool isMovImm64Shifter() const { + if (!isShifter()) + return false; + + // A MOVi shifter is LSL of 0 or 16. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + if (ST != AArch64_AM::LSL) + return false; + uint64_t Val = getShiftExtendAmount(); + return (Val == 0 || Val == 16 || Val == 32 || Val == 48); + } + + bool isLogicalVecShifter() const { + if (!isShifter()) + return false; + + // A logical vector shifter is a left shift by 0, 8, 16, or 24. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::LSL && + (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24); + } + + bool isLogicalVecHalfWordShifter() const { + if (!isLogicalVecShifter()) + return false; + + // A logical vector shifter is a left shift by 0 or 8. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::LSL && + (Shift == 0 || Shift == 8); + } + + bool isMoveVecShifter() const { + if (!isShiftExtend()) + return false; + + // A logical vector shifter is a left shift by 8 or 16. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::MSL && + (Shift == 8 || Shift == 16); + } + + // Fallback unscaled operands are for aliases of LDR/STR that fall back + // to LDUR/STUR when the offset is not legal for the former but is for + // the latter. As such, in addition to checking for being a legal unscaled + // address, also check that it is not a legal scaled address. This avoids + // ambiguity in the matcher. + template<int Width> + bool isSImm9OffsetFB() const { + return isSImm9() && !isUImm12Offset<Width / 8>(); + } + + bool isAdrpLabel() const { + // Validation was handled during parsing, so we just sanity check that + // something didn't go haywire. + if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (4096 * (1LL << (21 - 1))); + int64_t Max = 4096 * ((1LL << (21 - 1)) - 1); + return (Val % 4096) == 0 && Val >= Min && Val <= Max; + } + + return true; + } + + bool isAdrLabel() const { + // Validation was handled during parsing, so we just sanity check that + // something didn't go haywire. + if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (1LL << (21 - 1)); + int64_t Max = ((1LL << (21 - 1)) - 1); + return Val >= Min && Val <= Max; + } + + return true; + } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. Null MCExpr = 0. 
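+    // Illustrative note (added, hedged): a folded constant like "#4" ends up as
+    // MCOperand::createImm(4), while a symbolic value such as ":lo12:sym"
+    // stays an MCExpr operand for the fixup/relocation machinery.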
+ if (!Expr) + Inst.addOperand(MCOperand::createImm(0)); + else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(Expr)); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addGPR32as64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg())); + + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister( + RI->getEncodingValue(getReg())); + + Inst.addOperand(MCOperand::createReg(Reg)); + } + + void addVectorReg64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::createReg(AArch64::D0 + getReg() - AArch64::Q0)); + } + + void addVectorReg128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addVectorRegLoOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + template <unsigned NumRegs> + void addVectorList64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + static const unsigned FirstRegs[] = { AArch64::D0, + AArch64::D0_D1, + AArch64::D0_D1_D2, + AArch64::D0_D1_D2_D3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; + + Inst.addOperand( + MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0)); + } + + template <unsigned NumRegs> + void addVectorList128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + static const unsigned FirstRegs[] = { AArch64::Q0, + AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, + AArch64::Q0_Q1_Q2_Q3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; + + Inst.addOperand( + MCOperand::createReg(FirstReg + getVectorListStart() - AArch64::Q0)); + } + + void addVectorIndex1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getVectorIndex())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // If this is a pageoff symrefexpr with an addend, adjust the addend + // to be only the page-offset portion. Otherwise, just add the expr + // as-is. 
+ addExpr(Inst, getImm()); + } + + void addAddSubImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + if (isShiftedImm()) { + addExpr(Inst, getShiftedImmVal()); + Inst.addOperand(MCOperand::createImm(getShiftedImmShift())); + } else { + addExpr(Inst, getImm()); + Inst.addOperand(MCOperand::createImm(0)); + } + } + + void addAddSubImmNegOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + + const MCExpr *MCE = isShiftedImm() ? getShiftedImmVal() : getImm(); + const MCConstantExpr *CE = cast<MCConstantExpr>(MCE); + int64_t Val = -CE->getValue(); + unsigned ShiftAmt = isShiftedImm() ? ShiftedImm.ShiftAmount : 0; + + Inst.addOperand(MCOperand::createImm(Val)); + Inst.addOperand(MCOperand::createImm(ShiftAmt)); + } + + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getCondCode())); + } + + void addAdrpLabelOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + addExpr(Inst, getImm()); + else + Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 12)); + } + + void addAdrLabelOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + template<int Scale> + void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + + if (!MCE) { + Inst.addOperand(MCOperand::createExpr(getImm())); + return; + } + Inst.addOperand(MCOperand::createImm(MCE->getValue() / Scale)); + } + + void addSImm9Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addSImm7s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue() / 4)); + } + + void addSImm7s8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue() / 8)); + } + + void addSImm7s16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16)); + } + + void addImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm0_7Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm1_8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm0_15Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = 
cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm1_16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm0_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm1_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm1_32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm0_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm1_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm1_64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm0_127Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm0_255Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm0_65535Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addImm32_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + + void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + uint64_t encoding = + AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32); + Inst.addOperand(MCOperand::createImm(encoding)); + } + + void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64); + Inst.addOperand(MCOperand::createImm(encoding)); + } + + void addLogicalImm32NotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE 
= cast<MCConstantExpr>(getImm()); + int64_t Val = ~MCE->getValue() & 0xFFFFFFFF; + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, 32); + Inst.addOperand(MCOperand::createImm(encoding)); + } + + void addLogicalImm64NotOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + uint64_t encoding = + AArch64_AM::encodeLogicalImmediate(~MCE->getValue(), 64); + Inst.addOperand(MCOperand::createImm(encoding)); + } + + void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); + Inst.addOperand(MCOperand::createImm(encoding)); + } + + void addBranchTarget26Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); + } + + void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); + } + + void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. 
+ assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); + } + + void addFPImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getFPImm())); + } + + void addBarrierOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getBarrier())); + } + + void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::createImm(SysReg.MRSReg)); + } + + void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::createImm(SysReg.MSRReg)); + } + + void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); + } + + void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); + } + + void addSysCROperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getSysCR())); + } + + void addPrefetchOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPrefetch())); + } + + void addPSBHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPSBHint())); + } + + void addShifterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Imm = + AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount()); + Inst.addOperand(MCOperand::createImm(Imm)); + } + + void addExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::createImm(Imm)); + } + + void addExtend64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::createImm(Imm)); + } + + void addMemExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::createImm(IsSigned)); + Inst.addOperand(MCOperand::createImm(getShiftExtendAmount() != 0)); + } + + // For 8-bit load/store instructions with a register offset, both the + // "DoShift" and "NoShift" variants have a shift of 0. Because of this, + // they're disambiguated by whether the shift was explicit or implicit rather + // than its size. 
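+  // e.g. "ldrb w0, [x1, x2]" selects the implicit-shift (NoShift) variant,
+  // while "ldrb w0, [x1, x2, lsl #0]" selects the explicit-shift (DoShift)
+  // variant, even though both encode a shift amount of 0.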
+ void addMemExtend8Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::createImm(IsSigned)); + Inst.addOperand(MCOperand::createImm(hasShiftExtendAmount())); + } + + template<int Shift> + void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff)); + } + + template<int Shift> + void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::createImm((~Value >> Shift) & 0xffff)); + } + + void print(raw_ostream &OS) const override; + + static std::unique_ptr<AArch64Operand> + CreateToken(StringRef Str, bool IsSuffix, SMLoc S, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Token, Ctx); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->Tok.IsSuffix = IsSuffix; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<AArch64Operand> + CreateReg(unsigned RegNum, bool isVector, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Register, Ctx); + Op->Reg.RegNum = RegNum; + Op->Reg.isVector = isVector; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AArch64Operand> + CreateVectorList(unsigned RegNum, unsigned Count, unsigned NumElements, + char ElementKind, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_VectorList, Ctx); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.NumElements = NumElements; + Op->VectorList.ElementKind = ElementKind; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AArch64Operand> + CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_VectorIndex, Ctx); + Op->VectorIndex.Val = Idx; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Immediate, Ctx); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreateShiftedImm(const MCExpr *Val, + unsigned ShiftAmount, + SMLoc S, SMLoc E, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_ShiftedImm, Ctx); + Op->ShiftedImm .Val = Val; + Op->ShiftedImm.ShiftAmount = ShiftAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AArch64Operand> + CreateCondCode(AArch64CC::CondCode Code, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_CondCode, Ctx); + Op->CondCode.Code = Code; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreateFPImm(unsigned Val, SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_FPImm, Ctx); + Op->FPImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreateBarrier(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = 
make_unique<AArch64Operand>(k_Barrier, Ctx); + Op->Barrier.Val = Val; + Op->Barrier.Data = Str.data(); + Op->Barrier.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreateSysReg(StringRef Str, SMLoc S, + uint32_t MRSReg, + uint32_t MSRReg, + uint32_t PStateField, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_SysReg, Ctx); + Op->SysReg.Data = Str.data(); + Op->SysReg.Length = Str.size(); + Op->SysReg.MRSReg = MRSReg; + Op->SysReg.MSRReg = MSRReg; + Op->SysReg.PStateField = PStateField; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreateSysCR(unsigned Val, SMLoc S, + SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_SysCR, Ctx); + Op->SysCRImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreatePrefetch(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_Prefetch, Ctx); + Op->Prefetch.Val = Val; + Op->Barrier.Data = Str.data(); + Op->Barrier.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx); + Op->PSBHint.Val = Val; + Op->PSBHint.Data = Str.data(); + Op->PSBHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static std::unique_ptr<AArch64Operand> + CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, + bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_ShiftExtend, Ctx); + Op->ShiftExtend.Type = ShOp; + Op->ShiftExtend.Amount = Val; + Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } +}; + +} // end anonymous namespace. 
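+// Illustrative sketch only: the tryParse* routines further down build operands
+// with the Create* factories above and append them to the operand vector,
+// roughly as follows ("tryParseExampleImm" is a hypothetical name, not a
+// routine defined in this file):
+//
+//   AArch64AsmParser::OperandMatchResultTy
+//   AArch64AsmParser::tryParseExampleImm(OperandVector &Operands) {
+//     SMLoc S = getLoc();
+//     if (getParser().getTok().is(AsmToken::Hash))
+//       getParser().Lex(); // Eat the optional '#'.
+//     const MCExpr *Expr;
+//     if (getParser().parseExpression(Expr))
+//       return MatchOperand_ParseFail;
+//     SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+//     Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+//     return MatchOperand_Success;
+//   }
+//
+// The real routines below follow this pattern, with per-operand validation.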
+ +void AArch64Operand::print(raw_ostream &OS) const { + switch (Kind) { + case k_FPImm: + OS << "<fpimm " << getFPImm() << "(" + << AArch64_AM::getFPImmFloat(getFPImm()) << ") >"; + break; + case k_Barrier: { + StringRef Name = getBarrierName(); + if (!Name.empty()) + OS << "<barrier " << Name << ">"; + else + OS << "<barrier invalid #" << getBarrier() << ">"; + break; + } + case k_Immediate: + OS << *getImm(); + break; + case k_ShiftedImm: { + unsigned Shift = getShiftedImmShift(); + OS << "<shiftedimm "; + OS << *getShiftedImmVal(); + OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">"; + break; + } + case k_CondCode: + OS << "<condcode " << getCondCode() << ">"; + break; + case k_Register: + OS << "<register " << getReg() << ">"; + break; + case k_VectorList: { + OS << "<vectorlist "; + unsigned Reg = getVectorListStart(); + for (unsigned i = 0, e = getVectorListCount(); i != e; ++i) + OS << Reg + i << " "; + OS << ">"; + break; + } + case k_VectorIndex: + OS << "<vectorindex " << getVectorIndex() << ">"; + break; + case k_SysReg: + OS << "<sysreg: " << getSysReg() << '>'; + break; + case k_Token: + OS << "'" << getToken() << "'"; + break; + case k_SysCR: + OS << "c" << getSysCR(); + break; + case k_Prefetch: { + StringRef Name = getPrefetchName(); + if (!Name.empty()) + OS << "<prfop " << Name << ">"; + else + OS << "<prfop invalid #" << getPrefetch() << ">"; + break; + } + case k_PSBHint: { + OS << getPSBHintName(); + break; + } + case k_ShiftExtend: { + OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" + << getShiftExtendAmount(); + if (!hasShiftExtendAmount()) + OS << "<imp>"; + OS << '>'; + break; + } + } +} + +/// @name Auto-generated Match Functions +/// { + +static unsigned MatchRegisterName(StringRef Name); + +/// } + +static unsigned matchVectorRegName(StringRef Name) { + return StringSwitch<unsigned>(Name.lower()) + .Case("v0", AArch64::Q0) + .Case("v1", AArch64::Q1) + .Case("v2", AArch64::Q2) + .Case("v3", AArch64::Q3) + .Case("v4", AArch64::Q4) + .Case("v5", AArch64::Q5) + .Case("v6", AArch64::Q6) + .Case("v7", AArch64::Q7) + .Case("v8", AArch64::Q8) + .Case("v9", AArch64::Q9) + .Case("v10", AArch64::Q10) + .Case("v11", AArch64::Q11) + .Case("v12", AArch64::Q12) + .Case("v13", AArch64::Q13) + .Case("v14", AArch64::Q14) + .Case("v15", AArch64::Q15) + .Case("v16", AArch64::Q16) + .Case("v17", AArch64::Q17) + .Case("v18", AArch64::Q18) + .Case("v19", AArch64::Q19) + .Case("v20", AArch64::Q20) + .Case("v21", AArch64::Q21) + .Case("v22", AArch64::Q22) + .Case("v23", AArch64::Q23) + .Case("v24", AArch64::Q24) + .Case("v25", AArch64::Q25) + .Case("v26", AArch64::Q26) + .Case("v27", AArch64::Q27) + .Case("v28", AArch64::Q28) + .Case("v29", AArch64::Q29) + .Case("v30", AArch64::Q30) + .Case("v31", AArch64::Q31) + .Default(0); +} + +static bool isValidVectorKind(StringRef Name) { + return StringSwitch<bool>(Name.lower()) + .Case(".8b", true) + .Case(".16b", true) + .Case(".4h", true) + .Case(".8h", true) + .Case(".2s", true) + .Case(".4s", true) + .Case(".1d", true) + .Case(".2d", true) + .Case(".1q", true) + // Accept the width neutral ones, too, for verbose syntax. If those + // aren't used in the right places, the token operand won't match so + // all will work out. 
+ .Case(".b", true) + .Case(".h", true) + .Case(".s", true) + .Case(".d", true) + // Needed for fp16 scalar pairwise reductions + .Case(".2h", true) + .Default(false); +} + +static void parseValidVectorKind(StringRef Name, unsigned &NumElements, + char &ElementKind) { + assert(isValidVectorKind(Name)); + + ElementKind = Name.lower()[Name.size() - 1]; + NumElements = 0; + + if (Name.size() == 2) + return; + + // Parse the lane count + Name = Name.drop_front(); + while (isdigit(Name.front())) { + NumElements = 10 * NumElements + (Name.front() - '0'); + Name = Name.drop_front(); + } +} + +bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + StartLoc = getLoc(); + RegNo = tryParseRegister(); + EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); + return (RegNo == (unsigned)-1); +} + +// Matches a register name or register alias previously defined by '.req' +unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, + bool isVector) { + unsigned RegNum = isVector ? matchVectorRegName(Name) + : MatchRegisterName(Name); + + if (RegNum == 0) { + // Check for aliases registered via .req. Canonicalize to lower case. + // That's more consistent since register names are case insensitive, and + // it's how the original entry was passed in from MC/MCParser/AsmParser. + auto Entry = RegisterReqs.find(Name.lower()); + if (Entry == RegisterReqs.end()) + return 0; + // set RegNum if the match is the right kind of register + if (isVector == Entry->getValue().first) + RegNum = Entry->getValue().second; + } + return RegNum; +} + +/// tryParseRegister - Try to parse a register name. The token must be an +/// Identifier when called, and if it is a register name the token is eaten and +/// the register is added to the operand list. +int AArch64AsmParser::tryParseRegister() { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + std::string lowerCase = Tok.getString().lower(); + unsigned RegNum = matchRegisterNameAlias(lowerCase, false); + // Also handle a few aliases of registers. + if (RegNum == 0) + RegNum = StringSwitch<unsigned>(lowerCase) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0); + + if (RegNum == 0) + return -1; + + Parser.Lex(); // Eat identifier token. + return RegNum; +} + +/// tryMatchVectorRegister - Try to parse a vector register name with optional +/// kind specifier. If it is a register specifier, eat the token and return it. +int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { + MCAsmParser &Parser = getParser(); + if (Parser.getTok().isNot(AsmToken::Identifier)) { + TokError("vector register expected"); + return -1; + } + + StringRef Name = Parser.getTok().getString(); + // If there is a kind specifier, it's separated from the register name by + // a '.'. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + unsigned RegNum = matchRegisterNameAlias(Head, true); + + if (RegNum) { + if (Next != StringRef::npos) { + Kind = Name.slice(Next, StringRef::npos); + if (!isValidVectorKind(Kind)) { + TokError("invalid vector kind qualifier"); + return -1; + } + } + Parser.Lex(); // Eat the register token. + return RegNum; + } + + if (expected) + TokError("vector register expected"); + return -1; +} + +/// tryParseSysCROperand - Try to parse a system instruction CR operand name. 
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + StringRef Tok = Parser.getTok().getIdentifier(); + if (Tok[0] != 'c' && Tok[0] != 'C') { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + uint32_t CRNum; + bool BadNum = Tok.drop_front().getAsInteger(10, CRNum); + if (BadNum || CRNum > 15) { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back( + AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext())); + return MatchOperand_Success; +} + +/// tryParsePrefetch - Try to parse a prefetch operand. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + // Either an identifier for named values or a 5-bit immediate. + bool Hash = Tok.is(AsmToken::Hash); + if (Hash || Tok.is(AsmToken::Integer)) { + if (Hash) + Parser.Lex(); // Eat hash token. + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); + if (!MCE) { + TokError("immediate value expected for prefetch operand"); + return MatchOperand_ParseFail; + } + unsigned prfop = MCE->getValue(); + if (prfop > 31) { + TokError("prefetch operand out of range, [0,31] expected"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PRFM::PRFMMapper(); + StringRef Name = + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, + S, getContext())); + return MatchOperand_Success; + } + + if (Tok.isNot(AsmToken::Identifier)) { + TokError("pre-fetch hint expected"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PRFM::PRFMMapper(); + unsigned prfop = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("pre-fetch hint expected"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + +/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PSBHint::PSBHintMapper(); + unsigned psbhint = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + +/// tryParseAdrpLabel - Parse and validate a source label for the ADRP +/// instruction. 
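+/// A bare symbol is treated as an ELF :abs_page: (ADRP) reference; otherwise
+/// only page-style modifiers are accepted (@page, @gotpage, @tlvppage on
+/// MachO; :got:, :gottprel:, :tlsdesc: on ELF).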
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + const MCExpr *Expr; + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat hash token. + } + + if (parseSymbolicImmVal(Expr)) + return MatchOperand_ParseFail; + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + if (DarwinRefKind == MCSymbolRefExpr::VK_None && + ELFRefKind == AArch64MCExpr::VK_INVALID) { + // No modifier was specified at all; this is the syntax for an ELF basic + // ADRP relocation (unfortunately). + Expr = + AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext()); + } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && + Addend != 0) { + Error(S, "gotpage label reference not allowed an addend"); + return MatchOperand_ParseFail; + } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && + DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && + DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && + ELFRefKind != AArch64MCExpr::VK_GOT_PAGE && + ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE && + ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) { + // The operand must be an @page or @gotpage qualified symbolref. + Error(S, "page or gotpage label reference expected"); + return MatchOperand_ParseFail; + } + } + + // We have either a label reference possibly with addend or an immediate. The + // addend is a raw value here. The linker will adjust it to only reference the + // page. + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + + return MatchOperand_Success; +} + +/// tryParseAdrLabel - Parse and validate a source label for the ADR +/// instruction. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + const MCExpr *Expr; + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat hash token. + } + + if (getParser().parseExpression(Expr)) + return MatchOperand_ParseFail; + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + + return MatchOperand_Success; +} + +/// tryParseFPImm - A floating point immediate expression operand. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + + bool Hash = false; + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat '#' + Hash = true; + } + + // Handle negation, as that still comes through as a separate token. + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + Parser.Lex(); + } + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + if (isNegative) + RealVal.changeSign(); + + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + Parser.Lex(); // Eat the token. + // Check for out of range values. As an exception, we let Zero through, + // as we handle that special case in post-processing before matching in + // order to use the zero register for it. 
+ if (Val == -1 && !RealVal.isPosZero()) { + TokError("expected compatible register or floating-point constant"); + return MatchOperand_ParseFail; + } + Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + if (Tok.is(AsmToken::Integer)) { + int64_t Val; + if (!isNegative && Tok.getString().startswith("0x")) { + Val = Tok.getIntVal(); + if (Val > 255 || Val < 0) { + TokError("encoded floating point value out of range"); + return MatchOperand_ParseFail; + } + } else { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + // If we had a '-' in front, toggle the sign bit. + IntVal ^= (uint64_t)isNegative << 63; + Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + } + Parser.Lex(); // Eat the token. + Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + + if (!Hash) + return MatchOperand_NoMatch; + + TokError("invalid floating point immediate"); + return MatchOperand_ParseFail; +} + +/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat '#' + else if (Parser.getTok().isNot(AsmToken::Integer)) + // Operand should start from # or should be integer, emit error otherwise. + return MatchOperand_NoMatch; + + const MCExpr *Imm; + if (parseSymbolicImmVal(Imm)) + return MatchOperand_ParseFail; + else if (Parser.getTok().isNot(AsmToken::Comma)) { + uint64_t ShiftAmount = 0; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm); + if (MCE) { + int64_t Val = MCE->getValue(); + if (Val > 0xfff && (Val & 0xfff) == 0) { + Imm = MCConstantExpr::create(Val >> 12, getContext()); + ShiftAmount = 12; + } + } + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E, + getContext())); + return MatchOperand_Success; + } + + // Eat ',' + Parser.Lex(); + + // The optional operand must be "lsl #N" where N is non-negative. + if (!Parser.getTok().is(AsmToken::Identifier) || + !Parser.getTok().getIdentifier().equals_lower("lsl")) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } + + // Eat 'lsl' + Parser.Lex(); + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); + } + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } + + int64_t ShiftAmount = Parser.getTok().getIntVal(); + + if (ShiftAmount < 0) { + Error(Parser.getTok().getLoc(), "positive shift amount required"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat the number + + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, + S, E, getContext())); + return MatchOperand_Success; +} + +/// parseCondCodeString - Parse a Condition Code string. 
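+/// e.g. "eq" maps to AArch64CC::EQ; an unrecognized string yields
+/// AArch64CC::Invalid.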
+AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { + AArch64CC::CondCode CC = StringSwitch<AArch64CC::CondCode>(Cond.lower()) + .Case("eq", AArch64CC::EQ) + .Case("ne", AArch64CC::NE) + .Case("cs", AArch64CC::HS) + .Case("hs", AArch64CC::HS) + .Case("cc", AArch64CC::LO) + .Case("lo", AArch64CC::LO) + .Case("mi", AArch64CC::MI) + .Case("pl", AArch64CC::PL) + .Case("vs", AArch64CC::VS) + .Case("vc", AArch64CC::VC) + .Case("hi", AArch64CC::HI) + .Case("ls", AArch64CC::LS) + .Case("ge", AArch64CC::GE) + .Case("lt", AArch64CC::LT) + .Case("gt", AArch64CC::GT) + .Case("le", AArch64CC::LE) + .Case("al", AArch64CC::AL) + .Case("nv", AArch64CC::NV) + .Default(AArch64CC::Invalid); + return CC; +} + +/// parseCondCode - Parse a Condition Code operand. +bool AArch64AsmParser::parseCondCode(OperandVector &Operands, + bool invertCondCode) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + StringRef Cond = Tok.getString(); + AArch64CC::CondCode CC = parseCondCodeString(Cond); + if (CC == AArch64CC::Invalid) + return TokError("invalid condition code"); + Parser.Lex(); // Eat identifier token. + + if (invertCondCode) { + if (CC == AArch64CC::AL || CC == AArch64CC::NV) + return TokError("condition codes AL and NV are invalid for this instruction"); + CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC)); + } + + Operands.push_back( + AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext())); + return false; +} + +/// tryParseOptionalShift - Some operands take an optional shift argument. Parse +/// them if present. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + std::string LowerID = Tok.getString().lower(); + AArch64_AM::ShiftExtendType ShOp = + StringSwitch<AArch64_AM::ShiftExtendType>(LowerID) + .Case("lsl", AArch64_AM::LSL) + .Case("lsr", AArch64_AM::LSR) + .Case("asr", AArch64_AM::ASR) + .Case("ror", AArch64_AM::ROR) + .Case("msl", AArch64_AM::MSL) + .Case("uxtb", AArch64_AM::UXTB) + .Case("uxth", AArch64_AM::UXTH) + .Case("uxtw", AArch64_AM::UXTW) + .Case("uxtx", AArch64_AM::UXTX) + .Case("sxtb", AArch64_AM::SXTB) + .Case("sxth", AArch64_AM::SXTH) + .Case("sxtw", AArch64_AM::SXTW) + .Case("sxtx", AArch64_AM::SXTX) + .Default(AArch64_AM::InvalidShiftExtend); + + if (ShOp == AArch64_AM::InvalidShiftExtend) + return MatchOperand_NoMatch; + + SMLoc S = Tok.getLoc(); + Parser.Lex(); + + bool Hash = getLexer().is(AsmToken::Hash); + if (!Hash && getLexer().isNot(AsmToken::Integer)) { + if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR || + ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR || + ShOp == AArch64_AM::MSL) { + // We expect a number here. + TokError("expected #imm after shift specifier"); + return MatchOperand_ParseFail; + } + + // "extend" type operatoins don't need an immediate, #0 is implicit. + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back( + AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext())); + return MatchOperand_Success; + } + + if (Hash) + Parser.Lex(); // Eat the '#'. + + // Make sure we do actually have a number or a parenthesized expression. 
+ SMLoc E = Parser.getTok().getLoc(); + if (!Parser.getTok().is(AsmToken::Integer) && + !Parser.getTok().is(AsmToken::LParen)) { + Error(E, "expected integer shift amount"); + return MatchOperand_ParseFail; + } + + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); + if (!MCE) { + Error(E, "expected constant '#imm' after shift specifier"); + return MatchOperand_ParseFail; + } + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateShiftExtend( + ShOp, MCE->getValue(), true, S, E, getContext())); + return MatchOperand_Success; +} + +/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for +/// the SYS instruction. Parse them specially so that we create a SYS MCInst. +bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + if (Name.find('.') != StringRef::npos) + return TokError("invalid operand"); + + Mnemonic = Name; + Operands.push_back( + AArch64Operand::CreateToken("sys", false, NameLoc, getContext())); + + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + StringRef Op = Tok.getString(); + SMLoc S = Tok.getLoc(); + + const MCExpr *Expr = nullptr; + +#define SYS_ALIAS(op1, Cn, Cm, op2) \ + do { \ + Expr = MCConstantExpr::create(op1, getContext()); \ + Operands.push_back( \ + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ + Operands.push_back( \ + AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ + Operands.push_back( \ + AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ + Expr = MCConstantExpr::create(op2, getContext()); \ + Operands.push_back( \ + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ + } while (0) + + if (Mnemonic == "ic") { + if (!Op.compare_lower("ialluis")) { + // SYS #0, C7, C1, #0 + SYS_ALIAS(0, 7, 1, 0); + } else if (!Op.compare_lower("iallu")) { + // SYS #0, C7, C5, #0 + SYS_ALIAS(0, 7, 5, 0); + } else if (!Op.compare_lower("ivau")) { + // SYS #3, C7, C5, #1 + SYS_ALIAS(3, 7, 5, 1); + } else { + return TokError("invalid operand for IC instruction"); + } + } else if (Mnemonic == "dc") { + if (!Op.compare_lower("zva")) { + // SYS #3, C7, C4, #1 + SYS_ALIAS(3, 7, 4, 1); + } else if (!Op.compare_lower("ivac")) { + // SYS #3, C7, C6, #1 + SYS_ALIAS(0, 7, 6, 1); + } else if (!Op.compare_lower("isw")) { + // SYS #0, C7, C6, #2 + SYS_ALIAS(0, 7, 6, 2); + } else if (!Op.compare_lower("cvac")) { + // SYS #3, C7, C10, #1 + SYS_ALIAS(3, 7, 10, 1); + } else if (!Op.compare_lower("csw")) { + // SYS #0, C7, C10, #2 + SYS_ALIAS(0, 7, 10, 2); + } else if (!Op.compare_lower("cvau")) { + // SYS #3, C7, C11, #1 + SYS_ALIAS(3, 7, 11, 1); + } else if (!Op.compare_lower("civac")) { + // SYS #3, C7, C14, #1 + SYS_ALIAS(3, 7, 14, 1); + } else if (!Op.compare_lower("cisw")) { + // SYS #0, C7, C14, #2 + SYS_ALIAS(0, 7, 14, 2); + } else if (!Op.compare_lower("cvap")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #3, C7, C12, #1 + SYS_ALIAS(3, 7, 12, 1); + } else { + return TokError("DC CVAP requires ARMv8.2a"); + } + } else { + return TokError("invalid operand for DC instruction"); + } + } else if (Mnemonic == "at") { + if (!Op.compare_lower("s1e1r")) { + // SYS #0, C7, C8, #0 + SYS_ALIAS(0, 7, 8, 0); + } else if (!Op.compare_lower("s1e2r")) { + // SYS #4, C7, C8, #0 + SYS_ALIAS(4, 7, 8, 0); + } else if (!Op.compare_lower("s1e3r")) { + // SYS #6, C7, C8, #0 + 
SYS_ALIAS(6, 7, 8, 0); + } else if (!Op.compare_lower("s1e1w")) { + // SYS #0, C7, C8, #1 + SYS_ALIAS(0, 7, 8, 1); + } else if (!Op.compare_lower("s1e2w")) { + // SYS #4, C7, C8, #1 + SYS_ALIAS(4, 7, 8, 1); + } else if (!Op.compare_lower("s1e3w")) { + // SYS #6, C7, C8, #1 + SYS_ALIAS(6, 7, 8, 1); + } else if (!Op.compare_lower("s1e0r")) { + // SYS #0, C7, C8, #3 + SYS_ALIAS(0, 7, 8, 2); + } else if (!Op.compare_lower("s1e0w")) { + // SYS #0, C7, C8, #3 + SYS_ALIAS(0, 7, 8, 3); + } else if (!Op.compare_lower("s12e1r")) { + // SYS #4, C7, C8, #4 + SYS_ALIAS(4, 7, 8, 4); + } else if (!Op.compare_lower("s12e1w")) { + // SYS #4, C7, C8, #5 + SYS_ALIAS(4, 7, 8, 5); + } else if (!Op.compare_lower("s12e0r")) { + // SYS #4, C7, C8, #6 + SYS_ALIAS(4, 7, 8, 6); + } else if (!Op.compare_lower("s12e0w")) { + // SYS #4, C7, C8, #7 + SYS_ALIAS(4, 7, 8, 7); + } else if (!Op.compare_lower("s1e1rp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #0 + SYS_ALIAS(0, 7, 9, 0); + } else { + return TokError("AT S1E1RP requires ARMv8.2a"); + } + } else if (!Op.compare_lower("s1e1wp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #1 + SYS_ALIAS(0, 7, 9, 1); + } else { + return TokError("AT S1E1WP requires ARMv8.2a"); + } + } else { + return TokError("invalid operand for AT instruction"); + } + } else if (Mnemonic == "tlbi") { + if (!Op.compare_lower("vmalle1is")) { + // SYS #0, C8, C3, #0 + SYS_ALIAS(0, 8, 3, 0); + } else if (!Op.compare_lower("alle2is")) { + // SYS #4, C8, C3, #0 + SYS_ALIAS(4, 8, 3, 0); + } else if (!Op.compare_lower("alle3is")) { + // SYS #6, C8, C3, #0 + SYS_ALIAS(6, 8, 3, 0); + } else if (!Op.compare_lower("vae1is")) { + // SYS #0, C8, C3, #1 + SYS_ALIAS(0, 8, 3, 1); + } else if (!Op.compare_lower("vae2is")) { + // SYS #4, C8, C3, #1 + SYS_ALIAS(4, 8, 3, 1); + } else if (!Op.compare_lower("vae3is")) { + // SYS #6, C8, C3, #1 + SYS_ALIAS(6, 8, 3, 1); + } else if (!Op.compare_lower("aside1is")) { + // SYS #0, C8, C3, #2 + SYS_ALIAS(0, 8, 3, 2); + } else if (!Op.compare_lower("vaae1is")) { + // SYS #0, C8, C3, #3 + SYS_ALIAS(0, 8, 3, 3); + } else if (!Op.compare_lower("alle1is")) { + // SYS #4, C8, C3, #4 + SYS_ALIAS(4, 8, 3, 4); + } else if (!Op.compare_lower("vale1is")) { + // SYS #0, C8, C3, #5 + SYS_ALIAS(0, 8, 3, 5); + } else if (!Op.compare_lower("vaale1is")) { + // SYS #0, C8, C3, #7 + SYS_ALIAS(0, 8, 3, 7); + } else if (!Op.compare_lower("vmalle1")) { + // SYS #0, C8, C7, #0 + SYS_ALIAS(0, 8, 7, 0); + } else if (!Op.compare_lower("alle2")) { + // SYS #4, C8, C7, #0 + SYS_ALIAS(4, 8, 7, 0); + } else if (!Op.compare_lower("vale2is")) { + // SYS #4, C8, C3, #5 + SYS_ALIAS(4, 8, 3, 5); + } else if (!Op.compare_lower("vale3is")) { + // SYS #6, C8, C3, #5 + SYS_ALIAS(6, 8, 3, 5); + } else if (!Op.compare_lower("alle3")) { + // SYS #6, C8, C7, #0 + SYS_ALIAS(6, 8, 7, 0); + } else if (!Op.compare_lower("vae1")) { + // SYS #0, C8, C7, #1 + SYS_ALIAS(0, 8, 7, 1); + } else if (!Op.compare_lower("vae2")) { + // SYS #4, C8, C7, #1 + SYS_ALIAS(4, 8, 7, 1); + } else if (!Op.compare_lower("vae3")) { + // SYS #6, C8, C7, #1 + SYS_ALIAS(6, 8, 7, 1); + } else if (!Op.compare_lower("aside1")) { + // SYS #0, C8, C7, #2 + SYS_ALIAS(0, 8, 7, 2); + } else if (!Op.compare_lower("vaae1")) { + // SYS #0, C8, C7, #3 + SYS_ALIAS(0, 8, 7, 3); + } else if (!Op.compare_lower("alle1")) { + // SYS #4, C8, C7, #4 + SYS_ALIAS(4, 8, 7, 4); + } else if (!Op.compare_lower("vale1")) { + // SYS #0, C8, C7, #5 + SYS_ALIAS(0, 8, 7, 5); + } else if 
(!Op.compare_lower("vale2")) { + // SYS #4, C8, C7, #5 + SYS_ALIAS(4, 8, 7, 5); + } else if (!Op.compare_lower("vale3")) { + // SYS #6, C8, C7, #5 + SYS_ALIAS(6, 8, 7, 5); + } else if (!Op.compare_lower("vaale1")) { + // SYS #0, C8, C7, #7 + SYS_ALIAS(0, 8, 7, 7); + } else if (!Op.compare_lower("ipas2e1")) { + // SYS #4, C8, C4, #1 + SYS_ALIAS(4, 8, 4, 1); + } else if (!Op.compare_lower("ipas2le1")) { + // SYS #4, C8, C4, #5 + SYS_ALIAS(4, 8, 4, 5); + } else if (!Op.compare_lower("ipas2e1is")) { + // SYS #4, C8, C4, #1 + SYS_ALIAS(4, 8, 0, 1); + } else if (!Op.compare_lower("ipas2le1is")) { + // SYS #4, C8, C4, #5 + SYS_ALIAS(4, 8, 0, 5); + } else if (!Op.compare_lower("vmalls12e1")) { + // SYS #4, C8, C7, #6 + SYS_ALIAS(4, 8, 7, 6); + } else if (!Op.compare_lower("vmalls12e1is")) { + // SYS #4, C8, C3, #6 + SYS_ALIAS(4, 8, 3, 6); + } else { + return TokError("invalid operand for TLBI instruction"); + } + } + +#undef SYS_ALIAS + + Parser.Lex(); // Eat operand. + + bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); + bool HasRegister = false; + + // Check for the optional register operand. + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat comma. + + if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands)) + return TokError("expected register operand"); + + HasRegister = true; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Parser.eatToEndOfStatement(); + return TokError("unexpected token in argument list"); + } + + if (ExpectRegister && !HasRegister) { + return TokError("specified " + Mnemonic + " op requires a register"); + } + else if (!ExpectRegister && HasRegister) { + return TokError("specified " + Mnemonic + " op does not use a register"); + } + + Parser.Lex(); // Consume the EndOfStatement + return false; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + + // Can be either a #imm style literal or an option name + bool Hash = Tok.is(AsmToken::Hash); + if (Hash || Tok.is(AsmToken::Integer)) { + // Immediate operand. 
+ if (Hash) + Parser.Lex(); // Eat the '#' + const MCExpr *ImmVal; + SMLoc ExprLoc = getLoc(); + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); + if (!MCE) { + Error(ExprLoc, "immediate value expected for barrier operand"); + return MatchOperand_ParseFail; + } + if (MCE->getValue() < 0 || MCE->getValue() > 15) { + Error(ExprLoc, "barrier operand out of range"); + return MatchOperand_ParseFail; + } + bool Valid; + auto Mapper = AArch64DB::DBarrierMapper(); + StringRef Name = + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); + Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, + ExprLoc, getContext())); + return MatchOperand_Success; + } + + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64DB::DBarrierMapper(); + unsigned Opt = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid barrier option name"); + return MatchOperand_ParseFail; + } + + // The only valid named option for ISB is 'sy' + if (Mnemonic == "isb" && Opt != AArch64DB::SY) { + TokError("'sy' or #imm operand expected"); + return MatchOperand_ParseFail; + } + + Operands.push_back( AArch64Operand::CreateBarrier(Opt, Tok.getString(), + getLoc(), getContext())); + Parser.Lex(); // Consume the option + + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + + if (Tok.isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + bool IsKnown; + auto MRSMapper = AArch64SysReg::MRSMapper(); + uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); + assert(IsKnown == (MRSReg != -1U) && + "register should be -1 if and only if it's unknown"); + + auto MSRMapper = AArch64SysReg::MSRMapper(); + uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); + assert(IsKnown == (MSRReg != -1U) && + "register should be -1 if and only if it's unknown"); + + auto PStateMapper = AArch64PState::PStateMapper(); + uint32_t PStateField = + PStateMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); + assert(IsKnown == (PStateField != -1U) && + "register should be -1 if and only if it's unknown"); + + Operands.push_back(AArch64Operand::CreateSysReg( + Tok.getString(), getLoc(), MRSReg, MSRReg, PStateField, getContext())); + Parser.Lex(); // Eat identifier + + return MatchOperand_Success; +} + +/// tryParseVectorRegister - Parse a vector register operand. +bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + if (Parser.getTok().isNot(AsmToken::Identifier)) + return true; + + SMLoc S = getLoc(); + // Check for a vector register specifier first. + StringRef Kind; + int64_t Reg = tryMatchVectorRegister(Kind, false); + if (Reg == -1) + return true; + Operands.push_back( + AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); + // If there was an explicit qualifier, that goes on as a literal text + // operand. + if (!Kind.empty()) + Operands.push_back( + AArch64Operand::CreateToken(Kind, false, S, getContext())); + + // If there is an index specifier following the register, parse that too. 
+ if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = getLoc(); + Parser.Lex(); // Eat left bracket token. + + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return false; + } + + SMLoc E = getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return false; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx, + E, getContext())); + } + + return false; +} + +/// parseRegister - Parse a non-vector register operand. +bool AArch64AsmParser::parseRegister(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + SMLoc S = getLoc(); + // Try for a vector register. + if (!tryParseVectorRegister(Operands)) + return false; + + // Try for a scalar register. + int64_t Reg = tryParseRegister(); + if (Reg == -1) + return true; + Operands.push_back( + AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); + + // A small number of instructions (FMOVXDhighr, for example) have "[1]" + // as a string token in the instruction itself. + if (getLexer().getKind() == AsmToken::LBrac) { + SMLoc LBracS = getLoc(); + Parser.Lex(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Integer)) { + SMLoc IntS = getLoc(); + int64_t Val = Tok.getIntVal(); + if (Val == 1) { + Parser.Lex(); + if (getLexer().getKind() == AsmToken::RBrac) { + SMLoc RBracS = getLoc(); + Parser.Lex(); + Operands.push_back( + AArch64Operand::CreateToken("[", false, LBracS, getContext())); + Operands.push_back( + AArch64Operand::CreateToken("1", false, IntS, getContext())); + Operands.push_back( + AArch64Operand::CreateToken("]", false, RBracS, getContext())); + return false; + } + } + } + } + + return false; +} + +bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { + MCAsmParser &Parser = getParser(); + bool HasELFModifier = false; + AArch64MCExpr::VariantKind RefKind; + + if (Parser.getTok().is(AsmToken::Colon)) { + Parser.Lex(); // Eat ':" + HasELFModifier = true; + + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), + "expect relocation specifier in operand after ':'"); + return true; + } + + std::string LowerCase = Parser.getTok().getIdentifier().lower(); + RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase) + .Case("lo12", AArch64MCExpr::VK_LO12) + .Case("abs_g3", AArch64MCExpr::VK_ABS_G3) + .Case("abs_g2", AArch64MCExpr::VK_ABS_G2) + .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S) + .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC) + .Case("abs_g1", AArch64MCExpr::VK_ABS_G1) + .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S) + .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC) + .Case("abs_g0", AArch64MCExpr::VK_ABS_G0) + .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S) + .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC) + .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2) + .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1) + .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC) + .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0) + .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC) + .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12) + .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12) + .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC) + .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2) + .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1) + .Case("tprel_g1_nc", 
AArch64MCExpr::VK_TPREL_G1_NC) + .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0) + .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC) + .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12) + .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12) + .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC) + .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12) + .Case("got", AArch64MCExpr::VK_GOT_PAGE) + .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12) + .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE) + .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC) + .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1) + .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC) + .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE) + .Default(AArch64MCExpr::VK_INVALID); + + if (RefKind == AArch64MCExpr::VK_INVALID) { + Error(Parser.getTok().getLoc(), + "expect relocation specifier in operand after ':'"); + return true; + } + + Parser.Lex(); // Eat identifier + + if (Parser.getTok().isNot(AsmToken::Colon)) { + Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier"); + return true; + } + Parser.Lex(); // Eat ':' + } + + if (getParser().parseExpression(ImmVal)) + return true; + + if (HasELFModifier) + ImmVal = AArch64MCExpr::create(ImmVal, RefKind, getContext()); + + return false; +} + +/// parseVectorList - Parse a vector list operand for AdvSIMD instructions. +bool AArch64AsmParser::parseVectorList(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket"); + SMLoc S = getLoc(); + Parser.Lex(); // Eat left bracket token. + StringRef Kind; + int64_t FirstReg = tryMatchVectorRegister(Kind, true); + if (FirstReg == -1) + return true; + int64_t PrevReg = FirstReg; + unsigned Count = 1; + + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. + + SMLoc Loc = getLoc(); + StringRef NextKind; + int64_t Reg = tryMatchVectorRegister(NextKind, true); + if (Reg == -1) + return true; + // Any Kind suffices must match on all regs in the list. + if (Kind != NextKind) + return Error(Loc, "mismatched register size suffix"); + + unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg); + + if (Space == 0 || Space > 3) { + return Error(Loc, "invalid number of vectors"); + } + + Count += Space; + } + else { + while (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma token. + + SMLoc Loc = getLoc(); + StringRef NextKind; + int64_t Reg = tryMatchVectorRegister(NextKind, true); + if (Reg == -1) + return true; + // Any Kind suffices must match on all regs in the list. + if (Kind != NextKind) + return Error(Loc, "mismatched register size suffix"); + + // Registers must be incremental (with wraparound at 31) + if (getContext().getRegisterInfo()->getEncodingValue(Reg) != + (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) + return Error(Loc, "registers must be sequential"); + + PrevReg = Reg; + ++Count; + } + } + + if (Parser.getTok().isNot(AsmToken::RCurly)) + return Error(getLoc(), "'}' expected"); + Parser.Lex(); // Eat the '}' token. + + if (Count > 4) + return Error(S, "invalid number of vectors"); + + unsigned NumElements = 0; + char ElementKind = 0; + if (!Kind.empty()) + parseValidVectorKind(Kind, NumElements, ElementKind); + + Operands.push_back(AArch64Operand::CreateVectorList( + FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext())); + + // If there is an index specifier following the list, parse that too. 
+ if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = getLoc(); + Parser.Lex(); // Eat left bracket token. + + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return false; + } + + SMLoc E = getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return false; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx, + E, getContext())); + } + return false; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + unsigned RegNum = matchRegisterNameAlias(Tok.getString().lower(), false); + + MCContext &Ctx = getContext(); + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum)) + return MatchOperand_NoMatch; + + SMLoc S = getLoc(); + Parser.Lex(); // Eat register + + if (Parser.getTok().isNot(AsmToken::Comma)) { + Operands.push_back( + AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + return MatchOperand_Success; + } + Parser.Lex(); // Eat comma. + + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat hash + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(getLoc(), "index must be absent or #0"); + return MatchOperand_ParseFail; + } + + const MCExpr *ImmVal; + if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) || + cast<MCConstantExpr>(ImmVal)->getValue() != 0) { + Error(getLoc(), "index must be absent or #0"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + return MatchOperand_Success; +} + +/// parseOperand - Parse a arm instruction operand. For now this parses the +/// operand regardless of the mnemonic. +bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode) { + MCAsmParser &Parser = getParser(); + // Check if the current operand has a custom associated parser, if so, try to + // custom parse the operand, or fallback to the general approach. + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + if (ResTy == MatchOperand_Success) + return false; + // If there wasn't a custom match, try the generic matcher below. Otherwise, + // there was a match, but an error occurred, in which case, just return that + // the operand parsing failed. + if (ResTy == MatchOperand_ParseFail) + return true; + + // Nothing custom, so do general case parsing. + SMLoc S, E; + switch (getLexer().getKind()) { + default: { + SMLoc S = getLoc(); + const MCExpr *Expr; + if (parseSymbolicImmVal(Expr)) + return Error(S, "invalid operand"); + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + return false; + } + case AsmToken::LBrac: { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("[", false, Loc, + getContext())); + Parser.Lex(); // Eat '[' + + // There's no comma after a '[', so we can parse the next operand + // immediately. 
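+ // For example (editorial annotation, not upstream code): in "ldr x0, [x1, #8]"
+ // the base register x1 is picked up by this recursive call right after the
+ // '[' token has been consumed.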
+ return parseOperand(Operands, false, false); + } + case AsmToken::LCurly: + return parseVectorList(Operands); + case AsmToken::Identifier: { + // If we're expecting a Condition Code operand, then just parse that. + if (isCondCode) + return parseCondCode(Operands, invertCondCode); + + // If it's a register name, parse it. + if (!parseRegister(Operands)) + return false; + + // This could be an optional "shift" or "extend" operand. + OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands); + // We can only continue if no tokens were eaten. + if (GotShift != MatchOperand_NoMatch) + return GotShift; + + // This was not a register so parse other operands that start with an + // identifier (like labels) as expressions and create them as immediates. + const MCExpr *IdVal; + S = getLoc(); + if (getParser().parseExpression(IdVal)) + return true; + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext())); + return false; + } + case AsmToken::Integer: + case AsmToken::Real: + case AsmToken::Hash: { + // #42 -> immediate. + S = getLoc(); + if (getLexer().is(AsmToken::Hash)) + Parser.Lex(); + + // Parse a negative sign + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + // We need to consume this token only when we have a Real, otherwise + // we let parseSymbolicImmVal take care of it + if (Parser.getLexer().peekTok().is(AsmToken::Real)) + Parser.Lex(); + } + + // The only Real that should come through here is a literal #0.0 for + // the fcmp[e] r, #0.0 instructions. They expect raw token operands, + // so convert the value. + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" && + Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" && + Mnemonic != "fcmlt") + return TokError("unexpected floating point literal"); + else if (IntVal != 0 || isNegative) + return TokError("expected floating-point constant #0.0"); + Parser.Lex(); // Eat the token. + + Operands.push_back( + AArch64Operand::CreateToken("#0", false, S, getContext())); + Operands.push_back( + AArch64Operand::CreateToken(".0", false, S, getContext())); + return false; + } + + const MCExpr *ImmVal; + if (parseSymbolicImmVal(ImmVal)) + return true; + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext())); + return false; + } + case AsmToken::Equal: { + SMLoc Loc = Parser.getTok().getLoc(); + if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. ldr r0, =val) + return Error(Loc, "unexpected token in operand"); + Parser.Lex(); // Eat '=' + const MCExpr *SubExprVal; + if (getParser().parseExpression(SubExprVal)) + return true; + + if (Operands.size() < 2 || + !static_cast<AArch64Operand &>(*Operands[1]).isReg()) + return Error(Loc, "Only valid when first operand is register"); + + bool IsXReg = + AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Operands[1]->getReg()); + + MCContext& Ctx = getContext(); + E = SMLoc::getFromPointer(Loc.getPointer() - 1); + // If the op is an imm and can be fit into a mov, then replace ldr with mov. + if (isa<MCConstantExpr>(SubExprVal)) { + uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue(); + uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 
48 : 16; + while(Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) { + ShiftAmt += 16; + Imm >>= 16; + } + if (ShiftAmt <= MaxShiftAmt && Imm <= 0xFFFF) { + Operands[0] = AArch64Operand::CreateToken("movz", false, Loc, Ctx); + Operands.push_back(AArch64Operand::CreateImm( + MCConstantExpr::create(Imm, Ctx), S, E, Ctx)); + if (ShiftAmt) + Operands.push_back(AArch64Operand::CreateShiftExtend(AArch64_AM::LSL, + ShiftAmt, true, S, E, Ctx)); + return false; + } + APInt Simm = APInt(64, Imm << ShiftAmt); + // check if the immediate is an unsigned or signed 32-bit int for W regs + if (!IsXReg && !(Simm.isIntN(32) || Simm.isSignedIntN(32))) + return Error(Loc, "Immediate too large for register"); + } + // If it is a label or an imm that cannot fit in a movz, put it into CP. + const MCExpr *CPLoc = + getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc); + Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx)); + return false; + } + } +} + +/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its +/// operands. +bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + Name = StringSwitch<StringRef>(Name.lower()) + .Case("beq", "b.eq") + .Case("bne", "b.ne") + .Case("bhs", "b.hs") + .Case("bcs", "b.cs") + .Case("blo", "b.lo") + .Case("bcc", "b.cc") + .Case("bmi", "b.mi") + .Case("bpl", "b.pl") + .Case("bvs", "b.vs") + .Case("bvc", "b.vc") + .Case("bhi", "b.hi") + .Case("bls", "b.ls") + .Case("bge", "b.ge") + .Case("blt", "b.lt") + .Case("bgt", "b.gt") + .Case("ble", "b.le") + .Case("bal", "b.al") + .Case("bnv", "b.nv") + .Default(Name); + + // First check for the AArch64-specific .req directive. + if (Parser.getTok().is(AsmToken::Identifier) && + Parser.getTok().getIdentifier() == ".req") { + parseDirectiveReq(Name, NameLoc); + // We always return 'error' for this, as we're done with this + // statement and don't need to match the 'instruction." + return true; + } + + // Create the leading tokens for the mnemonic, split by '.' characters. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + + // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction. + if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") { + bool IsError = parseSysAlias(Head, NameLoc, Operands); + if (IsError && getLexer().isNot(AsmToken::EndOfStatement)) + Parser.eatToEndOfStatement(); + return IsError; + } + + Operands.push_back( + AArch64Operand::CreateToken(Head, false, NameLoc, getContext())); + Mnemonic = Head; + + // Handle condition codes for a branch mnemonic + if (Head == "b" && Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start + 1, Next); + + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data())); + AArch64CC::CondCode CC = parseCondCodeString(Head); + if (CC == AArch64CC::Invalid) + return Error(SuffixLoc, "invalid condition code"); + Operands.push_back( + AArch64Operand::CreateToken(".", true, SuffixLoc, getContext())); + Operands.push_back( + AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext())); + } + + // Add the remaining tokens in the mnemonic. 
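+ // For example (editorial annotation, not upstream code): for a short-form
+ // NEON mnemonic such as "fadd.2s", the ".2s" part is appended below as a
+ // suffix token after the "fadd" head token.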
+ while (Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start, Next); + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data()) + 1); + Operands.push_back( + AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext())); + } + + // Conditional compare instructions have a Condition Code operand, which needs + // to be parsed and an immediate operand created. + bool condCodeFourthOperand = + (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || + Head == "fccmpe" || Head == "fcsel" || Head == "csel" || + Head == "csinc" || Head == "csinv" || Head == "csneg"); + + // These instructions are aliases to some of the conditional select + // instructions. However, the condition code is inverted in the aliased + // instruction. + // + // FIXME: Is this the correct way to handle these? Or should the parser + // generate the aliased instructions directly? + bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); + bool condCodeThirdOperand = + (Head == "cinc" || Head == "cinv" || Head == "cneg"); + + // Read the remaining operands. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + // Read the first operand. + if (parseOperand(Operands, false, false)) { + Parser.eatToEndOfStatement(); + return true; + } + + unsigned N = 2; + while (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. + if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || + (N == 3 && condCodeThirdOperand) || + (N == 2 && condCodeSecondOperand), + condCodeSecondOperand || condCodeThirdOperand)) { + Parser.eatToEndOfStatement(); + return true; + } + + // After successfully parsing some operands there are two special cases to + // consider (i.e. notional operands not separated by commas). Both are due + // to memory specifiers: + // + An RBrac will end an address for load/store/prefetch + // + An '!' will indicate a pre-indexed operation. + // + // It's someone else's responsibility to make sure these tokens are sane + // in the given context! + if (Parser.getTok().is(AsmToken::RBrac)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("]", false, Loc, + getContext())); + Parser.Lex(); + } + + if (Parser.getTok().is(AsmToken::Exclaim)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("!", false, Loc, + getContext())); + Parser.Lex(); + } + + ++N; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = Parser.getTok().getLoc(); + Parser.eatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + + Parser.Lex(); // Consume the EndOfStatement + return false; +} + +// FIXME: This entire function is a giant hack to provide us with decent +// operand range validation/diagnostics until TableGen/MC can be extended +// to support autogeneration of this kind of validation. +bool AArch64AsmParser::validateInstruction(MCInst &Inst, + SmallVectorImpl<SMLoc> &Loc) { + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + // Check for indexed addressing modes w/ the base register being the + // same as a destination/source register or pair load where + // the Rt == Rt2. All of those are undefined behaviour. 
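+ // Editorial examples (not upstream code): "ldp x0, x0, [x1]" (Rt == Rt2) and
+ // "ldr x0, [x0], #8" (writeback base equals the transfer register) are both
+ // diagnosed by the checks below.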
+ switch (Inst.getOpcode()) { + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDP instruction, writeback base " + "is also a destination"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable LDP instruction, writeback base " + "is also a destination"); + // FALLTHROUGH + } + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::LDPXi: { + unsigned Rt = Inst.getOperand(0).getReg(); + unsigned Rt2 = Inst.getOperand(1).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPQpost: + case AArch64::LDPQpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPSWpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::STPDpost: + case AArch64::STPDpre: + case AArch64::STPQpost: + case AArch64::STPQpre: + case AArch64::STPSpost: + case AArch64::STPSpre: + case AArch64::STPWpost: + case AArch64::STPWpre: + case AArch64::STPXpost: + case AArch64::STPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STP instruction, writeback base " + "is also a source"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable STP instruction, writeback base " + "is also a source"); + break; + } + case AArch64::LDRBBpre: + case AArch64::LDRBpre: + case AArch64::LDRHHpre: + case AArch64::LDRHpre: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpre: + case AArch64::LDRWpre: + case AArch64::LDRXpre: + case AArch64::LDRBBpost: + case AArch64::LDRBpost: + case AArch64::LDRHHpost: + case AArch64::LDRHpost: + case AArch64::LDRSBWpost: + case AArch64::LDRSBXpost: + case AArch64::LDRSHWpost: + case AArch64::LDRSHXpost: + case AArch64::LDRSWpost: + case AArch64::LDRWpost: + case AArch64::LDRXpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDR instruction, writeback base " + "is also a source"); + break; + } + case AArch64::STRBBpost: + case AArch64::STRBpost: + case AArch64::STRHHpost: + case AArch64::STRHpost: + case AArch64::STRWpost: + case AArch64::STRXpost: + case AArch64::STRBBpre: + case AArch64::STRBpre: + case AArch64::STRHHpre: + case AArch64::STRHpre: + case AArch64::STRWpre: + case AArch64::STRXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STR instruction, writeback base " + "is also a source"); + break; + } + } + + // Now check immediate ranges. Separate from the above as there is overlap + // in the instructions being checked and this keeps the nested conditionals + // to a minimum. 
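+ // Editorial example (not upstream code): a ":lo12:" modifier, as in
+ // "add x0, x1, #:lo12:sym", is only accepted on the ADD forms checked below;
+ // on the SUB forms it falls through to "invalid immediate expression".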
+ switch (Inst.getOpcode()) { + case AArch64::ADDSWri: + case AArch64::ADDSXri: + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: + case AArch64::SUBWri: + case AArch64::SUBXri: { + // Annoyingly we can't do this in the isAddSubImm predicate, so there is + // some slight duplication here. + if (Inst.getOperand(2).isExpr()) { + const MCExpr *Expr = Inst.getOperand(2).getExpr(); + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + return Error(Loc[2], "invalid immediate expression"); + } + + // Only allow these with ADDXri. + if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && + Inst.getOpcode() == AArch64::ADDXri) + return false; + + // Only allow these with ADDXri/ADDWri + if ((ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) && + (Inst.getOpcode() == AArch64::ADDXri || + Inst.getOpcode() == AArch64::ADDWri)) + return false; + + // Don't allow expressions in the immediate field otherwise + return Error(Loc[2], "invalid immediate expression"); + } + return false; + } + default: + return false; + } +} + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { + switch (ErrCode) { + case Match_MissingFeature: + return Error(Loc, + "instruction requires a CPU feature not currently enabled"); + case Match_InvalidOperand: + return Error(Loc, "invalid operand for instruction"); + case Match_InvalidSuffix: + return Error(Loc, "invalid type suffix for instruction"); + case Match_InvalidCondCode: + return Error(Loc, "expected AArch64 condition code"); + case Match_AddSubRegExtendSmall: + return Error(Loc, + "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubRegExtendLarge: + return Error(Loc, + "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubSecondSource: + return Error(Loc, + "expected compatible register, symbol or integer in range [0, 4095]"); + case Match_LogicalSecondSource: + return Error(Loc, "expected compatible register or logical immediate"); + case Match_InvalidMovImm32Shift: + return Error(Loc, "expected 'lsl' with optional integer 0 or 16"); + case Match_InvalidMovImm64Shift: + return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48"); + case Match_AddSubRegShift32: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); + case Match_AddSubRegShift64: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); + case Match_InvalidFPImm: + return Error(Loc, + "expected compatible register or floating-point constant"); + case Match_InvalidMemoryIndexedSImm9: + return Error(Loc, "index must be an integer in range [-256, 255]."); + case Match_InvalidMemoryIndexed4SImm7: + return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); + case Match_InvalidMemoryIndexed8SImm7: + return Error(Loc, "index must be a multiple of 8 in range [-512, 504]."); + case Match_InvalidMemoryIndexed16SImm7: + return Error(Loc, "index must be a multiple of 
16 in range [-1024, 1008]."); + case Match_InvalidMemoryWExtend8: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0"); + case Match_InvalidMemoryWExtend16: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); + case Match_InvalidMemoryWExtend32: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); + case Match_InvalidMemoryWExtend64: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); + case Match_InvalidMemoryWExtend128: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4"); + case Match_InvalidMemoryXExtend8: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0"); + case Match_InvalidMemoryXExtend16: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); + case Match_InvalidMemoryXExtend32: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); + case Match_InvalidMemoryXExtend64: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); + case Match_InvalidMemoryXExtend128: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); + case Match_InvalidMemoryIndexed1: + return Error(Loc, "index must be an integer in range [0, 4095]."); + case Match_InvalidMemoryIndexed2: + return Error(Loc, "index must be a multiple of 2 in range [0, 8190]."); + case Match_InvalidMemoryIndexed4: + return Error(Loc, "index must be a multiple of 4 in range [0, 16380]."); + case Match_InvalidMemoryIndexed8: + return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); + case Match_InvalidMemoryIndexed16: + return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_1: + return Error(Loc, "immediate must be an integer in range [0, 1]."); + case Match_InvalidImm0_7: + return Error(Loc, "immediate must be an integer in range [0, 7]."); + case Match_InvalidImm0_15: + return Error(Loc, "immediate must be an integer in range [0, 15]."); + case Match_InvalidImm0_31: + return Error(Loc, "immediate must be an integer in range [0, 31]."); + case Match_InvalidImm0_63: + return Error(Loc, "immediate must be an integer in range [0, 63]."); + case Match_InvalidImm0_127: + return Error(Loc, "immediate must be an integer in range [0, 127]."); + case Match_InvalidImm0_65535: + return Error(Loc, "immediate must be an integer in range [0, 65535]."); + case Match_InvalidImm1_8: + return Error(Loc, "immediate must be an integer in range [1, 8]."); + case Match_InvalidImm1_16: + return Error(Loc, "immediate must be an integer in range [1, 16]."); + case Match_InvalidImm1_32: + return Error(Loc, "immediate must be an integer in range [1, 32]."); + case Match_InvalidImm1_64: + return Error(Loc, "immediate must be an integer in range [1, 64]."); + case Match_InvalidIndex1: + return Error(Loc, "expected lane specifier '[1]'"); + case Match_InvalidIndexB: + return Error(Loc, "vector lane must be an integer in range [0, 15]."); + case Match_InvalidIndexH: + return Error(Loc, "vector lane must be an integer in range [0, 7]."); + case Match_InvalidIndexS: + return Error(Loc, "vector lane must be an integer in range [0, 3]."); + case Match_InvalidIndexD: + return Error(Loc, "vector lane must be an integer in range [0, 1]."); + case Match_InvalidLabel: + return Error(Loc, "expected label or encodable integer pc offset"); + case Match_MRS: + return Error(Loc, "expected readable system register"); + case Match_MSR: + 
return Error(Loc, "expected writable system register or pstate"); + case Match_MnemonicFail: + return Error(Loc, "unrecognized instruction mnemonic"); + default: + llvm_unreachable("unexpected error code!"); + } +} + +static const char *getSubtargetFeatureName(uint64_t Val); + +bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + + StringRef Tok = Op.getToken(); + unsigned NumOperands = Operands.size(); + + if (NumOperands == 4 && Tok == "lsl") { + AArch64Operand &Op2 = static_cast<AArch64Operand &>(*Operands[2]); + AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]); + if (Op2.isReg() && Op3.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm()); + if (Op3CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t NewOp3Val = 0; + uint64_t NewOp4Val = 0; + if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains( + Op2.getReg())) { + NewOp3Val = (32 - Op3Val) & 0x1f; + NewOp4Val = 31 - Op3Val; + } else { + NewOp3Val = (64 - Op3Val) & 0x3f; + NewOp4Val = 63 - Op3Val; + } + + const MCExpr *NewOp3 = MCConstantExpr::create(NewOp3Val, getContext()); + const MCExpr *NewOp4 = MCConstantExpr::create(NewOp4Val, getContext()); + + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op.getStartLoc(), getContext()); + Operands.push_back(AArch64Operand::CreateImm( + NewOp4, Op3.getStartLoc(), Op3.getEndLoc(), getContext())); + Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3.getStartLoc(), + Op3.getEndLoc(), getContext()); + } + } + } else if (NumOperands == 4 && Tok == "bfc") { + // FIXME: Horrible hack to handle BFC->BFM alias. + AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]); + AArch64Operand LSBOp = static_cast<AArch64Operand &>(*Operands[2]); + AArch64Operand WidthOp = static_cast<AArch64Operand &>(*Operands[3]); + + if (Op1.isReg() && LSBOp.isImm() && WidthOp.isImm()) { + const MCConstantExpr *LSBCE = dyn_cast<MCConstantExpr>(LSBOp.getImm()); + const MCConstantExpr *WidthCE = dyn_cast<MCConstantExpr>(WidthOp.getImm()); + + if (LSBCE && WidthCE) { + uint64_t LSB = LSBCE->getValue(); + uint64_t Width = WidthCE->getValue(); + + uint64_t RegWidth = 0; + if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op1.getReg())) + RegWidth = 64; + else + RegWidth = 32; + + if (LSB >= RegWidth) + return Error(LSBOp.getStartLoc(), + "expected integer in range [0, 31]"); + if (Width < 1 || Width > RegWidth) + return Error(WidthOp.getStartLoc(), + "expected integer in range [1, 32]"); + + uint64_t ImmR = 0; + if (RegWidth == 32) + ImmR = (32 - LSB) & 0x1f; + else + ImmR = (64 - LSB) & 0x3f; + + uint64_t ImmS = Width - 1; + + if (ImmR != 0 && ImmS >= ImmR) + return Error(WidthOp.getStartLoc(), + "requested insert overflows register"); + + const MCExpr *ImmRExpr = MCConstantExpr::create(ImmR, getContext()); + const MCExpr *ImmSExpr = MCConstantExpr::create(ImmS, getContext()); + Operands[0] = AArch64Operand::CreateToken( + "bfm", false, Op.getStartLoc(), getContext()); + Operands[2] = AArch64Operand::CreateReg( + RegWidth == 32 ? 
AArch64::WZR : AArch64::XZR, false, SMLoc(), + SMLoc(), getContext()); + Operands[3] = AArch64Operand::CreateImm( + ImmRExpr, LSBOp.getStartLoc(), LSBOp.getEndLoc(), getContext()); + Operands.emplace_back( + AArch64Operand::CreateImm(ImmSExpr, WidthOp.getStartLoc(), + WidthOp.getEndLoc(), getContext())); + } + } + } else if (NumOperands == 5) { + // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and + // UBFIZ -> UBFM aliases. + if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") { + AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]); + AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]); + AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]); + + if (Op1.isReg() && Op3.isImm() && Op4.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm()); + const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm()); + + if (Op3CE && Op4CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t Op4Val = Op4CE->getValue(); + + uint64_t RegWidth = 0; + if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op1.getReg())) + RegWidth = 64; + else + RegWidth = 32; + + if (Op3Val >= RegWidth) + return Error(Op3.getStartLoc(), + "expected integer in range [0, 31]"); + if (Op4Val < 1 || Op4Val > RegWidth) + return Error(Op4.getStartLoc(), + "expected integer in range [1, 32]"); + + uint64_t NewOp3Val = 0; + if (RegWidth == 32) + NewOp3Val = (32 - Op3Val) & 0x1f; + else + NewOp3Val = (64 - Op3Val) & 0x3f; + + uint64_t NewOp4Val = Op4Val - 1; + + if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val) + return Error(Op4.getStartLoc(), + "requested insert overflows register"); + + const MCExpr *NewOp3 = + MCConstantExpr::create(NewOp3Val, getContext()); + const MCExpr *NewOp4 = + MCConstantExpr::create(NewOp4Val, getContext()); + Operands[3] = AArch64Operand::CreateImm( + NewOp3, Op3.getStartLoc(), Op3.getEndLoc(), getContext()); + Operands[4] = AArch64Operand::CreateImm( + NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext()); + if (Tok == "bfi") + Operands[0] = AArch64Operand::CreateToken( + "bfm", false, Op.getStartLoc(), getContext()); + else if (Tok == "sbfiz") + Operands[0] = AArch64Operand::CreateToken( + "sbfm", false, Op.getStartLoc(), getContext()); + else if (Tok == "ubfiz") + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op.getStartLoc(), getContext()); + else + llvm_unreachable("No valid mnemonic for alias?"); + } + } + + // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and + // UBFX -> UBFM aliases. 
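+ // Editorial example (not upstream code): "sbfx x0, x1, #4, #8" is rewritten
+ // below as "sbfm x0, x1, #4, #11", i.e. the width operand becomes
+ // lsb + width - 1.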
+ } else if (NumOperands == 5 && + (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) { + AArch64Operand &Op1 = static_cast<AArch64Operand &>(*Operands[1]); + AArch64Operand &Op3 = static_cast<AArch64Operand &>(*Operands[3]); + AArch64Operand &Op4 = static_cast<AArch64Operand &>(*Operands[4]); + + if (Op1.isReg() && Op3.isImm() && Op4.isImm()) { + const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3.getImm()); + const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4.getImm()); + + if (Op3CE && Op4CE) { + uint64_t Op3Val = Op3CE->getValue(); + uint64_t Op4Val = Op4CE->getValue(); + + uint64_t RegWidth = 0; + if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op1.getReg())) + RegWidth = 64; + else + RegWidth = 32; + + if (Op3Val >= RegWidth) + return Error(Op3.getStartLoc(), + "expected integer in range [0, 31]"); + if (Op4Val < 1 || Op4Val > RegWidth) + return Error(Op4.getStartLoc(), + "expected integer in range [1, 32]"); + + uint64_t NewOp4Val = Op3Val + Op4Val - 1; + + if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val) + return Error(Op4.getStartLoc(), + "requested extract overflows register"); + + const MCExpr *NewOp4 = + MCConstantExpr::create(NewOp4Val, getContext()); + Operands[4] = AArch64Operand::CreateImm( + NewOp4, Op4.getStartLoc(), Op4.getEndLoc(), getContext()); + if (Tok == "bfxil") + Operands[0] = AArch64Operand::CreateToken( + "bfm", false, Op.getStartLoc(), getContext()); + else if (Tok == "sbfx") + Operands[0] = AArch64Operand::CreateToken( + "sbfm", false, Op.getStartLoc(), getContext()); + else if (Tok == "ubfx") + Operands[0] = AArch64Operand::CreateToken( + "ubfm", false, Op.getStartLoc(), getContext()); + else + llvm_unreachable("No valid mnemonic for alias?"); + } + } + } + } + // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands. + // InstAlias can't quite handle this since the reg classes aren't + // subclasses. + if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) { + // The source register can be Wn here, but the matcher expects a + // GPR64. Twiddle it here if necessary. + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]); + if (Op.isReg()) { + unsigned Reg = getXRegFromWReg(Op.getReg()); + Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); + } + } + // FIXME: Likewise for sxt[bh] with a Xd dst operand + else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) { + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]); + if (Op.isReg() && + AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op.getReg())) { + // The source register can be Wn here, but the matcher expects a + // GPR64. Twiddle it here if necessary. + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[2]); + if (Op.isReg()) { + unsigned Reg = getXRegFromWReg(Op.getReg()); + Operands[2] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); + } + } + } + // FIXME: Likewise for uxt[bh] with a Xd dst operand + else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) { + AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]); + if (Op.isReg() && + AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( + Op.getReg())) { + // The source register can be Wn here, but the matcher expects a + // GPR32. Twiddle it here if necessary. 
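+ // Editorial example (not upstream code): "uxtb x0, w1" is remapped below so
+ // that it can match the 32-bit GPR form of UXTB.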
+ AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[1]); + if (Op.isReg()) { + unsigned Reg = getWRegFromXReg(Op.getReg()); + Operands[1] = AArch64Operand::CreateReg(Reg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); + } + } + } + + // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. + if (NumOperands == 3 && Tok == "fmov") { + AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]); + AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]); + if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { + unsigned zreg = + !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains( + RegOp.getReg()) + ? AArch64::WZR + : AArch64::XZR; + Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(), + Op.getEndLoc(), getContext()); + } + } + + MCInst Inst; + // First try to match against the secondary set of tables containing the + // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). + unsigned MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); + + // If that fails, try against the alternate table containing long-form NEON: + // "fadd v0.2s, v1.2s, v2.2s" + if (MatchResult != Match_Success) { + // But first, save the short-form match result: we can use it in case the + // long-form match also fails. + auto ShortFormNEONErrorInfo = ErrorInfo; + auto ShortFormNEONMatchResult = MatchResult; + + MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + + // Now, both matches failed, and the long-form match failed on the mnemonic + // suffix token operand. The short-form match failure is probably more + // relevant: use it instead. + if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 && + Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() && + ((AArch64Operand &)*Operands[1]).isTokenSuffix()) { + MatchResult = ShortFormNEONMatchResult; + ErrorInfo = ShortFormNEONErrorInfo; + } + } + + + switch (MatchResult) { + case Match_Success: { + // Perform range checking and other semantic validations + SmallVector<SMLoc, 8> OperandLocs; + NumOperands = Operands.size(); + for (unsigned i = 1; i < NumOperands; ++i) + OperandLocs.push_back(Operands[i]->getStartLoc()); + if (validateInstruction(Inst, OperandLocs)) + return true; + + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, getSTI()); + return false; + } + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing (neon, e.g.). + std::string Msg = "instruction requires:"; + uint64_t Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg); + } + case Match_MnemonicFail: + return showMatchError(IDLoc, MatchResult); + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + // If the match failed on a suffix token operand, tweak the diagnostic + // accordingly. 
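+ // Editorial example (not upstream code): a rejected mnemonic suffix such as
+ // a ".4h" used on an instruction form that does not allow it is reported as
+ // "invalid type suffix for instruction" rather than a generic operand error.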
+ if (((AArch64Operand &)*Operands[ErrorInfo]).isToken() && + ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix()) + MatchResult = Match_InvalidSuffix; + + return showMatchError(ErrorLoc, MatchResult); + } + case Match_InvalidMemoryIndexed1: + case Match_InvalidMemoryIndexed2: + case Match_InvalidMemoryIndexed4: + case Match_InvalidMemoryIndexed8: + case Match_InvalidMemoryIndexed16: + case Match_InvalidCondCode: + case Match_AddSubRegExtendSmall: + case Match_AddSubRegExtendLarge: + case Match_AddSubSecondSource: + case Match_LogicalSecondSource: + case Match_AddSubRegShift32: + case Match_AddSubRegShift64: + case Match_InvalidMovImm32Shift: + case Match_InvalidMovImm64Shift: + case Match_InvalidFPImm: + case Match_InvalidMemoryWExtend8: + case Match_InvalidMemoryWExtend16: + case Match_InvalidMemoryWExtend32: + case Match_InvalidMemoryWExtend64: + case Match_InvalidMemoryWExtend128: + case Match_InvalidMemoryXExtend8: + case Match_InvalidMemoryXExtend16: + case Match_InvalidMemoryXExtend32: + case Match_InvalidMemoryXExtend64: + case Match_InvalidMemoryXExtend128: + case Match_InvalidMemoryIndexed4SImm7: + case Match_InvalidMemoryIndexed8SImm7: + case Match_InvalidMemoryIndexed16SImm7: + case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_1: + case Match_InvalidImm0_7: + case Match_InvalidImm0_15: + case Match_InvalidImm0_31: + case Match_InvalidImm0_63: + case Match_InvalidImm0_127: + case Match_InvalidImm0_65535: + case Match_InvalidImm1_8: + case Match_InvalidImm1_16: + case Match_InvalidImm1_32: + case Match_InvalidImm1_64: + case Match_InvalidIndex1: + case Match_InvalidIndexB: + case Match_InvalidIndexH: + case Match_InvalidIndexS: + case Match_InvalidIndexD: + case Match_InvalidLabel: + case Match_MSR: + case Match_MRS: { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + // Any time we get here, there's nothing fancy to do. Just get the + // operand SMLoc and display the diagnostic. 
+ SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return showMatchError(ErrorLoc, MatchResult); + } + } + + llvm_unreachable("Implement any new match types added!"); +} + +/// ParseDirective parses the arm specific directives +bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { + const MCObjectFileInfo::Environment Format = + getContext().getObjectFileInfo()->getObjectFileType(); + bool IsMachO = Format == MCObjectFileInfo::IsMachO; + bool IsCOFF = Format == MCObjectFileInfo::IsCOFF; + + StringRef IDVal = DirectiveID.getIdentifier(); + SMLoc Loc = DirectiveID.getLoc(); + if (IDVal == ".hword") + return parseDirectiveWord(2, Loc); + if (IDVal == ".word") + return parseDirectiveWord(4, Loc); + if (IDVal == ".xword") + return parseDirectiveWord(8, Loc); + if (IDVal == ".tlsdesccall") + return parseDirectiveTLSDescCall(Loc); + if (IDVal == ".ltorg" || IDVal == ".pool") + return parseDirectiveLtorg(Loc); + if (IDVal == ".unreq") + return parseDirectiveUnreq(Loc); + + if (!IsMachO && !IsCOFF) { + if (IDVal == ".inst") + return parseDirectiveInst(Loc); + } + + return parseDirectiveLOH(IDVal, Loc); +} + +/// parseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { + MCAsmParser &Parser = getParser(); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size, L); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +/// parseDirectiveInst +/// ::= .inst opcode [, ...] +bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) { + MCAsmParser &Parser = getParser(); + if (getLexer().is(AsmToken::EndOfStatement)) { + Parser.eatToEndOfStatement(); + Error(Loc, "expected expression following directive"); + return false; + } + + for (;;) { + const MCExpr *Expr; + + if (getParser().parseExpression(Expr)) { + Error(Loc, "expected expression"); + return false; + } + + const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr); + if (!Value) { + Error(Loc, "expected constant expression"); + return false; + } + + getTargetStreamer().emitInst(Value->getValue()); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + if (getLexer().isNot(AsmToken::Comma)) { + Error(Loc, "unexpected token in directive"); + return false; + } + + Parser.Lex(); // Eat comma. + } + + Parser.Lex(); + return false; +} + +// parseDirectiveTLSDescCall: +// ::= .tlsdesccall symbol +bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { + StringRef Name; + if (getParser().parseIdentifier(Name)) + return Error(L, "expected symbol after directive"); + + MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext()); + Expr = AArch64MCExpr::create(Expr, AArch64MCExpr::VK_TLSDESC, getContext()); + + MCInst Inst; + Inst.setOpcode(AArch64::TLSDESCCALL); + Inst.addOperand(MCOperand::createExpr(Expr)); + + getParser().getStreamer().EmitInstruction(Inst, getSTI()); + return false; +} + +/// ::= .loh <lohName | lohId> label1, ..., labelN +/// The number of arguments depends on the loh identifier. 
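+/// Editorial example (not upstream code):
+///   .loh AdrpAdd Lpage, Loff
+/// uses the "AdrpAdd" kind, which is assumed here to take two label arguments.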
+bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { + if (IDVal != MCLOHDirectiveName()) + return true; + MCLOHType Kind; + if (getParser().getTok().isNot(AsmToken::Identifier)) { + if (getParser().getTok().isNot(AsmToken::Integer)) + return TokError("expected an identifier or a number in directive"); + // We successfully get a numeric value for the identifier. + // Check if it is valid. + int64_t Id = getParser().getTok().getIntVal(); + if (Id <= -1U && !isValidMCLOHType(Id)) + return TokError("invalid numeric identifier in directive"); + Kind = (MCLOHType)Id; + } else { + StringRef Name = getTok().getIdentifier(); + // We successfully parse an identifier. + // Check if it is a recognized one. + int Id = MCLOHNameToId(Name); + + if (Id == -1) + return TokError("invalid identifier in directive"); + Kind = (MCLOHType)Id; + } + // Consume the identifier. + Lex(); + // Get the number of arguments of this LOH. + int NbArgs = MCLOHIdToNbArgs(Kind); + + assert(NbArgs != -1 && "Invalid number of arguments"); + + SmallVector<MCSymbol *, 3> Args; + for (int Idx = 0; Idx < NbArgs; ++Idx) { + StringRef Name; + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + Args.push_back(getContext().getOrCreateSymbol(Name)); + + if (Idx + 1 == NbArgs) + break; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); + Lex(); + } + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); + + getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); + return false; +} + +/// parseDirectiveLtorg +/// ::= .ltorg | .pool +bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) { + getTargetStreamer().emitCurrentConstantPool(); + return false; +} + +/// parseDirectiveReq +/// ::= name .req registername +bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { + MCAsmParser &Parser = getParser(); + Parser.Lex(); // Eat the '.req' token. + SMLoc SRegLoc = getLoc(); + unsigned RegNum = tryParseRegister(); + bool IsVector = false; + + if (RegNum == static_cast<unsigned>(-1)) { + StringRef Kind; + RegNum = tryMatchVectorRegister(Kind, false); + if (!Kind.empty()) { + Error(SRegLoc, "vector register without type specifier expected"); + return false; + } + IsVector = true; + } + + if (RegNum == static_cast<unsigned>(-1)) { + Parser.eatToEndOfStatement(); + Error(SRegLoc, "register name or alias expected"); + return false; + } + + // Shouldn't be anything else. + if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { + Error(Parser.getTok().getLoc(), "unexpected input in .req directive"); + Parser.eatToEndOfStatement(); + return false; + } + + Parser.Lex(); // Consume the EndOfStatement + + auto pair = std::make_pair(IsVector, RegNum); + if (RegisterReqs.insert(std::make_pair(Name, pair)).first->second != pair) + Warning(L, "ignoring redefinition of register alias '" + Name + "'"); + + return true; +} + +/// parseDirectiveUneq +/// ::= .unreq registername +bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) { + MCAsmParser &Parser = getParser(); + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive."); + Parser.eatToEndOfStatement(); + return false; + } + RegisterReqs.erase(Parser.getTok().getIdentifier().lower()); + Parser.Lex(); // Eat the identifier. 
+ return false; +} + +bool +AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::VariantKind &ELFRefKind, + MCSymbolRefExpr::VariantKind &DarwinRefKind, + int64_t &Addend) { + ELFRefKind = AArch64MCExpr::VK_INVALID; + DarwinRefKind = MCSymbolRefExpr::VK_None; + Addend = 0; + + if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) { + ELFRefKind = AE->getKind(); + Expr = AE->getSubExpr(); + } + + const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr); + if (SE) { + // It's a simple symbol reference with no addend. + DarwinRefKind = SE->getKind(); + return true; + } + + const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr); + if (!BE) + return false; + + SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS()); + if (!SE) + return false; + DarwinRefKind = SE->getKind(); + + if (BE->getOpcode() != MCBinaryExpr::Add && + BE->getOpcode() != MCBinaryExpr::Sub) + return false; + + // See if the addend is is a constant, otherwise there's more going + // on here than we can deal with. + auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS()); + if (!AddendExpr) + return false; + + Addend = AddendExpr->getValue(); + if (BE->getOpcode() == MCBinaryExpr::Sub) + Addend = -Addend; + + // It's some symbol reference + a constant addend, but really + // shouldn't use both Darwin and ELF syntax. + return ELFRefKind == AArch64MCExpr::VK_INVALID || + DarwinRefKind == MCSymbolRefExpr::VK_None; +} + +/// Force static initialization. +extern "C" void LLVMInitializeAArch64AsmParser() { + RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget); + RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget); + RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64Target); +} + +#define GET_REGISTER_MATCHER +#define GET_SUBTARGET_FEATURE_NAME +#define GET_MATCHER_IMPLEMENTATION +#include "AArch64GenAsmMatcher.inc" + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. +unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + AArch64Operand &Op = static_cast<AArch64Operand &>(AsmOp); + // If the kind is a token for a literal immediate, check if our asm + // operand matches. This is for InstAliases which have a fixed-value + // immediate in the syntax. 
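+ // Editorial example (not upstream code): the match classes below stand for
+ // literal tokens such as "#8"; an operand matches MCK__35_8 only if it is a
+ // constant immediate equal to 8.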
+ int64_t ExpectedVal; + switch (Kind) { + default: + return Match_InvalidOperand; + case MCK__35_0: + ExpectedVal = 0; + break; + case MCK__35_1: + ExpectedVal = 1; + break; + case MCK__35_12: + ExpectedVal = 12; + break; + case MCK__35_16: + ExpectedVal = 16; + break; + case MCK__35_2: + ExpectedVal = 2; + break; + case MCK__35_24: + ExpectedVal = 24; + break; + case MCK__35_3: + ExpectedVal = 3; + break; + case MCK__35_32: + ExpectedVal = 32; + break; + case MCK__35_4: + ExpectedVal = 4; + break; + case MCK__35_48: + ExpectedVal = 48; + break; + case MCK__35_6: + ExpectedVal = 6; + break; + case MCK__35_64: + ExpectedVal = 64; + break; + case MCK__35_8: + ExpectedVal = 8; + break; + } + if (!Op.isImm()) + return Match_InvalidOperand; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op.getImm()); + if (!CE) + return Match_InvalidOperand; + if (CE->getValue() == ExpectedVal) + return Match_Success; + return Match_InvalidOperand; +} + + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseGPRSeqPair(OperandVector &Operands) { + + SMLoc S = getLoc(); + + if (getParser().getTok().isNot(AsmToken::Identifier)) { + Error(S, "expected register"); + return MatchOperand_ParseFail; + } + + int FirstReg = tryParseRegister(); + if (FirstReg == -1) { + return MatchOperand_ParseFail; + } + const MCRegisterClass &WRegClass = + AArch64MCRegisterClasses[AArch64::GPR32RegClassID]; + const MCRegisterClass &XRegClass = + AArch64MCRegisterClasses[AArch64::GPR64RegClassID]; + + bool isXReg = XRegClass.contains(FirstReg), + isWReg = WRegClass.contains(FirstReg); + if (!isXReg && !isWReg) { + Error(S, "expected first even register of a " + "consecutive same-size even/odd register pair"); + return MatchOperand_ParseFail; + } + + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + unsigned FirstEncoding = RI->getEncodingValue(FirstReg); + + if (FirstEncoding & 0x1) { + Error(S, "expected first even register of a " + "consecutive same-size even/odd register pair"); + return MatchOperand_ParseFail; + } + + SMLoc M = getLoc(); + if (getParser().getTok().isNot(AsmToken::Comma)) { + Error(M, "expected comma"); + return MatchOperand_ParseFail; + } + // Eat the comma + getParser().Lex(); + + SMLoc E = getLoc(); + int SecondReg = tryParseRegister(); + if (SecondReg ==-1) { + return MatchOperand_ParseFail; + } + + if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 || + (isXReg && !XRegClass.contains(SecondReg)) || + (isWReg && !WRegClass.contains(SecondReg))) { + Error(E,"expected second odd register of a " + "consecutive same-size even/odd register pair"); + return MatchOperand_ParseFail; + } + + unsigned Pair = 0; + if(isXReg) { + Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube64, + &AArch64MCRegisterClasses[AArch64::XSeqPairsClassRegClassID]); + } else { + Pair = RI->getMatchingSuperReg(FirstReg, AArch64::sube32, + &AArch64MCRegisterClasses[AArch64::WSeqPairsClassRegClassID]); + } + + Operands.push_back(AArch64Operand::CreateReg(Pair, false, S, getLoc(), + getContext())); + + return MatchOperand_Success; +} diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp new file mode 100644 index 0000000..f1f968e --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -0,0 +1,1589 @@ +//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University 
of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "AArch64Disassembler.h" +#include "AArch64ExternalSymbolizer.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-disassembler" + +// Pull DecodeStatus and its enum values into the global namespace. +typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; + +// Forward declare these because the autogenerated code will reference them. +// Definitions are further down. +static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void 
*Decoder); +static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Addr, + 
const void *Decoder); +static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Addr, + const void *Decoder); + +static bool Check(DecodeStatus &Out, DecodeStatus In) { + switch (In) { + case MCDisassembler::Success: + // Out stays the same. + return true; + case MCDisassembler::SoftFail: + Out = In; + return true; + case MCDisassembler::Fail: + Out = In; + return false; + } + llvm_unreachable("Invalid DecodeStatus!"); +} + +#include "AArch64GenDisassemblerTables.inc" +#include "AArch64GenInstrInfo.inc" + +#define Success llvm::MCDisassembler::Success +#define Fail llvm::MCDisassembler::Fail +#define SoftFail llvm::MCDisassembler::SoftFail + +static MCDisassembler *createAArch64Disassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64Disassembler(STI, Ctx); +} + +DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &OS, + raw_ostream &CS) const { + CommentStream = &CS; + + Size = 0; + // We want to read exactly 4 bytes of data. + if (Bytes.size() < 4) + return Fail; + Size = 4; + + // Encoded as a small-endian 32-bit word in the stream. + uint32_t Insn = + (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); + + // Calling the auto-generated decoder function. + return decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI); +} + +static MCSymbolizer * +createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo, MCContext *Ctx, + std::unique_ptr<MCRelocationInfo> &&RelInfo) { + return new llvm::AArch64ExternalSymbolizer(*Ctx, move(RelInfo), GetOpInfo, + SymbolLookUp, DisInfo); +} + +extern "C" void LLVMInitializeAArch64Disassembler() { + TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget, + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget, + createAArch64ExternalSymbolizer); + + TargetRegistry::RegisterMCDisassembler(TheARM64Target, + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(TheARM64Target, + createAArch64ExternalSymbolizer); +} + +static const unsigned FPR128DecoderTable[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, + AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14, + AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19, + AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24, + AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29, + AArch64::Q30, AArch64::Q31 +}; + +static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR128DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 15) + return Fail; + return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); +} + +static const unsigned FPR64DecoderTable[] = { + AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4, + AArch64::D5, AArch64::D6, AArch64::D7, 
AArch64::D8, AArch64::D9, + AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14, + AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19, + AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24, + AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29, + AArch64::D30, AArch64::D31 +}; + +static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned FPR32DecoderTable[] = { + AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4, + AArch64::S5, AArch64::S6, AArch64::S7, AArch64::S8, AArch64::S9, + AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14, + AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19, + AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24, + AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29, + AArch64::S30, AArch64::S31 +}; + +static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR32DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned FPR16DecoderTable[] = { + AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4, + AArch64::H5, AArch64::H6, AArch64::H7, AArch64::H8, AArch64::H9, + AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14, + AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19, + AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24, + AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29, + AArch64::H30, AArch64::H31 +}; + +static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR16DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned FPR8DecoderTable[] = { + AArch64::B0, AArch64::B1, AArch64::B2, AArch64::B3, AArch64::B4, + AArch64::B5, AArch64::B6, AArch64::B7, AArch64::B8, AArch64::B9, + AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14, + AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19, + AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24, + AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29, + AArch64::B30, AArch64::B31 +}; + +static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR8DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned GPR64DecoderTable[] = { + AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, + AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9, + AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14, + AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19, + AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24, + AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP, + AArch64::LR, AArch64::XZR +}; + +static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + 
const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = GPR64DecoderTable[RegNo]; + if (Register == AArch64::XZR) + Register = AArch64::SP; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned GPR32DecoderTable[] = { + AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4, + AArch64::W5, AArch64::W6, AArch64::W7, AArch64::W8, AArch64::W9, + AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14, + AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19, + AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24, + AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29, + AArch64::W30, AArch64::WZR +}; + +static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR32DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR32DecoderTable[RegNo]; + if (Register == AArch64::WZR) + Register = AArch64::WSP; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned VectorDecoderTable[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, + AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14, + AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19, + AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24, + AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29, + AArch64::Q30, AArch64::Q31 +}; + +static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = VectorDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned QQDecoderTable[] = { + AArch64::Q0_Q1, AArch64::Q1_Q2, AArch64::Q2_Q3, AArch64::Q3_Q4, + AArch64::Q4_Q5, AArch64::Q5_Q6, AArch64::Q6_Q7, AArch64::Q7_Q8, + AArch64::Q8_Q9, AArch64::Q9_Q10, AArch64::Q10_Q11, AArch64::Q11_Q12, + AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16, + AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20, + AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24, + AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28, + AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0 +}; + +static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned QQQDecoderTable[] = { + AArch64::Q0_Q1_Q2, AArch64::Q1_Q2_Q3, AArch64::Q2_Q3_Q4, + AArch64::Q3_Q4_Q5, AArch64::Q4_Q5_Q6, AArch64::Q5_Q6_Q7, + AArch64::Q6_Q7_Q8, AArch64::Q7_Q8_Q9, AArch64::Q8_Q9_Q10, + AArch64::Q9_Q10_Q11, 
AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13, + AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16, + AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19, + AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22, + AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25, + AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28, + AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31, + AArch64::Q30_Q31_Q0, AArch64::Q31_Q0_Q1 +}; + +static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned QQQQDecoderTable[] = { + AArch64::Q0_Q1_Q2_Q3, AArch64::Q1_Q2_Q3_Q4, AArch64::Q2_Q3_Q4_Q5, + AArch64::Q3_Q4_Q5_Q6, AArch64::Q4_Q5_Q6_Q7, AArch64::Q5_Q6_Q7_Q8, + AArch64::Q6_Q7_Q8_Q9, AArch64::Q7_Q8_Q9_Q10, AArch64::Q8_Q9_Q10_Q11, + AArch64::Q9_Q10_Q11_Q12, AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14, + AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17, + AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20, + AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23, + AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26, + AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29, + AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0, + AArch64::Q30_Q31_Q0_Q1, AArch64::Q31_Q0_Q1_Q2 +}; + +static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQQQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned DDDecoderTable[] = { + AArch64::D0_D1, AArch64::D1_D2, AArch64::D2_D3, AArch64::D3_D4, + AArch64::D4_D5, AArch64::D5_D6, AArch64::D6_D7, AArch64::D7_D8, + AArch64::D8_D9, AArch64::D9_D10, AArch64::D10_D11, AArch64::D11_D12, + AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16, + AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20, + AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24, + AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28, + AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0 +}; + +static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned DDDDecoderTable[] = { + AArch64::D0_D1_D2, AArch64::D1_D2_D3, AArch64::D2_D3_D4, + AArch64::D3_D4_D5, AArch64::D4_D5_D6, AArch64::D5_D6_D7, + AArch64::D6_D7_D8, AArch64::D7_D8_D9, AArch64::D8_D9_D10, + AArch64::D9_D10_D11, AArch64::D10_D11_D12, AArch64::D11_D12_D13, + AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16, + AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19, + AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22, + AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25, + AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28, + AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31, + AArch64::D30_D31_D0, AArch64::D31_D0_D1 +}; + +static 
DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static const unsigned DDDDDecoderTable[] = { + AArch64::D0_D1_D2_D3, AArch64::D1_D2_D3_D4, AArch64::D2_D3_D4_D5, + AArch64::D3_D4_D5_D6, AArch64::D4_D5_D6_D7, AArch64::D5_D6_D7_D8, + AArch64::D6_D7_D8_D9, AArch64::D7_D8_D9_D10, AArch64::D8_D9_D10_D11, + AArch64::D9_D10_D11_D12, AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14, + AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17, + AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20, + AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23, + AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26, + AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29, + AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0, + AArch64::D30_D31_D0_D1, AArch64::D31_D0_D1_D2 +}; + +static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + // scale{5} is asserted as 1 in tblgen. + Imm |= 0x20; + Inst.addOperand(MCOperand::createImm(64 - Imm)); + return Success; +} + +static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + Inst.addOperand(MCOperand::createImm(64 - Imm)); + return Success; +} + +static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + int64_t ImmVal = Imm; + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend 19-bit immediate. + if (ImmVal & (1 << (19 - 1))) + ImmVal |= ~((1LL << 19) - 1); + + if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal * 4, Addr, + Inst.getOpcode() != AArch64::LDRXl, 0, 4)) + Inst.addOperand(MCOperand::createImm(ImmVal)); + return Success; +} + +static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::createImm((Imm >> 1) & 1)); + Inst.addOperand(MCOperand::createImm(Imm & 1)); + return Success; +} + +static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::createImm(Imm)); + + // Every system register in the encoding space is valid with the syntax + // S<op0>_<op1>_<Cn>_<Cm>_<op2>, so decoding system registers always succeeds. + return Success; +} + +static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::createImm(Imm)); + + return Success; +} + +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + // This decoder exists to add the dummy Lane operand to the MCInst, which must + // be 1 in assembly but has no other real manifestation. 
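+  // For example, "fmov x5, v3.d[1]" and "fmov v3.d[1], x5" move the upper
+  // 64-bit lane of a vector register to or from an X register; the lane index
+  // is always 1, so a constant immediate operand is appended below.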
+ unsigned Rd = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); + + if (IsToVec) { + DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); + } else { + DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); + DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); + } + + // Add the lane + Inst.addOperand(MCOperand::createImm(1)); + + return Success; +} + +static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, + unsigned Add) { + Inst.addOperand(MCOperand::createImm(Add - Imm)); + return Success; +} + +static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, + unsigned Add) { + Inst.addOperand(MCOperand::createImm((Imm + Add) & (Add - 1))); + return Success; +} + +static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 64); +} + +static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); +} + +static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 32); +} + +static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); +} + +static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 16); +} + +static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); +} + +static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 8); +} + +static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 64); +} + +static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 32); +} + +static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 16); +} + +static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 8); +} + +static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned shiftHi = fieldFromInstruction(insn, 22, 2); + unsigned shiftLo = fieldFromInstruction(insn, 10, 6); + unsigned shift = (shiftHi << 6) | shiftLo; + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrs: + case AArch64::ADDSWrs: + case AArch64::SUBWrs: + case AArch64::SUBSWrs: + // if shift == '11' then ReservedValue() + if (shiftHi == 0x3) + return Fail; + // Deliberate fallthrough + case AArch64::ANDWrs: + case AArch64::ANDSWrs: + case AArch64::BICWrs: + case AArch64::BICSWrs: + case AArch64::ORRWrs: + case AArch64::ORNWrs: + 
case AArch64::EORWrs: + case AArch64::EONWrs: { + // if sf == '0' and imm6<5> == '1' then ReservedValue() + if (shiftLo >> 5 == 1) + return Fail; + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + case AArch64::ADDXrs: + case AArch64::ADDSXrs: + case AArch64::SUBXrs: + case AArch64::SUBSXrs: + // if shift == '11' then ReservedValue() + if (shiftHi == 0x3) + return Fail; + // Deliberate fallthrough + case AArch64::ANDXrs: + case AArch64::ANDSXrs: + case AArch64::BICXrs: + case AArch64::BICSXrs: + case AArch64::ORRXrs: + case AArch64::ORNXrs: + case AArch64::EORXrs: + case AArch64::EONXrs: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + + Inst.addOperand(MCOperand::createImm(shift)); + return Success; +} + +static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned imm = fieldFromInstruction(insn, 5, 16); + unsigned shift = fieldFromInstruction(insn, 21, 2); + shift <<= 4; + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::MOVZWi: + case AArch64::MOVNWi: + case AArch64::MOVKWi: + if (shift & (1U << 5)) + return Fail; + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + break; + case AArch64::MOVZXi: + case AArch64::MOVNXi: + case AArch64::MOVKXi: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + break; + } + + if (Inst.getOpcode() == AArch64::MOVKWi || + Inst.getOpcode() == AArch64::MOVKXi) + Inst.addOperand(Inst.getOperand(0)); + + Inst.addOperand(MCOperand::createImm(imm)); + Inst.addOperand(MCOperand::createImm(shift)); + return Success; +} + +static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned offset = fieldFromInstruction(insn, 10, 12); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::PRFMui: + // Rt is an immediate in prefetch. 
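+    // The Rt field names the prefetch operation instead of a register: in
+    // "prfm pldl1keep, [x0]" the <prfop> pldl1keep corresponds to Rt == 0,
+    // roughly encoding type, target cache level and retention policy.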
+ Inst.addOperand(MCOperand::createImm(Rt)); + break; + case AArch64::STRBBui: + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + case AArch64::STRHHui: + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + case AArch64::STRWui: + case AArch64::LDRWui: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRSBXui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::STRXui: + case AArch64::LDRXui: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRQui: + case AArch64::STRQui: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRDui: + case AArch64::STRDui: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRSui: + case AArch64::STRSui: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRHui: + case AArch64::STRHui: + DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRBui: + case AArch64::STRBui: + DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::createImm(offset)); + return Success; +} + +static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + int64_t offset = fieldFromInstruction(insn, 12, 9); + + // offset is a 9-bit signed immediate, so sign extend it to + // fill the unsigned. + if (offset & (1 << (9 - 1))) + offset |= ~((1LL << 9) - 1); + + // First operand is always the writeback to the address register, if needed. + switch (Inst.getOpcode()) { + default: + break; + case AArch64::LDRSBWpre: + case AArch64::LDRSHWpre: + case AArch64::STRBBpre: + case AArch64::LDRBBpre: + case AArch64::STRHHpre: + case AArch64::LDRHHpre: + case AArch64::STRWpre: + case AArch64::LDRWpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSHWpost: + case AArch64::STRBBpost: + case AArch64::LDRBBpost: + case AArch64::STRHHpost: + case AArch64::LDRHHpost: + case AArch64::STRWpost: + case AArch64::LDRWpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHXpre: + case AArch64::STRXpre: + case AArch64::LDRSWpre: + case AArch64::LDRXpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSHXpost: + case AArch64::STRXpost: + case AArch64::LDRSWpost: + case AArch64::LDRXpost: + case AArch64::LDRQpre: + case AArch64::STRQpre: + case AArch64::LDRQpost: + case AArch64::STRQpost: + case AArch64::LDRDpre: + case AArch64::STRDpre: + case AArch64::LDRDpost: + case AArch64::STRDpost: + case AArch64::LDRSpre: + case AArch64::STRSpre: + case AArch64::LDRSpost: + case AArch64::STRSpost: + case AArch64::LDRHpre: + case AArch64::STRHpre: + case AArch64::LDRHpost: + case AArch64::STRHpost: + case AArch64::LDRBpre: + case AArch64::STRBpre: + case AArch64::LDRBpost: + case AArch64::STRBpost: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + break; + } + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::PRFUMi: + // Rt is an immediate in prefetch. 
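+    // As with PRFMui above, the Rt field is the <prfop> operand, not a register.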
+ Inst.addOperand(MCOperand::createImm(Rt)); + break; + case AArch64::STURBBi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::STURHHi: + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + case AArch64::STURWi: + case AArch64::LDURWi: + case AArch64::LDTRSBWi: + case AArch64::LDTRSHWi: + case AArch64::STTRWi: + case AArch64::LDTRWi: + case AArch64::STTRHi: + case AArch64::LDTRHi: + case AArch64::LDTRBi: + case AArch64::STTRBi: + case AArch64::LDRSBWpre: + case AArch64::LDRSHWpre: + case AArch64::STRBBpre: + case AArch64::LDRBBpre: + case AArch64::STRHHpre: + case AArch64::LDRHHpre: + case AArch64::STRWpre: + case AArch64::LDRWpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSHWpost: + case AArch64::STRBBpost: + case AArch64::LDRBBpost: + case AArch64::STRHHpost: + case AArch64::LDRHHpost: + case AArch64::STRWpost: + case AArch64::LDRWpost: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURSBXi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::LDURXi: + case AArch64::LDTRSBXi: + case AArch64::LDTRSHXi: + case AArch64::LDTRSWi: + case AArch64::STTRXi: + case AArch64::LDTRXi: + case AArch64::LDRSBXpre: + case AArch64::LDRSHXpre: + case AArch64::STRXpre: + case AArch64::LDRSWpre: + case AArch64::LDRXpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSHXpost: + case AArch64::STRXpost: + case AArch64::LDRSWpost: + case AArch64::LDRXpost: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQpre: + case AArch64::STRQpre: + case AArch64::LDRQpost: + case AArch64::STRQpost: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURDi: + case AArch64::STURDi: + case AArch64::LDRDpre: + case AArch64::STRDpre: + case AArch64::LDRDpost: + case AArch64::STRDpost: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURSi: + case AArch64::STURSi: + case AArch64::LDRSpre: + case AArch64::STRSpre: + case AArch64::LDRSpost: + case AArch64::STRSpost: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURHi: + case AArch64::STURHi: + case AArch64::LDRHpre: + case AArch64::STRHpre: + case AArch64::LDRHpost: + case AArch64::STRHpost: + DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURBi: + case AArch64::STURBi: + case AArch64::LDRBpre: + case AArch64::STRBpre: + case AArch64::LDRBpost: + case AArch64::STRBpost: + DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::createImm(offset)); + + bool IsLoad = fieldFromInstruction(insn, 22, 1); + bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0; + bool IsFP = fieldFromInstruction(insn, 26, 1); + + // Cannot write back to a transfer register (but xzr != sp). 
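+  // For example, "ldr x0, [x0], #8" writes back into its own transfer
+  // register, which the architecture leaves unpredictable, so it is reported
+  // as SoftFail below rather than rejected. "ldr x0, [sp], #8" is fine:
+  // Rn == 31 selects SP for the base, whereas Rt == 31 would select XZR.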
+ if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + unsigned Rs = fieldFromInstruction(insn, 16, 5); + + unsigned Opcode = Inst.getOpcode(); + switch (Opcode) { + default: + return Fail; + case AArch64::STLXRW: + case AArch64::STLXRB: + case AArch64::STLXRH: + case AArch64::STXRW: + case AArch64::STXRB: + case AArch64::STXRH: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDARW: + case AArch64::LDARB: + case AArch64::LDARH: + case AArch64::LDAXRW: + case AArch64::LDAXRB: + case AArch64::LDAXRH: + case AArch64::LDXRW: + case AArch64::LDXRB: + case AArch64::LDXRH: + case AArch64::STLRW: + case AArch64::STLRB: + case AArch64::STLRH: + case AArch64::STLLRW: + case AArch64::STLLRB: + case AArch64::STLLRH: + case AArch64::LDLARW: + case AArch64::LDLARB: + case AArch64::LDLARH: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::STLXRX: + case AArch64::STXRX: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDARX: + case AArch64::LDAXRX: + case AArch64::LDXRX: + case AArch64::STLRX: + case AArch64::LDLARX: + case AArch64::STLLRX: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::STLXPW: + case AArch64::STXPW: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDAXPW: + case AArch64::LDXPW: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::STLXPX: + case AArch64::STXPX: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDAXPX: + case AArch64::LDXPX: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + + // You shouldn't load to the same register twice in an instruction... + if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW || + Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) && + Rt == Rt2) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + int64_t offset = fieldFromInstruction(insn, 15, 7); + bool IsLoad = fieldFromInstruction(insn, 22, 1); + + // offset is a 7-bit signed immediate, so sign extend it to + // fill the unsigned. + if (offset & (1 << (7 - 1))) + offset |= ~((1LL << 7) - 1); + + unsigned Opcode = Inst.getOpcode(); + bool NeedsDisjointWritebackTransfer = false; + + // First operand is always writeback of base register. 
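+  // For pre/post-indexed forms such as "ldp x0, x1, [sp], #16", the updated
+  // base register becomes the first MCInst operand here; the transfer
+  // registers, the base and the raw imm7 offset (here 2, since the operand is
+  // scaled by the access size elsewhere) are appended further down.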
+ switch (Opcode) { + default: + break; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQpre: + case AArch64::STPQpre: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDpre: + case AArch64::STPDpre: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + break; + } + + switch (Opcode) { + default: + return Fail; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPXi: + case AArch64::STNPXi: + case AArch64::LDPXi: + case AArch64::STPXi: + case AArch64::LDPSWi: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPWi: + case AArch64::STNPWi: + case AArch64::LDPWi: + case AArch64::STPWi: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPQi: + case AArch64::STNPQi: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQi: + case AArch64::STPQi: + case AArch64::LDPQpre: + case AArch64::STPQpre: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPDi: + case AArch64::STNPDi: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDi: + case AArch64::STPDi: + case AArch64::LDPDpre: + case AArch64::STPDpre: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPSi: + case AArch64::STNPSi: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSi: + case AArch64::STPSi: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::createImm(offset)); + + // You shouldn't load to the same register twice in an instruction... + if (IsLoad && Rt == Rt2) + return SoftFail; + + // ... or do any operation that writes-back to a transfer register. But note + // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. 
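+  // Conversely, something like "ldp x0, x1, [x0], #16" does write back into
+  // one of its transfer registers; that is unpredictable rather than
+  // undefined, so it is only flagged as SoftFail below.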
+ if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn)) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned extend = fieldFromInstruction(insn, 10, 6); + + unsigned shift = extend & 0x7; + if (shift > 4) + return Fail; + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrx: + case AArch64::SUBWrx: + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDSWrx: + case AArch64::SUBSWrx: + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDXrx: + case AArch64::SUBXrx: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDSXrx: + case AArch64::SUBSXrx: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDXrx64: + case AArch64::SUBXrx64: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::SUBSXrx64: + case AArch64::ADDSXrx64: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + + Inst.addOperand(MCOperand::createImm(extend)); + return Success; +} + +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + unsigned imm; + + if (Datasize) { + if (Inst.getOpcode() == AArch64::ANDSXri) + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 13); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64)) + return Fail; + } else { + if (Inst.getOpcode() == AArch64::ANDSWri) + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 12); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32)) + return Fail; + } + Inst.addOperand(MCOperand::createImm(imm)); + return Success; +} + +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); + + if (Inst.getOpcode() == AArch64::MOVID) + DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + 
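+  // The 8-bit "abcdefgh" immediate is split across the encoding: "abc" lives
+  // in bits [18:16] and "defgh" in bits [9:5]; the two fields were merged
+  // above before being emitted as a single immediate operand here.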
Inst.addOperand(MCOperand::createImm(imm)); + + switch (Inst.getOpcode()) { + default: + break; + case AArch64::MOVIv4i16: + case AArch64::MOVIv8i16: + case AArch64::MVNIv4i16: + case AArch64::MVNIv8i16: + case AArch64::MOVIv2i32: + case AArch64::MOVIv4i32: + case AArch64::MVNIv2i32: + case AArch64::MVNIv4i32: + Inst.addOperand(MCOperand::createImm((cmode & 6) << 2)); + break; + case AArch64::MOVIv2s_msl: + case AArch64::MOVIv4s_msl: + case AArch64::MVNIv2s_msl: + case AArch64::MVNIv4s_msl: + Inst.addOperand(MCOperand::createImm(cmode & 1 ? 0x110 : 0x108)); + break; + } + + return Success; +} + +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); + + // Tied operands added twice. + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + Inst.addOperand(MCOperand::createImm(imm)); + Inst.addOperand(MCOperand::createImm((cmode & 6) << 2)); + + return Success; +} + +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; + imm |= fieldFromInstruction(insn, 29, 2); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 21-bit immediate. + if (imm & (1 << (21 - 1))) + imm |= ~((1LL << 21) - 1); + + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::createImm(imm)); + + return Success; +} + +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Imm = fieldFromInstruction(insn, 10, 14); + unsigned S = fieldFromInstruction(insn, 29, 1); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + + unsigned ShifterVal = (Imm >> 12) & 3; + unsigned ImmVal = Imm & 0xFFF; + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + if (ShifterVal != 0 && ShifterVal != 1) + return Fail; + + if (Datasize) { + if (Rd == 31 && !S) + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + } else { + if (Rd == 31 && !S) + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + } + + if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::createImm(ImmVal)); + Inst.addOperand(MCOperand::createImm(12 * ShifterVal)); + return Success; +} + +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + int64_t imm = fieldFromInstruction(insn, 0, 26); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 26-bit immediate. 
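+  // For example, a raw field value of 0x2000000 (bit 25 set) becomes
+  // 0xFFFFFFFFFE000000, i.e. -0x2000000, after the extension below; the
+  // branch target is then Addr + imm * 4 bytes.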
+ if (imm & (1 << (26 - 1))) + imm |= ~((1LL << 26) - 1); + + if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::createImm(imm)); + + return Success; +} + +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + uint64_t op1 = fieldFromInstruction(insn, 16, 3); + uint64_t op2 = fieldFromInstruction(insn, 5, 3); + uint64_t crm = fieldFromInstruction(insn, 8, 4); + + uint64_t pstate_field = (op1 << 3) | op2; + + if ((pstate_field == AArch64PState::PAN || + pstate_field == AArch64PState::UAO) && crm > 1) + return Fail; + + Inst.addOperand(MCOperand::createImm(pstate_field)); + Inst.addOperand(MCOperand::createImm(crm)); + + bool ValidNamed; + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + (void)AArch64PState::PStateMapper().toString(pstate_field, + Dis->getSubtargetInfo().getFeatureBits(), ValidNamed); + + return ValidNamed ? Success : Fail; +} + +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + uint64_t Rt = fieldFromInstruction(insn, 0, 5); + uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; + bit |= fieldFromInstruction(insn, 19, 5); + int64_t dst = fieldFromInstruction(insn, 5, 14); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend 14-bit immediate. + if (dst & (1 << (14 - 1))) + dst |= ~((1LL << 14) - 1); + + if (fieldFromInstruction(insn, 31, 1) == 0) + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + Inst.addOperand(MCOperand::createImm(bit)); + if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::createImm(dst)); + + return Success; +} + +static DecodeStatus DecodeGPRSeqPairsClassRegisterClass(MCInst &Inst, + unsigned RegClassID, + unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + // Register number must be even (see CASP instruction) + if (RegNo & 0x1) + return Fail; + + unsigned Register = AArch64MCRegisterClasses[RegClassID].getRegister(RegNo); + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + +static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + return DecodeGPRSeqPairsClassRegisterClass(Inst, + AArch64::WSeqPairsClassRegClassID, + RegNo, Addr, Decoder); +} + +static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + return DecodeGPRSeqPairsClassRegisterClass(Inst, + AArch64::XSeqPairsClassRegClassID, + RegNo, Addr, Decoder); +} diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h new file mode 100644 index 0000000..7fb57ad --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -0,0 +1,39 @@ +//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H +#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H + +#include "llvm/MC/MCDisassembler.h" + +namespace llvm { + +class MCInst; +class MemoryObject; +class raw_ostream; + +class AArch64Disassembler : public MCDisassembler { +public: + AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : MCDisassembler(STI, Ctx) {} + + ~AArch64Disassembler() {} + + MCDisassembler::DecodeStatus + getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, + uint64_t Address, raw_ostream &VStream, + raw_ostream &CStream) const override; +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp new file mode 100644 index 0000000..82bc949 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -0,0 +1,220 @@ +//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AArch64ExternalSymbolizer.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-disassembler" + +static MCSymbolRefExpr::VariantKind +getVariant(uint64_t LLVMDisassembler_VariantKind) { + switch (LLVMDisassembler_VariantKind) { + case LLVMDisassembler_VariantKind_None: + return MCSymbolRefExpr::VK_None; + case LLVMDisassembler_VariantKind_ARM64_PAGE: + return MCSymbolRefExpr::VK_PAGE; + case LLVMDisassembler_VariantKind_ARM64_PAGEOFF: + return MCSymbolRefExpr::VK_PAGEOFF; + case LLVMDisassembler_VariantKind_ARM64_GOTPAGE: + return MCSymbolRefExpr::VK_GOTPAGE; + case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF: + return MCSymbolRefExpr::VK_GOTPAGEOFF; + case LLVMDisassembler_VariantKind_ARM64_TLVP: + case LLVMDisassembler_VariantKind_ARM64_TLVOFF: + default: + llvm_unreachable("bad LLVMDisassembler_VariantKind"); + } +} + +/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic +/// operand in place of the immediate Value in the MCInst. The immediate +/// Value has not had any PC adjustment made by the caller. If the instruction +/// is a branch that adds the PC to the immediate Value then isBranch is +/// Success, else Fail. If GetOpInfo is non-null, then it is called to get any +/// symbolic information at the Address for this instrution. If that returns +/// non-zero then the symbolic information it returns is used to create an +/// MCExpr and that is added as an operand to the MCInst. If GetOpInfo() +/// returns zero and isBranch is Success then a symbol look up for +/// Address + Value is done and if a symbol is found an MCExpr is created with +/// that, else an MCExpr with Address + Value is created. 
If GetOpInfo() +/// returns zero and isBranch is Fail then the Opcode of the MCInst is +/// tested and for ADRP an other instructions that help to load of pointers +/// a symbol look up is done to see it is returns a specific reference type +/// to add to the comment stream. This function returns Success if it adds +/// an operand to the MCInst and Fail otherwise. +bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( + MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, + bool IsBranch, uint64_t Offset, uint64_t InstSize) { + // FIXME: This method shares a lot of code with + // MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible + // refactor the MCExternalSymbolizer interface to allow more of this + // implementation to be shared. + // + struct LLVMOpInfo1 SymbolicOp; + memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); + SymbolicOp.Value = Value; + uint64_t ReferenceType; + const char *ReferenceName; + if (!GetOpInfo || + !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { + if (IsBranch) { + ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; + const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, + Address, &ReferenceName); + if (Name) { + SymbolicOp.AddSymbol.Name = Name; + SymbolicOp.AddSymbol.Present = true; + SymbolicOp.Value = 0; + } else { + SymbolicOp.Value = Address + Value; + } + if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) + CommentStream << "symbol stub for: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message) + CommentStream << "Objc message: " << ReferenceName; + } else if (MI.getOpcode() == AArch64::ADRP) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP; + // otool expects the fully encoded ADRP instruction to be passed in as + // the value here, so reconstruct it: + const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); + uint32_t EncodedInst = 0x90000000; + EncodedInst |= (Value & 0x3) << 29; // immlo + EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi + EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg + SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, + &ReferenceName); + CommentStream << format("0x%llx", + 0xfffffffffffff000LL & (Address + Value)); + } else if (MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRXl || + MI.getOpcode() == AArch64::ADR) { + if (MI.getOpcode() == AArch64::ADDXri) + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri; + else if (MI.getOpcode() == AArch64::LDRXui) + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui; + if (MI.getOpcode() == AArch64::LDRXl) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl; + SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, + &ReferenceName); + } else if (MI.getOpcode() == AArch64::ADR) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR; + SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, + &ReferenceName); + } else { + const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); + // otool expects the fully encoded ADD/LDR instruction to be passed in + // as the value here, so reconstruct it: + unsigned EncodedInst = + MI.getOpcode() == AArch64::ADDXri ? 
0x91000000: 0xF9400000; + EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD] + EncodedInst |= + MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn + EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd + + SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, + &ReferenceName); + } + if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) + CommentStream << "literal pool symbol address: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) + CommentStream << "literal pool for: \"" << ReferenceName << "\""; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) + CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message) + CommentStream << "Objc message: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref) + CommentStream << "Objc message ref: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref) + CommentStream << "Objc selector ref: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref) + CommentStream << "Objc class ref: " << ReferenceName; + // For these instructions, the SymbolLookUp() above is just to get the + // ReferenceType and ReferenceName. We want to make sure not to + // fall through so we don't build an MCExpr to leave the disassembly + // of the immediate values of these instructions to the InstPrinter. + return false; + } else { + return false; + } + } + + const MCExpr *Add = nullptr; + if (SymbolicOp.AddSymbol.Present) { + if (SymbolicOp.AddSymbol.Name) { + StringRef Name(SymbolicOp.AddSymbol.Name); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Name); + MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind); + if (Variant != MCSymbolRefExpr::VK_None) + Add = MCSymbolRefExpr::create(Sym, Variant, Ctx); + else + Add = MCSymbolRefExpr::create(Sym, Ctx); + } else { + Add = MCConstantExpr::create(SymbolicOp.AddSymbol.Value, Ctx); + } + } + + const MCExpr *Sub = nullptr; + if (SymbolicOp.SubtractSymbol.Present) { + if (SymbolicOp.SubtractSymbol.Name) { + StringRef Name(SymbolicOp.SubtractSymbol.Name); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Name); + Sub = MCSymbolRefExpr::create(Sym, Ctx); + } else { + Sub = MCConstantExpr::create(SymbolicOp.SubtractSymbol.Value, Ctx); + } + } + + const MCExpr *Off = nullptr; + if (SymbolicOp.Value != 0) + Off = MCConstantExpr::create(SymbolicOp.Value, Ctx); + + const MCExpr *Expr; + if (Sub) { + const MCExpr *LHS; + if (Add) + LHS = MCBinaryExpr::createSub(Add, Sub, Ctx); + else + LHS = MCUnaryExpr::createMinus(Sub, Ctx); + if (Off) + Expr = MCBinaryExpr::createAdd(LHS, Off, Ctx); + else + Expr = LHS; + } else if (Add) { + if (Off) + Expr = MCBinaryExpr::createAdd(Add, Off, Ctx); + else + Expr = Add; + } else { + if (Off) + Expr = Off; + else + Expr = MCConstantExpr::create(0, Ctx); + } + + MI.addOperand(MCOperand::createExpr(Expr)); + + return true; +} diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h new file mode 100644 index 0000000..12b8450 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -0,0 +1,38 @@ +//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===// +// +// The LLVM Compiler 
Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Symbolize AArch64 assembly code during disassembly using callbacks. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H +#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H + +#include "llvm/MC/MCExternalSymbolizer.h" + +namespace llvm { + +class AArch64ExternalSymbolizer : public MCExternalSymbolizer { +public: + AArch64ExternalSymbolizer(MCContext &Ctx, + std::unique_ptr<MCRelocationInfo> RelInfo, + LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo) + : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, + DisInfo) {} + + bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, + int64_t Value, uint64_t Address, bool IsBranch, + uint64_t Offset, uint64_t InstSize) override; +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp new file mode 100644 index 0000000..d8a8108 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -0,0 +1,1408 @@ +//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter.inc" +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter1.inc" + +AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + +AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : AArch64InstPrinter(MAI, MII, MRI) {} + +void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + // This is for .cfi directives. + OS << getRegisterName(RegNo); +} + +void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, + const MCSubtargetInfo &STI) { + // Check for special encodings and print the canonical alias instead. + + unsigned Opcode = MI->getOpcode(); + + if (Opcode == AArch64::SYSxt) + if (printSysAlias(MI, STI, O)) { + printAnnotation(O, Annot); + return; + } + + // SBFM/UBFM should print to a nicer aliased form if possible. 
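+  // For example, "SBFMWri w0, w1, #0, #7" prints as "sxtb w0, w1", and
+  // "UBFMWri w0, w1, #29, #28" prints as "lsl w0, w1, #3"; the checks below
+  // select the canonical alias from the immr/imms operands.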
+ if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri || + Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) { + const MCOperand &Op0 = MI->getOperand(0); + const MCOperand &Op1 = MI->getOperand(1); + const MCOperand &Op2 = MI->getOperand(2); + const MCOperand &Op3 = MI->getOperand(3); + + bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri); + bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri); + if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + + switch (Op3.getImm()) { + default: + break; + case 7: + if (IsSigned) + AsmMnemonic = "sxtb"; + else if (!Is64Bit) + AsmMnemonic = "uxtb"; + break; + case 15: + if (IsSigned) + AsmMnemonic = "sxth"; + else if (!Is64Bit) + AsmMnemonic = "uxth"; + break; + case 31: + // *xtw is only valid for signed 64-bit operations. + if (Is64Bit && IsSigned) + AsmMnemonic = "sxtw"; + break; + } + + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); + printAnnotation(O, Annot); + return; + } + } + + // All immediate shifts are aliases, implemented using the Bitfield + // instruction. In all cases the immediate shift amount shift must be in + // the range 0 to (reg.size -1). + if (Op2.isImm() && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + int shift = 0; + int64_t immr = Op2.getImm(); + int64_t imms = Op3.getImm(); + if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { + AsmMnemonic = "lsl"; + shift = 31 - imms; + } else if (Opcode == AArch64::UBFMXri && imms != 0x3f && + ((imms + 1 == immr))) { + AsmMnemonic = "lsl"; + shift = 63 - imms; + } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) { + AsmMnemonic = "asr"; + shift = immr; + } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) { + AsmMnemonic = "asr"; + shift = immr; + } + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; + printAnnotation(O, Annot); + return; + } + } + + // SBFIZ/UBFIZ aliases + if (Op2.getImm() > Op3.getImm()) { + O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; + printAnnotation(O, Annot); + return; + } + + // Otherwise SBFX/UBFX is the preferred form + O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; + printAnnotation(O, Annot); + return; + } + + if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) { + const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 + const MCOperand &Op2 = MI->getOperand(2); + int ImmR = MI->getOperand(3).getImm(); + int ImmS = MI->getOperand(4).getImm(); + + if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) && + (ImmR == 0 || ImmS < ImmR)) { + // BFC takes precedence over its entire range, sligtly differently to BFI. + int BitWidth = Opcode == AArch64::BFMXri ? 
64 : 32; + int LSB = (BitWidth - ImmR) % BitWidth; + int Width = ImmS + 1; + + O << "\tbfc\t" << getRegisterName(Op0.getReg()) + << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } else if (ImmS < ImmR) { + // BFI alias + int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32; + int LSB = (BitWidth - ImmR) % BitWidth; + int Width = ImmS + 1; + + O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " + << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } + + int LSB = ImmR; + int Width = ImmS - ImmR + 1; + // Otherwise BFXIL the preferred form + O << "\tbfxil\t" + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) + << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } + + // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift + // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be + // printed. + if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi || + Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isExpr()) { + if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) + O << "\tmovz\t"; + else + O << "\tmovn\t"; + + O << getRegisterName(MI->getOperand(0).getReg()) << ", #"; + MI->getOperand(1).getExpr()->print(O, &MAI); + return; + } + + if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) && + MI->getOperand(2).isExpr()) { + O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"; + MI->getOperand(2).getExpr()->print(O, &MAI); + return; + } + + if (!printAliasInstr(MI, STI, O)) + printInstruction(MI, STI, O); + + printAnnotation(O, Annot); +} + +static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, + bool &IsTbx) { + switch (Opcode) { + case AArch64::TBXv8i8One: + case AArch64::TBXv8i8Two: + case AArch64::TBXv8i8Three: + case AArch64::TBXv8i8Four: + IsTbx = true; + Layout = ".8b"; + return true; + case AArch64::TBLv8i8One: + case AArch64::TBLv8i8Two: + case AArch64::TBLv8i8Three: + case AArch64::TBLv8i8Four: + IsTbx = false; + Layout = ".8b"; + return true; + case AArch64::TBXv16i8One: + case AArch64::TBXv16i8Two: + case AArch64::TBXv16i8Three: + case AArch64::TBXv16i8Four: + IsTbx = true; + Layout = ".16b"; + return true; + case AArch64::TBLv16i8One: + case AArch64::TBLv16i8Two: + case AArch64::TBLv16i8Three: + case AArch64::TBLv16i8Four: + IsTbx = false; + Layout = ".16b"; + return true; + default: + return false; + } +} + +struct LdStNInstrDesc { + unsigned Opcode; + const char *Mnemonic; + const char *Layout; + int ListOperand; + bool HasLane; + int NaturalOffset; +}; + +static const LdStNInstrDesc LdStNInstInfo[] = { + { AArch64::LD1i8, "ld1", ".b", 1, true, 0 }, + { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, + { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, + { AArch64::LD1i64, "ld1", ".d", 1, true, 0 }, + { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, + { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, + { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, + { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, + { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, + { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, + { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, + { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, + { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, + { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, + { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, + { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, + { 
AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, + { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, + { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, + { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, + { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, + { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, + { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, + { AArch64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, + { AArch64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, + { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, + { AArch64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 }, + { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, + { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 }, + { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, + { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, + { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, + { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, + { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, + { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, + { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, + { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, + { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, + { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, + { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, + { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, + { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, + { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, + { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, + { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, + { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, + { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, + { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, + { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Fourv2s, "ld1", ".2s", 0, 
false, 0 }, + { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, + { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, + { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, + { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, + { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, + { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, + { AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, + { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, + { AArch64::LD2i8, "ld2", ".b", 1, true, 0 }, + { AArch64::LD2i16, "ld2", ".h", 1, true, 0 }, + { AArch64::LD2i32, "ld2", ".s", 1, true, 0 }, + { AArch64::LD2i64, "ld2", ".d", 1, true, 0 }, + { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, + { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, + { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, + { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, + { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, + { AArch64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 }, + { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, + { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, + { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, + { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, + { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, + { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, + { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, + { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, + { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, + { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, + { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, + { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, + { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, + { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, + { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, + { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, + { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, + { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, + { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, + { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, + { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, + { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, + { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, + { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, + { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, + { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, + { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, + { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, + { AArch64::LD3i8, "ld3", ".b", 1, true, 0 }, + { AArch64::LD3i16, "ld3", ".h", 1, true, 0 }, + { AArch64::LD3i32, "ld3", ".s", 1, true, 0 }, + { AArch64::LD3i64, "ld3", ".d", 1, true, 0 }, + { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, + { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, + { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, + { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, + { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, + { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, + { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, + { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, + { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, + { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, + { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, + { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, + { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, + { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, 
+ { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, + { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, + { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, + { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, + { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, + { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, + { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, + { AArch64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, + { AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, + { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, + { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, + { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, + { AArch64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, + { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, + { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, + { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, + { AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, + { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 }, + { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, + { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, + { AArch64::LD4i8, "ld4", ".b", 1, true, 0 }, + { AArch64::LD4i16, "ld4", ".h", 1, true, 0 }, + { AArch64::LD4i32, "ld4", ".s", 1, true, 0 }, + { AArch64::LD4i64, "ld4", ".d", 1, true, 0 }, + { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 }, + { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, + { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, + { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, + { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, + { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, + { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, + { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, + { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, + { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, + { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, + { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, + { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, + { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, + { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, + { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, + { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, + { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, + { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, + { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, + { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, + { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, + { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, + { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, + { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, + { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, + { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, + { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, + { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, + { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, + { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, + { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, + { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, + { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, + { AArch64::ST1i8, "st1", ".b", 0, true, 0 }, + { AArch64::ST1i16, "st1", ".h", 0, true, 0 }, + { AArch64::ST1i32, "st1", ".s", 0, true, 0 }, + { AArch64::ST1i64, "st1", ".d", 0, true, 0 }, + { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 }, + { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 }, + { 
AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 }, + { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 }, + { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, + { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, + { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, + { AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, + { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, + { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, + { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, + { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, + { AArch64::ST1Twov16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, + { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 }, + { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, + { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, + { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, + { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, + { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, + { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, + { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, + { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, + { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, + { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, + { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, + { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, + { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, + { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, + { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, + { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, + { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, + { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, + { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, 
false, 32 }, + { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, + { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, + { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, + { AArch64::ST2i8, "st2", ".b", 0, true, 0 }, + { AArch64::ST2i16, "st2", ".h", 0, true, 0 }, + { AArch64::ST2i32, "st2", ".s", 0, true, 0 }, + { AArch64::ST2i64, "st2", ".d", 0, true, 0 }, + { AArch64::ST2i8_POST, "st2", ".b", 1, true, 2 }, + { AArch64::ST2i16_POST, "st2", ".h", 1, true, 4 }, + { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 }, + { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 }, + { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, + { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, + { AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, + { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, + { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, + { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, + { AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, + { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, + { AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, + { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, + { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, + { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, + { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, + { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, + { AArch64::ST3i8, "st3", ".b", 0, true, 0 }, + { AArch64::ST3i16, "st3", ".h", 0, true, 0 }, + { AArch64::ST3i32, "st3", ".s", 0, true, 0 }, + { AArch64::ST3i64, "st3", ".d", 0, true, 0 }, + { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 }, + { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 }, + { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 }, + { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 }, + { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, + { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, + { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, + { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, + { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, + { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, + { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, + { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, + { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, + { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, + { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, + { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, + { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, + { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, + { AArch64::ST4i8, "st4", ".b", 0, true, 0 }, + { AArch64::ST4i16, "st4", ".h", 0, true, 0 }, + { AArch64::ST4i32, "st4", ".s", 0, true, 0 }, + { AArch64::ST4i64, "st4", ".d", 0, true, 0 }, + { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 }, + { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 }, + { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 }, + { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 }, + { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, + { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, + { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, + { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, + { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, + { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, + { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, + { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, + { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, + { AArch64::ST4Fourv4s_POST, "st4", ".4s", 
1, false, 64 }, + { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, + { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, + { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, + { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, +}; + +static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { + unsigned Idx; + for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) + if (LdStNInstInfo[Idx].Opcode == Opcode) + return &LdStNInstInfo[Idx]; + + return nullptr; +} + +void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, + const MCSubtargetInfo &STI) { + unsigned Opcode = MI->getOpcode(); + StringRef Layout, Mnemonic; + + bool IsTbx; + if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { + O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' + << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", "; + + unsigned ListOpNum = IsTbx ? 2 : 1; + printVectorList(MI, ListOpNum, STI, O, ""); + + O << ", " + << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg); + printAnnotation(O, Annot); + return; + } + + if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; + + // Now onto the operands: first a vector list with possible lane + // specifier. E.g. { v0 }[2] + int OpNum = LdStDesc->ListOperand; + printVectorList(MI, OpNum++, STI, O, ""); + + if (LdStDesc->HasLane) + O << '[' << MI->getOperand(OpNum++).getImm() << ']'; + + // Next the address: [xN] + unsigned AddrReg = MI->getOperand(OpNum++).getReg(); + O << ", [" << getRegisterName(AddrReg) << ']'; + + // Finally, there might be a post-indexed offset. + if (LdStDesc->NaturalOffset != 0) { + unsigned Reg = MI->getOperand(OpNum++).getReg(); + if (Reg != AArch64::XZR) + O << ", " << getRegisterName(Reg); + else { + assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); + O << ", #" << LdStDesc->NaturalOffset; + } + } + + printAnnotation(O, Annot); + return; + } + + AArch64InstPrinter::printInst(MI, O, Annot, STI); +} + +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { +#ifndef NDEBUG + unsigned Opcode = MI->getOpcode(); + assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); +#endif + + const char *Asm = nullptr; + const MCOperand &Op1 = MI->getOperand(0); + const MCOperand &Cn = MI->getOperand(1); + const MCOperand &Cm = MI->getOperand(2); + const MCOperand &Op2 = MI->getOperand(3); + + unsigned Op1Val = Op1.getImm(); + unsigned CnVal = Cn.getImm(); + unsigned CmVal = Cm.getImm(); + unsigned Op2Val = Op2.getImm(); + + if (CnVal == 7) { + switch (CmVal) { + default: + break; + + // IC aliases + case 1: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tialluis"; + break; + case 5: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tiallu"; + else if (Op1Val == 3 && Op2Val == 1) + Asm = "ic\tivau"; + break; + + // DC aliases + case 4: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tzva"; + break; + case 6: + if (Op1Val == 0 && Op2Val == 1) + Asm = "dc\tivac"; + if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tisw"; + break; + case 10: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcsw"; + break; + case 11: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvau"; + break; + case 12: + if (Op1Val == 3 && Op2Val == 1 && + (STI.getFeatureBits()[AArch64::HasV8_2aOps])) + Asm = "dc\tcvap"; + break; + case 14: + if (Op1Val == 3 && Op2Val == 
1) + Asm = "dc\tcivac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcisw"; + break; + + // AT aliases + case 8: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1r"; break; + case 1: Asm = "at\ts1e1w"; break; + case 2: Asm = "at\ts1e0r"; break; + case 3: Asm = "at\ts1e0w"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e2r"; break; + case 1: Asm = "at\ts1e2w"; break; + case 4: Asm = "at\ts12e1r"; break; + case 5: Asm = "at\ts12e1w"; break; + case 6: Asm = "at\ts12e0r"; break; + case 7: Asm = "at\ts12e0w"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e3r"; break; + case 1: Asm = "at\ts1e3w"; break; + } + break; + } + break; + case 9: + switch (Op1Val) { + default: + break; + case 0: + if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) { + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1rp"; break; + case 1: Asm = "at\ts1e1wp"; break; + } + } + break; + } + } + } else if (CnVal == 8) { + // TLBI aliases + switch (CmVal) { + default: + break; + case 3: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1is"; break; + case 1: Asm = "tlbi\tvae1is"; break; + case 2: Asm = "tlbi\taside1is"; break; + case 3: Asm = "tlbi\tvaae1is"; break; + case 5: Asm = "tlbi\tvale1is"; break; + case 7: Asm = "tlbi\tvaale1is"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2is"; break; + case 1: Asm = "tlbi\tvae2is"; break; + case 4: Asm = "tlbi\talle1is"; break; + case 5: Asm = "tlbi\tvale2is"; break; + case 6: Asm = "tlbi\tvmalls12e1is"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3is"; break; + case 1: Asm = "tlbi\tvae3is"; break; + case 5: Asm = "tlbi\tvale3is"; break; + } + break; + } + break; + case 0: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1is"; break; + case 5: Asm = "tlbi\tipas2le1is"; break; + } + break; + } + break; + case 4: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1"; break; + case 5: Asm = "tlbi\tipas2le1"; break; + } + break; + } + break; + case 7: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1"; break; + case 1: Asm = "tlbi\tvae1"; break; + case 2: Asm = "tlbi\taside1"; break; + case 3: Asm = "tlbi\tvaae1"; break; + case 5: Asm = "tlbi\tvale1"; break; + case 7: Asm = "tlbi\tvaale1"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2"; break; + case 1: Asm = "tlbi\tvae2"; break; + case 4: Asm = "tlbi\talle1"; break; + case 5: Asm = "tlbi\tvale2"; break; + case 6: Asm = "tlbi\tvmalls12e1"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3"; break; + case 1: Asm = "tlbi\tvae3"; break; + case 5: Asm = "tlbi\tvale3"; break; + } + break; + } + break; + } + } + + if (Asm) { + unsigned Reg = MI->getOperand(4).getReg(); + + O << '\t' << Asm; + if (StringRef(Asm).lower().find("all") == StringRef::npos) + O << ", " << getRegisterName(Reg); + } + + return Asm != nullptr; +} + +void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + 
if (Op.isReg()) { + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg); + } else if (Op.isImm()) { + O << '#' << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); + } +} + +void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << format("#%#llx", Op.getImm()); +} + +void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, + unsigned Imm, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Reg == AArch64::XZR) + O << "#" << Imm; + else + O << getRegisterName(Reg); + } else + llvm_unreachable("unknown operand kind in printPostIncOperand64"); +} + +void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isReg() && "Non-register vreg operand!"); + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg, AArch64::vreg); +} + +void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); + O << "c" << Op.getImm(); +} + +void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.isImm()) { + unsigned Val = (MO.getImm() & 0xfff); + assert(Val == MO.getImm() && "Add/sub immediate out of range!"); + unsigned Shift = + AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); + O << '#' << Val; + if (Shift != 0) + printShifter(MI, OpNum + 1, STI, O); + + if (CommentStream) + *CommentStream << '=' << (Val << Shift) << '\n'; + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + MO.getExpr()->print(O, &MAI); + printShifter(MI, OpNum + 1, STI, O); + } +} + +void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32)); +} + +void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64)); +} + +void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + // LSL #0 should not be printed. 
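// A brief sketch, assuming the packed shifter-immediate layout defined by
// getShifterImm() in AArch64AddressingModes.h later in this patch
// ({8-6} = shift type, {5-0} = amount); the values are illustrative:
//   getShifterImm(AArch64_AM::LSL, 12)  ->  prints ", lsl #12"
//   getShifterImm(AArch64_AM::ASR, 3)   ->  prints ", asr #3"
//   getShifterImm(AArch64_AM::LSL, 0)   ->  prints nothing (suppressed below)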
+ if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL && + AArch64_AM::getShiftValue(Val) == 0) + return; + O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val)) + << " #" << AArch64_AM::getShiftValue(Val); +} + +void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printShifter(MI, OpNum + 1, STI, O); +} + +void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printArithExtend(MI, OpNum + 1, STI, O); +} + +void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val); + unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val); + + // If the destination or first source register operand is [W]SP, print + // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at + // all. + if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) { + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src1 = MI->getOperand(1).getReg(); + if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) && + ExtType == AArch64_AM::UXTX) || + ((Dest == AArch64::WSP || Src1 == AArch64::WSP) && + ExtType == AArch64_AM::UXTW) ) { + if (ShiftVal != 0) + O << ", lsl #" << ShiftVal; + return; + } + } + O << ", " << AArch64_AM::getShiftExtendName(ExtType); + if (ShiftVal != 0) + O << " #" << ShiftVal; +} + +void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O, char SrcRegKind, + unsigned Width) { + unsigned SignExtend = MI->getOperand(OpNum).getImm(); + unsigned DoShift = MI->getOperand(OpNum + 1).getImm(); + + // sxtw, sxtx, uxtw or lsl (== uxtx) + bool IsLSL = !SignExtend && SrcRegKind == 'x'; + if (IsLSL) + O << "lsl"; + else + O << (SignExtend ? 
's' : 'u') << "xt" << SrcRegKind; + + if (DoShift || IsLSL) + O << " #" << Log2_32(Width / 8); +} + +void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(CC); +} + +void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC)); +} + +void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; +} + +template<int Scale> +void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << '#' << Scale * MI->getOperand(OpNum).getImm(); +} + +void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO = MI->getOperand(OpNum); + if (MO.isImm()) { + O << "#" << (MO.getImm() * Scale); + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + MO.getExpr()->print(O, &MAI); + } +} + +void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO1 = MI->getOperand(OpNum + 1); + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MO1.isImm()) { + O << ", #" << (MO1.getImm() * Scale); + } else { + assert(MO1.isExpr() && "Unexpected operand type!"); + O << ", "; + MO1.getExpr()->print(O, &MAI); + } + O << ']'; +} + +void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned prfop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PRFM::PRFMMapper().toString(prfop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << prfop; +} + +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << psbhintop; +} + +void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + float FPImm = + MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm()); + + // 8 decimal places are enough to perfectly represent permitted floats. 
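// A sketch of why eight places suffice (assuming the usual ARMv8 FMOV
// immediate set, i.e. values of the form n/16 * 2^r with n in [16, 31] and a
// small power-of-two exponent): the finest granularity is 1/128, so no
// permitted value needs more than seven fractional digits. For instance:
//   fmov d0, #1.0    ->  "#1.00000000"
//   fmov d0, #0.125  ->  "#0.12500000"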
+ O << format("#%.8f", FPImm); +} + +static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { + while (Stride--) { + switch (Reg) { + default: + llvm_unreachable("Vector register expected!"); + case AArch64::Q0: Reg = AArch64::Q1; break; + case AArch64::Q1: Reg = AArch64::Q2; break; + case AArch64::Q2: Reg = AArch64::Q3; break; + case AArch64::Q3: Reg = AArch64::Q4; break; + case AArch64::Q4: Reg = AArch64::Q5; break; + case AArch64::Q5: Reg = AArch64::Q6; break; + case AArch64::Q6: Reg = AArch64::Q7; break; + case AArch64::Q7: Reg = AArch64::Q8; break; + case AArch64::Q8: Reg = AArch64::Q9; break; + case AArch64::Q9: Reg = AArch64::Q10; break; + case AArch64::Q10: Reg = AArch64::Q11; break; + case AArch64::Q11: Reg = AArch64::Q12; break; + case AArch64::Q12: Reg = AArch64::Q13; break; + case AArch64::Q13: Reg = AArch64::Q14; break; + case AArch64::Q14: Reg = AArch64::Q15; break; + case AArch64::Q15: Reg = AArch64::Q16; break; + case AArch64::Q16: Reg = AArch64::Q17; break; + case AArch64::Q17: Reg = AArch64::Q18; break; + case AArch64::Q18: Reg = AArch64::Q19; break; + case AArch64::Q19: Reg = AArch64::Q20; break; + case AArch64::Q20: Reg = AArch64::Q21; break; + case AArch64::Q21: Reg = AArch64::Q22; break; + case AArch64::Q22: Reg = AArch64::Q23; break; + case AArch64::Q23: Reg = AArch64::Q24; break; + case AArch64::Q24: Reg = AArch64::Q25; break; + case AArch64::Q25: Reg = AArch64::Q26; break; + case AArch64::Q26: Reg = AArch64::Q27; break; + case AArch64::Q27: Reg = AArch64::Q28; break; + case AArch64::Q28: Reg = AArch64::Q29; break; + case AArch64::Q29: Reg = AArch64::Q30; break; + case AArch64::Q30: Reg = AArch64::Q31; break; + // Vector lists can wrap around. + case AArch64::Q31: + Reg = AArch64::Q0; + break; + } + } + return Reg; +} + +template<unsigned size> +void AArch64InstPrinter::printGPRSeqPairsClassOperand(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + static_assert(size == 64 || size == 32, + "Template parameter must be either 32 or 64"); + unsigned Reg = MI->getOperand(OpNum).getReg(); + + unsigned Sube = (size == 32) ? AArch64::sube32 : AArch64::sube64; + unsigned Subo = (size == 32) ? AArch64::subo32 : AArch64::subo64; + + unsigned Even = MRI.getSubReg(Reg, Sube); + unsigned Odd = MRI.getSubReg(Reg, Subo); + O << getRegisterName(Even) << ", " << getRegisterName(Odd); +} + +void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O, + StringRef LayoutSuffix) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + + O << "{ "; + + // Work out how many registers there are in the list (if there is an actual + // list). + unsigned NumRegs = 1; + if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQRegClassID).contains(Reg)) + NumRegs = 2; + else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg)) + NumRegs = 3; + else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg)) + NumRegs = 4; + + // Now forget about the list and find out what the first register is. + if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0)) + Reg = FirstReg; + else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0)) + Reg = FirstReg; + + // If it's a D-reg, we need to promote it to the equivalent Q-reg before + // printing (otherwise getRegisterName fails). 
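// (getRegisterName(Reg, AArch64::vreg) is only defined for the Q registers,
// hence the promotion below.) As a sketch of the resulting output, with
// hypothetical register numbers: a QQ pair based at Q0 with LayoutSuffix
// ".16b" prints "{ v0.16b, v1.16b }"; since getNextVectorRegister() wraps Q31
// back to Q0, a pair based at Q31 prints "{ v31.16b, v0.16b }". The
// Apple-syntax ld1/st1 printing above passes an empty suffix, giving
// e.g. "ld1.16b { v0, v1 }".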
+ if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) { + const MCRegisterClass &FPR128RC = + MRI.getRegClass(AArch64::FPR128RegClassID); + Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC); + } + + for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) { + O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix; + if (i + 1 != NumRegs) + O << ", "; + } + + O << " }"; +} + +void +AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI, + unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + printVectorList(MI, OpNum, STI, O, ""); +} + +template <unsigned NumLanes, char LaneKind> +void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + std::string Suffix("."); + if (NumLanes) + Suffix += itostr(NumLanes) + LaneKind; + else + Suffix += LaneKind; + + printVectorList(MI, OpNum, STI, O, Suffix); +} + +void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << "[" << MI->getOperand(OpNum).getImm() << "]"; +} + +void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + + // If the label has already been resolved to an immediate offset (say, when + // we're running the disassembler), just print the immediate. + if (Op.isImm()) { + O << "#" << (Op.getImm() * 4); + return; + } + + // If the branch target is simply an address then print it in hex. + const MCConstantExpr *BranchTarget = + dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); + } else { + // Otherwise, just print the expression. + MI->getOperand(OpNum).getExpr()->print(O, &MAI); + } +} + +void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNum); + + // If the label has already been resolved to an immediate offset (say, when + // we're running the disassembler), just print the immediate. + if (Op.isImm()) { + O << "#" << (Op.getImm() * (1 << 12)); + return; + } + + // Otherwise, just print the expression. 
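// For example (an illustrative value, not taken from a real encoding): a
// resolved ADRP immediate of 3 prints as "#12288", i.e. three 4 KiB pages,
// just as printAlignedLabel above scales a resolved branch immediate by 4.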
+ MI->getOperand(OpNum).getExpr()->print(O, &MAI); +} + +void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + unsigned Opcode = MI->getOpcode(); + + bool Valid; + StringRef Name; + if (Opcode == AArch64::ISB) + Name = AArch64ISB::ISBMapper().toString(Val, STI.getFeatureBits(), + Valid); + else + Name = AArch64DB::DBarrierMapper().toString(Val, STI.getFeatureBits(), + Valid); + if (Valid) + O << Name; + else + O << "#" << Val; +} + +void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + auto Mapper = AArch64SysReg::MRSMapper(); + std::string Name = Mapper.toString(Val, STI.getFeatureBits()); + + O << StringRef(Name).upper(); +} + +void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + auto Mapper = AArch64SysReg::MSRMapper(); + std::string Name = Mapper.toString(Val, STI.getFeatureBits()); + + O << StringRef(Name).upper(); +} + +void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + bool Valid; + StringRef Name = + AArch64PState::PStateMapper().toString(Val, STI.getFeatureBits(), Valid); + if (Valid) + O << Name.upper(); + else + O << "#" << Val; +} + +void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned RawVal = MI->getOperand(OpNo).getImm(); + uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal); + O << format("#%#016llx", Val); +} diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h new file mode 100644 index 0000000..ea68d98 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -0,0 +1,186 @@ +//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H +#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H + +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class AArch64InstPrinter : public MCInstPrinter { +public: + AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + + // Autogenerated by tblgen. 
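// These are emitted by TableGen from the AArch64GenAsmWriter*.inc files
// included at the top of AArch64InstPrinter.cpp above; the second writer
// (AArch64GenAsmWriter1.inc) presumably backs the Apple-syntax subclass that
// overrides these same hooks further down in this header.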
+ virtual void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, + const MCSubtargetInfo &STI, + raw_ostream &O); + virtual StringRef getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); + } + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); + +protected: + bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O); + // Operand printers + void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printHexImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, + raw_ostream &O); + template <int Amount> + void printPostIncOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printPostIncOperand(MI, OpNo, Amount, O); + } + + void printVRegOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSysCROperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAddSubImm(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printLogicalImm32(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printLogicalImm64(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printShifter(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printShiftedRegister(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printExtendedRegister(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printArithExtend(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O, + char SrcRegKind, unsigned Width); + template <char SrcRegKind, unsigned Width> + void printMemExtend(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + printMemExtend(MI, OpNum, O, SrcRegKind, Width); + } + + void printCondCode(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printInverseCondCode(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAlignedLabel(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, + raw_ostream &O); + void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale, + raw_ostream &O); + + template <int Scale> + void printUImm12Offset(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + printUImm12Offset(MI, OpNum, Scale, O); + } + + template <int BitWidth> + void printAMIndexedWB(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + printAMIndexedWB(MI, OpNum, BitWidth / 8, O); + } + + void printAMNoIndex(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + template <int Scale> + void printImmScale(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printPrefetchOp(const MCInst *MI, unsigned OpNum, + 
const MCSubtargetInfo &STI, raw_ostream &O); + + void printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printFPImmOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printVectorList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O, + StringRef LayoutSuffix); + + /// Print a list of vector registers where the type suffix is implicit + /// (i.e. attached to the instruction rather than the registers). + void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); + + template <unsigned NumLanes, char LaneKind> + void printTypedVectorList(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + + void printVectorIndex(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printAdrpLabel(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printBarrierOption(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSystemPStateField(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + template<unsigned size> + void printGPRSeqPairsClassOperand(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O); +}; + +class AArch64AppleInstPrinter : public AArch64InstPrinter { +public: + AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + + void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O) override; + bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, + raw_ostream &O) override; + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, + const MCSubtargetInfo &STI, + raw_ostream &O) override; + StringRef getRegName(unsigned RegNo) const override { + return getRegisterName(RegNo); + } + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h new file mode 100644 index 0000000..648b1df --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -0,0 +1,760 @@ +//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 addressing mode implementation stuff. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> + +namespace llvm { + +/// AArch64_AM - AArch64 Addressing Mode Stuff +namespace AArch64_AM { + +//===----------------------------------------------------------------------===// +// Shifts +// + +enum ShiftExtendType { + InvalidShiftExtend = -1, + LSL = 0, + LSR, + ASR, + ROR, + MSL, + + UXTB, + UXTH, + UXTW, + UXTX, + + SXTB, + SXTH, + SXTW, + SXTX, +}; + +/// getShiftName - Get the string encoding for the shift type. +static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) { + switch (ST) { + default: llvm_unreachable("unhandled shift type!"); + case AArch64_AM::LSL: return "lsl"; + case AArch64_AM::LSR: return "lsr"; + case AArch64_AM::ASR: return "asr"; + case AArch64_AM::ROR: return "ror"; + case AArch64_AM::MSL: return "msl"; + case AArch64_AM::UXTB: return "uxtb"; + case AArch64_AM::UXTH: return "uxth"; + case AArch64_AM::UXTW: return "uxtw"; + case AArch64_AM::UXTX: return "uxtx"; + case AArch64_AM::SXTB: return "sxtb"; + case AArch64_AM::SXTH: return "sxth"; + case AArch64_AM::SXTW: return "sxtw"; + case AArch64_AM::SXTX: return "sxtx"; + } + return nullptr; +} + +/// getShiftType - Extract the shift type. +static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) { + switch ((Imm >> 6) & 0x7) { + default: return AArch64_AM::InvalidShiftExtend; + case 0: return AArch64_AM::LSL; + case 1: return AArch64_AM::LSR; + case 2: return AArch64_AM::ASR; + case 3: return AArch64_AM::ROR; + case 4: return AArch64_AM::MSL; + } +} + +/// getShiftValue - Extract the shift value. +static inline unsigned getShiftValue(unsigned Imm) { + return Imm & 0x3f; +} + +/// getShifterImm - Encode the shift type and amount: +/// imm: 6-bit shift amount +/// shifter: 000 ==> lsl +/// 001 ==> lsr +/// 010 ==> asr +/// 011 ==> ror +/// 100 ==> msl +/// {8-6} = shifter +/// {5-0} = imm +static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, + unsigned Imm) { + assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!"); + unsigned STEnc = 0; + switch (ST) { + default: llvm_unreachable("Invalid shift requested"); + case AArch64_AM::LSL: STEnc = 0; break; + case AArch64_AM::LSR: STEnc = 1; break; + case AArch64_AM::ASR: STEnc = 2; break; + case AArch64_AM::ROR: STEnc = 3; break; + case AArch64_AM::MSL: STEnc = 4; break; + } + return (STEnc << 6) | (Imm & 0x3f); +} + +//===----------------------------------------------------------------------===// +// Extends +// + +/// getArithShiftValue - get the arithmetic shift value. +static inline unsigned getArithShiftValue(unsigned Imm) { + return Imm & 0x7; +} + +/// getExtendType - Extract the extend type for operands of arithmetic ops. 
+static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) { + assert((Imm & 0x7) == Imm && "invalid immediate!"); + switch (Imm) { + default: llvm_unreachable("Compiler bug!"); + case 0: return AArch64_AM::UXTB; + case 1: return AArch64_AM::UXTH; + case 2: return AArch64_AM::UXTW; + case 3: return AArch64_AM::UXTX; + case 4: return AArch64_AM::SXTB; + case 5: return AArch64_AM::SXTH; + case 6: return AArch64_AM::SXTW; + case 7: return AArch64_AM::SXTX; + } +} + +static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { + return getExtendType((Imm >> 3) & 0x7); +} + +/// Mapping from extend bits to required operation: +/// shifter: 000 ==> uxtb +/// 001 ==> uxth +/// 010 ==> uxtw +/// 011 ==> uxtx +/// 100 ==> sxtb +/// 101 ==> sxth +/// 110 ==> sxtw +/// 111 ==> sxtx +inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) { + switch (ET) { + default: llvm_unreachable("Invalid extend type requested"); + case AArch64_AM::UXTB: return 0; break; + case AArch64_AM::UXTH: return 1; break; + case AArch64_AM::UXTW: return 2; break; + case AArch64_AM::UXTX: return 3; break; + case AArch64_AM::SXTB: return 4; break; + case AArch64_AM::SXTH: return 5; break; + case AArch64_AM::SXTW: return 6; break; + case AArch64_AM::SXTX: return 7; break; + } +} + +/// getArithExtendImm - Encode the extend type and shift amount for an +/// arithmetic instruction: +/// imm: 3-bit extend amount +/// {5-3} = shifter +/// {2-0} = imm3 +static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, + unsigned Imm) { + assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!"); + return (getExtendEncoding(ET) << 3) | (Imm & 0x7); +} + +/// getMemDoShift - Extract the "do shift" flag value for load/store +/// instructions. +static inline bool getMemDoShift(unsigned Imm) { + return (Imm & 0x1) != 0; +} + +/// getExtendType - Extract the extend type for the offset operand of +/// loads/stores. +static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { + return getExtendType((Imm >> 1) & 0x7); +} + +/// getExtendImm - Encode the extend type and amount for a load/store inst: +/// doshift: should the offset be scaled by the access size +/// shifter: 000 ==> uxtb +/// 001 ==> uxth +/// 010 ==> uxtw +/// 011 ==> uxtx +/// 100 ==> sxtb +/// 101 ==> sxth +/// 110 ==> sxtw +/// 111 ==> sxtx +/// {3-1} = shifter +/// {0} = doshift +static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET, + bool DoShift) { + return (getExtendEncoding(ET) << 1) | unsigned(DoShift); +} + +static inline uint64_t ror(uint64_t elt, unsigned size) { + return ((elt & 1) << (size-1)) | (elt >> 1); +} + +/// processLogicalImmediate - Determine if an immediate value can be encoded +/// as the immediate operand of a logical instruction for the given register +/// size. If so, return true with "encoding" set to the encoded value in +/// the form N:immr:imms. +static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize, + uint64_t &Encoding) { + if (Imm == 0ULL || Imm == ~0ULL || + (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U))) + return false; + + // First, determine the element size. + unsigned Size = RegSize; + + do { + Size /= 2; + uint64_t Mask = (1ULL << Size) - 1; + + if ((Imm & Mask) != ((Imm >> Size) & Mask)) { + Size *= 2; + break; + } + } while (Size > 2); + + // Second, determine the rotation to make the element be: 0^m 1^n. 
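+  // For example, with Imm = 0x0000FFFF and RegSize = 32 the element size
+  // found above is 32 and the pattern already has the form 0^16 1^16, so
+  // below I = 0 and CTO = 16, giving Immr = 0, imms = 0b001111 and N = 0,
+  // i.e. Encoding = 0x00F (the bitmask immediate of e.g. "and w0, w1, #0xffff").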
+ uint32_t CTO, I; + uint64_t Mask = ((uint64_t)-1LL) >> (64 - Size); + Imm &= Mask; + + if (isShiftedMask_64(Imm)) { + I = countTrailingZeros(Imm); + assert(I < 64 && "undefined behavior"); + CTO = countTrailingOnes(Imm >> I); + } else { + Imm |= ~Mask; + if (!isShiftedMask_64(~Imm)) + return false; + + unsigned CLO = countLeadingOnes(Imm); + I = 64 - CLO; + CTO = CLO + countTrailingOnes(Imm) - (64 - Size); + } + + // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n + // to our target value, where I is the number of RORs to go the opposite + // direction. + assert(Size > I && "I should be smaller than element size"); + unsigned Immr = (Size - I) & (Size - 1); + + // If size has a 1 in the n'th bit, create a value that has zeroes in + // bits [0, n] and ones above that. + uint64_t NImms = ~(Size-1) << 1; + + // Or the CTO value into the low bits, which must be below the Nth bit + // bit mentioned above. + NImms |= (CTO-1); + + // Extract the seventh bit and toggle it to create the N field. + unsigned N = ((NImms >> 6) & 1) ^ 1; + + Encoding = (N << 12) | (Immr << 6) | (NImms & 0x3f); + return true; +} + +/// isLogicalImmediate - Return true if the immediate is valid for a logical +/// immediate instruction of the given register size. Return false otherwise. +static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) { + uint64_t encoding; + return processLogicalImmediate(imm, regSize, encoding); +} + +/// encodeLogicalImmediate - Return the encoded immediate value for a logical +/// immediate instruction of the given register size. +static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) { + uint64_t encoding = 0; + bool res = processLogicalImmediate(imm, regSize, encoding); + assert(res && "invalid logical immediate"); + (void)res; + return encoding; +} + +/// decodeLogicalImmediate - Decode a logical immediate value in the form +/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the +/// integer value it represents with regSize bits. +static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) { + // Extract the N, imms, and immr fields. + unsigned N = (val >> 12) & 1; + unsigned immr = (val >> 6) & 0x3f; + unsigned imms = val & 0x3f; + + assert((regSize == 64 || N == 0) && "undefined logical immediate encoding"); + int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); + assert(len >= 0 && "undefined logical immediate encoding"); + unsigned size = (1 << len); + unsigned R = immr & (size - 1); + unsigned S = imms & (size - 1); + assert(S != size - 1 && "undefined logical immediate encoding"); + uint64_t pattern = (1ULL << (S + 1)) - 1; + for (unsigned i = 0; i < R; ++i) + pattern = ror(pattern, size); + + // Replicate the pattern to fill the regSize. + while (size != regSize) { + pattern |= (pattern << size); + size *= 2; + } + return pattern; +} + +/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value +/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits) +/// is a valid encoding for an integer value with regSize bits. +static inline bool isValidDecodeLogicalImmediate(uint64_t val, + unsigned regSize) { + // Extract the N and imms fields needed for checking. 
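+  // For example, {N = 0, imms = 0b111111} is rejected below (len becomes
+  // negative): the all-ones pattern is not an encodable logical immediate,
+  // matching the early bail-out in processLogicalImmediate above.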
+ unsigned N = (val >> 12) & 1; + unsigned imms = val & 0x3f; + + if (regSize == 32 && N != 0) // undefined logical immediate encoding + return false; + int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); + if (len < 0) // undefined logical immediate encoding + return false; + unsigned size = (1 << len); + unsigned S = imms & (size - 1); + if (S == size - 1) // undefined logical immediate encoding + return false; + + return true; +} + +//===----------------------------------------------------------------------===// +// Floating-point Immediates +// +static inline float getFPImmFloat(unsigned Imm) { + // We expect an 8-bit binary encoding of a floating-point number here. + union { + uint32_t I; + float F; + } FPUnion; + + uint8_t Sign = (Imm >> 7) & 0x1; + uint8_t Exp = (Imm >> 4) & 0x7; + uint8_t Mantissa = Imm & 0xf; + + // 8-bit FP iEEEE Float Encoding + // abcd efgh aBbbbbbc defgh000 00000000 00000000 + // + // where B = NOT(b); + + FPUnion.I = 0; + FPUnion.I |= Sign << 31; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; + FPUnion.I |= (Exp & 0x3) << 23; + FPUnion.I |= Mantissa << 19; + return FPUnion.F; +} + +/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP16Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(15).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15 + int32_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x3f) + return -1; + Mantissa >>= 6; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP16Imm(const APFloat &FPImm) { + return getFP16Imm(FPImm.bitcastToAPInt()); +} + +/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP32Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 + int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x7ffff) + return -1; + Mantissa >>= 19; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP32Imm(const APFloat &FPImm) { + return getFP32Imm(FPImm.bitcastToAPInt()); +} + +/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP64Imm(const APInt &Imm) { + uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; + int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 + uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. 
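+  // For example, 1.0 (Exp = 0, Mantissa = 0) encodes as 0x70 and 2.0 as 0x00;
+  // getFPImmFloat() above performs the reverse mapping.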
+ if (Mantissa & 0xffffffffffffULL) + return -1; + Mantissa >>= 48; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP64Imm(const APFloat &FPImm) { + return getFP64Imm(FPImm.bitcastToAPInt()); +} + +//===--------------------------------------------------------------------===// +// AdvSIMD Modified Immediates +//===--------------------------------------------------------------------===// + +// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh +static inline bool isAdvSIMDModImmType1(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffffff00ffffff00ULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 32) | EncVal; +} + +// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 +static inline bool isAdvSIMDModImmType2(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffff00ffffff00ffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 40) | (EncVal << 8); +} + +// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 +static inline bool isAdvSIMDModImmType3(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xff00ffffff00ffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) { + return (Imm & 0xff0000ULL) >> 16; +} + +static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 16); +} + +// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 +static inline bool isAdvSIMDModImmType4(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0x00ffffff00ffffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) { + return (Imm & 0xff000000ULL) >> 24; +} + +static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 56) | (EncVal << 24); +} + +// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh +static inline bool isAdvSIMDModImmType5(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) && + ((Imm & 0xff00ff00ff00ff00ULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal; +} + +// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 +static inline bool isAdvSIMDModImmType6(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) && + ((Imm & 0x00ff00ff00ff00ffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8); +} + +// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF +static inline bool isAdvSIMDModImmType7(uint64_t 
Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL); +} + +static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL; +} + +// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF +static inline bool isAdvSIMDModImmType8(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL; +} + +static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) { + return (Imm & 0x00ff0000ULL) >> 16; +} + +// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh +static inline bool isAdvSIMDModImmType9(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm >> 48) == (Imm & 0x0000ffffULL)) && + ((Imm >> 56) == (Imm & 0x000000ffULL)); +} + +static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) { + uint64_t EncVal = Imm; + EncVal |= (EncVal << 8); + EncVal |= (EncVal << 16); + EncVal |= (EncVal << 32); + return EncVal; +} + +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// cmode: 1110, op: 1 +static inline bool isAdvSIMDModImmType10(uint64_t Imm) { + uint64_t ByteA = Imm & 0xff00000000000000ULL; + uint64_t ByteB = Imm & 0x00ff000000000000ULL; + uint64_t ByteC = Imm & 0x0000ff0000000000ULL; + uint64_t ByteD = Imm & 0x000000ff00000000ULL; + uint64_t ByteE = Imm & 0x00000000ff000000ULL; + uint64_t ByteF = Imm & 0x0000000000ff0000ULL; + uint64_t ByteG = Imm & 0x000000000000ff00ULL; + uint64_t ByteH = Imm & 0x00000000000000ffULL; + + return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) && + (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) && + (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) && + (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) && + (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) && + (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) && + (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) && + (ByteH == 0ULL || ByteH == 0x00000000000000ffULL); +} + +static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) { + uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0; + uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0; + uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0; + uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0; + uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0; + uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0; + uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0; + uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0xff00000000000000ULL; + if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL; + if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL; + if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL; + if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL; + 
if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL; + if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL; + if (Imm & 0x01) EncVal |= 0x00000000000000ffULL; + return EncVal; +} + +// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00 +static inline bool isAdvSIMDModImmType11(uint64_t Imm) { + uint64_t BString = (Imm & 0x7E000000ULL) >> 25; + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (BString == 0x1f || BString == 0x20) && + ((Imm & 0x0007ffff0007ffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) { + uint8_t BitA = (Imm & 0x80000000ULL) != 0; + uint8_t BitB = (Imm & 0x20000000ULL) != 0; + uint8_t BitC = (Imm & 0x01000000ULL) != 0; + uint8_t BitD = (Imm & 0x00800000ULL) != 0; + uint8_t BitE = (Imm & 0x00400000ULL) != 0; + uint8_t BitF = (Imm & 0x00200000ULL) != 0; + uint8_t BitG = (Imm & 0x00100000ULL) != 0; + uint8_t BitH = (Imm & 0x00080000ULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0x80000000ULL; + if (Imm & 0x40) EncVal |= 0x3e000000ULL; + else EncVal |= 0x40000000ULL; + if (Imm & 0x20) EncVal |= 0x01000000ULL; + if (Imm & 0x10) EncVal |= 0x00800000ULL; + if (Imm & 0x08) EncVal |= 0x00400000ULL; + if (Imm & 0x04) EncVal |= 0x00200000ULL; + if (Imm & 0x02) EncVal |= 0x00100000ULL; + if (Imm & 0x01) EncVal |= 0x00080000ULL; + return (EncVal << 32) | EncVal; +} + +// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00 +static inline bool isAdvSIMDModImmType12(uint64_t Imm) { + uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54; + return ((BString == 0xff || BString == 0x100) && + ((Imm & 0x0000ffffffffffffULL) == 0)); +} + +static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) { + uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0; + uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0; + uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0; + uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0; + uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0; + uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0; + uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0; + uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0x8000000000000000ULL; + if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL; + else EncVal |= 0x4000000000000000ULL; + if (Imm & 0x20) EncVal |= 0x0020000000000000ULL; + if (Imm & 0x10) EncVal |= 0x0010000000000000ULL; + if (Imm & 0x08) EncVal |= 0x0008000000000000ULL; + if (Imm & 0x04) EncVal |= 0x0004000000000000ULL; + if (Imm & 0x02) EncVal |= 0x0002000000000000ULL; + if (Imm & 0x01) EncVal |= 0x0001000000000000ULL; + return (EncVal << 32) | EncVal; +} + +} // end namespace AArch64_AM + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp new file mode 100644 index 
0000000..7624c72 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -0,0 +1,543 @@ +//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +using namespace llvm; + +namespace { + +class AArch64AsmBackend : public MCAsmBackend { + static const unsigned PCRelFlagVal = + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; + +public: + AArch64AsmBackend(const Target &T) : MCAsmBackend() {} + + unsigned getNumFixupKinds() const override { + return AArch64::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { + const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { + // This table *must* be in the order that the fixup_* kinds are defined in + // AArch64FixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_add_imm12", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, + { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_movw", 5, 16, 0 }, + { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + + bool mayNeedRelaxation(const MCInst &Inst) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override; + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + void HandleAssemblerFlag(MCAssemblerFlag Flag) {} + + unsigned getPointerSize() const { return 8; } +}; + +} // end anonymous namespace + +/// \brief The number of bytes the fixup may change. 
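+/// For example, fixup_aarch64_ldst_imm12_scale4 occupies 12 bits starting at
+/// bit 10 (see the table above), so the shifted value fits in the 3 bytes
+/// reported here, while the branch26/call26 fixups start at bit 0 and need 4.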
+static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + + case AArch64::fixup_aarch64_tlsdesc_call: + return 0; + + case FK_Data_1: + return 1; + + case FK_Data_2: + case AArch64::fixup_aarch64_movw: + return 2; + + case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case AArch64::fixup_aarch64_pcrel_branch19: + return 3; + + case AArch64::fixup_aarch64_pcrel_adr_imm21: + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + case FK_Data_4: + return 4; + + case FK_Data_8: + return 8; + } +} + +static unsigned AdrImmBits(unsigned Value) { + unsigned lo2 = Value & 0x3; + unsigned hi19 = (Value & 0x1ffffc) >> 2; + return (hi19 << 5) | (lo2 << 29); +} + +static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { + int64_t SignedValue = static_cast<int64_t>(Value); + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case AArch64::fixup_aarch64_pcrel_adr_imm21: + if (SignedValue > 2097151 || SignedValue < -2097152) + report_fatal_error("fixup value out of range"); + return AdrImmBits(Value & 0x1fffffULL); + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + return AdrImmBits((Value & 0x1fffff000ULL) >> 12); + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case AArch64::fixup_aarch64_pcrel_branch19: + // Signed 21-bit immediate + if (SignedValue > 2097151 || SignedValue < -2097152) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded. + return (Value >> 2) & 0x7ffff; + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + // Unsigned 12-bit immediate + if (Value >= 0x1000) + report_fatal_error("invalid imm12 fixup value"); + return Value; + case AArch64::fixup_aarch64_ldst_imm12_scale2: + // Unsigned 12-bit immediate which gets multiplied by 2 + if (Value & 1 || Value >= 0x2000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 1; + case AArch64::fixup_aarch64_ldst_imm12_scale4: + // Unsigned 12-bit immediate which gets multiplied by 4 + if (Value & 3 || Value >= 0x4000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 2; + case AArch64::fixup_aarch64_ldst_imm12_scale8: + // Unsigned 12-bit immediate which gets multiplied by 8 + if (Value & 7 || Value >= 0x8000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 3; + case AArch64::fixup_aarch64_ldst_imm12_scale16: + // Unsigned 12-bit immediate which gets multiplied by 16 + if (Value & 15 || Value >= 0x10000) + report_fatal_error("invalid imm12 fixup value"); + return Value >> 4; + case AArch64::fixup_aarch64_movw: + report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); + return Value; + case AArch64::fixup_aarch64_pcrel_branch14: + // Signed 16-bit immediate + if (SignedValue > 32767 || SignedValue < -32768) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). 
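+    // For example, a TBZ/TBNZ target 0x1000 bytes ahead yields
+    // (0x1000 >> 2) & 0x3fff = 0x400 for the 14-bit field.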
+ if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3fff; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + // Signed 28-bit immediate + if (SignedValue > 134217727 || SignedValue < -134217728) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). + if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3ffffff; + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: + return Value; + } +} + +void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + if (!Value) + return; // Doesn't change encoding. + MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + // Apply any target-specific value adjustments. + Value = adjustFixupValue(Fixup.getKind(), Value); + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { + return false; +} + +bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + // FIXME: This isn't correct for AArch64. Just moving the "generic" logic + // into the targets for now. + // + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + +void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, + MCInst &Res) const { + llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented"); +} + +bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + // If the count is not 4-byte aligned, we must be writing data into the text + // section (otherwise we have unaligned instructions, and thus have far + // bigger problems), so just write zeros instead. + OW->WriteZeros(Count % 4); + + // We are properly aligned, so write NOPs as requested. + Count /= 4; + for (uint64_t i = 0; i != Count; ++i) + OW->write32(0xd503201f); + return true; +} + +namespace { + +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + /// \brief A "frameless" leaf function, where no non-volatile registers are + /// saved. The return remains in LR throughout the function. + UNWIND_AArch64_MODE_FRAMELESS = 0x02000000, + + /// \brief No compact unwind encoding available. Instead the low 23-bits of + /// the compact unwind encoding is the offset of the DWARF FDE in the + /// __eh_frame section. This mode is never used in object files. It is only + /// generated by the linker in final linked images, which have only DWARF info + /// for a function. + UNWIND_AArch64_MODE_DWARF = 0x03000000, + + /// \brief This is a standard arm64 prologue where FP/LR are immediately + /// pushed on the stack, then SP is copied to FP. If there are any + /// non-volatile register saved, they are copied into the stack fame in pairs + /// in a contiguous ranger right below the saved FP/LR pair. 
Any subset of the + /// five X pairs and four D pairs can be saved, but the memory layout must be + /// in register number order. + UNWIND_AArch64_MODE_FRAME = 0x04000000, + + /// \brief Frame register pair encodings. + UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001, + UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002, + UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004, + UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008, + UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010, + UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100, + UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200, + UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400, + UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 +}; + +} // end CU namespace + +// FIXME: This should be in a separate file. +class DarwinAArch64AsmBackend : public AArch64AsmBackend { + const MCRegisterInfo &MRI; + + /// \brief Encode compact unwind stack adjustment for frameless functions. + /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. + /// The stack size always needs to be 16 byte aligned. + uint32_t encodeStackAdjustment(uint32_t StackSize) const { + return (StackSize / 16) << 12; + } + +public: + DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) + : AArch64AsmBackend(T), MRI(MRI) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, + MachO::CPU_SUBTYPE_ARM64_ALL); + } + + /// \brief Generate the compact unwind encoding from the CFI directives. + uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override { + if (Instrs.empty()) + return CU::UNWIND_AArch64_MODE_FRAMELESS; + + bool HasFP = false; + unsigned StackSize = 0; + + uint32_t CompactUnwindEncoding = 0; + for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + const MCCFIInstruction &Inst = Instrs[i]; + + switch (Inst.getOperation()) { + default: + // Cannot handle this directive: bail out. + return CU::UNWIND_AArch64_MODE_DWARF; + case MCCFIInstruction::OpDefCfa: { + // Defines a frame pointer. + assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == + AArch64::FP && + "Invalid frame pointer!"); + assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); + + const MCCFIInstruction &LRPush = Instrs[++i]; + assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && + "Link register not pushed!"); + const MCCFIInstruction &FPPush = Instrs[++i]; + assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && + "Frame pointer not pushed!"); + + unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); + unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); + + LRReg = getXRegFromWReg(LRReg); + FPReg = getXRegFromWReg(FPReg); + + assert(LRReg == AArch64::LR && FPReg == AArch64::FP && + "Pushing invalid registers for frame!"); + + // Indicate that the function has a frame. + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME; + HasFP = true; + break; + } + case MCCFIInstruction::OpDefCfaOffset: { + assert(StackSize == 0 && "We already have the CFA offset!"); + StackSize = std::abs(Inst.getOffset()); + break; + } + case MCCFIInstruction::OpOffset: { + // Registers are saved in pairs. We expect there to be two consecutive + // `.cfi_offset' instructions with the appropriate registers specified. 
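+        // For example, a frame that also saves x19/x20 and d8/d9 ends up with
+        // UNWIND_AArch64_MODE_FRAME | FRAME_X19_X20_PAIR | FRAME_D8_D9_PAIR
+        // (0x04000101) once both pairs have been matched below.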
+ unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (i + 1 == e) + return CU::UNWIND_AArch64_MODE_DWARF; + + const MCCFIInstruction &Inst2 = Instrs[++i]; + if (Inst2.getOperation() != MCCFIInstruction::OpOffset) + return CU::UNWIND_AArch64_MODE_DWARF; + unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); + + // N.B. The encodings must be in register number order, and the X + // registers before the D registers. + + // X19/X20 pair = 0x00000001, + // X21/X22 pair = 0x00000002, + // X23/X24 pair = 0x00000004, + // X25/X26 pair = 0x00000008, + // X27/X28 pair = 0x00000010 + Reg1 = getXRegFromWReg(Reg1); + Reg2 = getXRegFromWReg(Reg2); + + if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 && + (CompactUnwindEncoding & 0xF1E) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR; + else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 && + (CompactUnwindEncoding & 0xF1C) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR; + else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 && + (CompactUnwindEncoding & 0xF18) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR; + else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 && + (CompactUnwindEncoding & 0xF10) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR; + else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 && + (CompactUnwindEncoding & 0xF00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR; + else { + Reg1 = getDRegFromBReg(Reg1); + Reg2 = getDRegFromBReg(Reg2); + + // D8/D9 pair = 0x00000100, + // D10/D11 pair = 0x00000200, + // D12/D13 pair = 0x00000400, + // D14/D15 pair = 0x00000800 + if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 && + (CompactUnwindEncoding & 0xE00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR; + else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 && + (CompactUnwindEncoding & 0xC00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR; + else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 && + (CompactUnwindEncoding & 0x800) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR; + else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR; + else + // A pair was pushed which we cannot handle. + return CU::UNWIND_AArch64_MODE_DWARF; + } + + break; + } + } + } + + if (!HasFP) { + // With compact unwind info we can only represent stack adjustments of up + // to 65520 bytes. 
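+    // (65520 = 0xfff * 16: encodeStackAdjustment() stores StackSize / 16 in a
+    // 12-bit field, so larger frames have to fall back to DWARF.)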
+ if (StackSize > 65520) + return CU::UNWIND_AArch64_MODE_DWARF; + + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS; + CompactUnwindEncoding |= encodeStackAdjustment(StackSize); + } + + return CompactUnwindEncoding; + } +}; + +} // end anonymous namespace + +namespace { + +class ELFAArch64AsmBackend : public AArch64AsmBackend { +public: + uint8_t OSABI; + bool IsLittleEndian; + + ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) + : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian); + } + + void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override; + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; +}; + +void ELFAArch64AsmBackend::processFixupValue( + const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, + const MCFragment *DF, const MCValue &Target, uint64_t &Value, + bool &IsResolved) { + // The ADRP instruction adds some multiple of 0x1000 to the current PC & + // ~0xfff. This means that the required offset to reach a symbol can vary by + // up to one step depending on where the ADRP is in memory. For example: + // + // ADRP x0, there + // there: + // + // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and + // we'll need that as an offset. At any other address "there" will be in the + // same page as the ADRP and the instruction should encode 0x0. Assuming the + // section isn't 0x1000-aligned, we therefore need to delegate this decision + // to the linker -- a relocation! + if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) + IsResolved = false; +} + +// Returns whether this fixup is based on an address in the .eh_frame section, +// and therefore should be byte swapped. +// FIXME: Should be replaced with something more principled. 
+static bool isByteSwappedFixup(const MCExpr *E) { + MCValue Val; + if (!E->evaluateAsRelocatable(Val, nullptr, nullptr)) + return false; + + if (!Val.getSymA() || Val.getSymA()->getSymbol().isUndefined()) + return false; + + const MCSectionELF *SecELF = + dyn_cast<MCSectionELF>(&Val.getSymA()->getSymbol().getSection()); + return SecELF->getSectionName() == ".eh_frame"; +} + +void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + // store fixups in .eh_frame section in big endian order + if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { + if (isByteSwappedFixup(Fixup.getValue())) + Value = ByteSwap_32(unsigned(Value)); + } + AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); +} +} + +MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TheTriple, + StringRef CPU) { + if (TheTriple.isOSBinFormatMachO()) + return new DarwinAArch64AsmBackend(T, MRI); + + assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true); +} + +MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TheTriple, + StringRef CPU) { + assert(TheTriple.isOSBinFormatELF() && + "Big endian is only supported for ELF targets!"); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ELFAArch64AsmBackend(T, OSABI, + /*IsLittleEndian=*/false); +} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp new file mode 100644 index 0000000..1f516d1 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -0,0 +1,257 @@ +//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file handles ELF-specific object emission, converting LLVM's internal +// fixups into the appropriate relocations. 
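+// For example, fixup_aarch64_pcrel_call26 becomes R_AARCH64_CALL26, and an
+// ADRP fixup carrying a :got: modifier becomes R_AARCH64_ADR_GOT_PAGE.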
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { +class AArch64ELFObjectWriter : public MCELFObjectTargetWriter { +public: + AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian); + + ~AArch64ELFObjectWriter() override; + +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; + +private: +}; +} + +AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, + bool IsLittleEndian) + : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, + /*HasRelocationAddend*/ true) {} + +AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} + +unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + AArch64MCExpr::VariantKind RefKind = + static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind()); + AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); + bool IsNC = AArch64MCExpr::isNotChecked(RefKind); + + assert((!Target.getSymA() || + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && + "Should only be expression-level modifiers here"); + + assert((!Target.getSymB() || + Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) && + "Should only be expression-level modifiers here"); + + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + case FK_Data_2: + return ELF::R_AARCH64_PREL16; + case FK_Data_4: + return ELF::R_AARCH64_PREL32; + case FK_Data_8: + return ELF::R_AARCH64_PREL64; + case AArch64::fixup_aarch64_pcrel_adr_imm21: + assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation"); + return ELF::R_AARCH64_ADR_PREL_LO21; + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC) + return ELF::R_AARCH64_ADR_PREL_PG_HI21; + if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) + return ELF::R_AARCH64_ADR_GOT_PAGE; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) + return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) + return ELF::R_AARCH64_TLSDESC_ADR_PAGE21; + llvm_unreachable("invalid symbol kind for ADRP relocation"); + case AArch64::fixup_aarch64_pcrel_branch26: + return ELF::R_AARCH64_JUMP26; + case AArch64::fixup_aarch64_pcrel_call26: + return ELF::R_AARCH64_CALL26; + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + if (SymLoc == AArch64MCExpr::VK_GOTTPREL) + return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; + return ELF::R_AARCH64_LD_PREL_LO19; + case AArch64::fixup_aarch64_pcrel_branch14: + return ELF::R_AARCH64_TSTBR14; + case AArch64::fixup_aarch64_pcrel_branch19: + return ELF::R_AARCH64_CONDBR19; + default: + llvm_unreachable("Unsupported pc-relative fixup kind"); + } + } else { + switch ((unsigned)Fixup.getKind()) { + case FK_Data_2: + return ELF::R_AARCH64_ABS16; + case FK_Data_4: + return ELF::R_AARCH64_ABS32; + case FK_Data_8: + return ELF::R_AARCH64_ABS64; + case AArch64::fixup_aarch64_add_imm12: + if (RefKind == AArch64MCExpr::VK_DTPREL_HI12) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; + if (RefKind == AArch64MCExpr::VK_TPREL_HI12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; + if (RefKind == 
AArch64MCExpr::VK_DTPREL_LO12) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12) + return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_ADD_ABS_LO12_NC; + + report_fatal_error("invalid fixup for add (uimm12) instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale1: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST8_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 8-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale2: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST16_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 16-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale4: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST32_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 32-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale8: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST64_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) + return ELF::R_AARCH64_LD64_GOT_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) + return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) + return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; + + report_fatal_error("invalid fixup for 64-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale16: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST128_ABS_LO12_NC; + + report_fatal_error("invalid fixup for 128-bit load/store 
instruction"); + return 0; + case AArch64::fixup_aarch64_movw: + if (RefKind == AArch64MCExpr::VK_ABS_G3) + return ELF::R_AARCH64_MOVW_UABS_G3; + if (RefKind == AArch64MCExpr::VK_ABS_G2) + return ELF::R_AARCH64_MOVW_UABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_S) + return ELF::R_AARCH64_MOVW_SABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_NC) + return ELF::R_AARCH64_MOVW_UABS_G2_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G1) + return ELF::R_AARCH64_MOVW_UABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_S) + return ELF::R_AARCH64_MOVW_SABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_NC) + return ELF::R_AARCH64_MOVW_UABS_G1_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G0) + return ELF::R_AARCH64_MOVW_UABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_S) + return ELF::R_AARCH64_MOVW_SABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_NC) + return ELF::R_AARCH64_MOVW_UABS_G0_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G2) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G2) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; + if (RefKind == AArch64MCExpr::VK_TPREL_G1) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; + if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G0) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; + if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; + report_fatal_error("invalid fixup for movz/movk instruction"); + return 0; + case AArch64::fixup_aarch64_tlsdesc_call: + return ELF::R_AARCH64_TLSDESC_CALL; + default: + llvm_unreachable("Unknown ELF relocation type"); + } + } + + llvm_unreachable("Unimplemented fixup -> relocation"); +} + +MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_pwrite_stream &OS, + uint8_t OSABI, + bool IsLittleEndian) { + MCELFObjectTargetWriter *MOTW = + new AArch64ELFObjectWriter(OSABI, IsLittleEndian); + return createELFObjectWriter(MOTW, OS, IsLittleEndian); +} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp new file mode 100644 index 0000000..d26604f --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -0,0 +1,205 @@ +//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file assembles .s files and emits AArch64 ELF .o object files. Different +// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit +// regions of data and code. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetStreamer.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class AArch64ELFStreamer; + +class AArch64TargetAsmStreamer : public AArch64TargetStreamer { + formatted_raw_ostream &OS; + + void emitInst(uint32_t Inst) override; + +public: + AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); +}; + +AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &OS) + : AArch64TargetStreamer(S), OS(OS) {} + +void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) { + OS << "\t.inst\t0x" << Twine::utohexstr(Inst) << "\n"; +} + +class AArch64TargetELFStreamer : public AArch64TargetStreamer { +private: + AArch64ELFStreamer &getStreamer(); + + void emitInst(uint32_t Inst) override; + +public: + AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {} +}; + +/// Extend the generic ELFStreamer class so that it can emit mapping symbols at +/// the appropriate points in the object files. These symbols are defined in the +/// AArch64 ELF ABI: +/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf +/// +/// In brief: $x or $d should be emitted at the start of each contiguous region +/// of A64 code or data in a section. In practice, this emission does not rely +/// on explicit assembler directives but on inherent properties of the +/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an +/// instruction). +/// +/// As a result this system is orthogonal to the DataRegion infrastructure used +/// by MachO. Beware! +class AArch64ELFStreamer : public MCELFStreamer { +public: + friend class AArch64TargetELFStreamer; + + AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_pwrite_stream &OS, MCCodeEmitter *Emitter) + : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), + LastEMS(EMS_None) {} + + void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { + // We have to keep track of the mapping symbol state of any sections we + // use. Each one should start off as EMS_None, which is provided as the + // default constructor by DenseMap::lookup. + LastMappingSymbols[getPreviousSection().first] = LastEMS; + LastEMS = LastMappingSymbols.lookup(Section); + + MCELFStreamer::ChangeSection(Section, Subsection); + } + + /// This function is the one used to emit instruction data into the ELF + /// streamer. We override it to add the appropriate mapping symbol if + /// necessary. 
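+  /// For example, emitting an instruction right after a run of data directives
+  /// first emits a "$x.<n>" mapping symbol; EmitBytes/EmitValueImpl below do
+  /// the same with "$d.<n>" when data follows code.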
+ void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override { + EmitA64MappingSymbol(); + MCELFStreamer::EmitInstruction(Inst, STI); + } + + void emitInst(uint32_t Inst) { + EmitA64MappingSymbol(); + MCELFStreamer::EmitIntValue(Inst, 4); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) + /// if necessary. + void EmitBytes(StringRef Data) override { + EmitDataMappingSymbol(); + MCELFStreamer::EmitBytes(Data); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) + /// if necessary. + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { + EmitDataMappingSymbol(); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); + } + +private: + enum ElfMappingSymbol { + EMS_None, + EMS_A64, + EMS_Data + }; + + void EmitDataMappingSymbol() { + if (LastEMS == EMS_Data) + return; + EmitMappingSymbol("$d"); + LastEMS = EMS_Data; + } + + void EmitA64MappingSymbol() { + if (LastEMS == EMS_A64) + return; + EmitMappingSymbol("$x"); + LastEMS = EMS_A64; + } + + void EmitMappingSymbol(StringRef Name) { + auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol( + Name + "." + Twine(MappingSymbolCounter++))); + EmitLabel(Symbol); + Symbol->setType(ELF::STT_NOTYPE); + Symbol->setBinding(ELF::STB_LOCAL); + Symbol->setExternal(false); + } + + int64_t MappingSymbolCounter; + + DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols; + ElfMappingSymbol LastEMS; +}; +} // end anonymous namespace + +AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() { + return static_cast<AArch64ELFStreamer &>(Streamer); +} + +void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { + getStreamer().emitInst(Inst); +} + +namespace llvm { +MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm) { + return new AArch64TargetAsmStreamer(S, OS); +} + +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + return S; +} + +MCTargetStreamer * +createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { + const Triple &TT = STI.getTargetTriple(); + if (TT.isOSBinFormatELF()) + return new AArch64TargetELFStreamer(S); + return nullptr; +} +} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h new file mode 100644 index 0000000..ef48203 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -0,0 +1,26 @@ +//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF streamer information for the AArch64 backend. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { + +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll); +} + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h new file mode 100644 index 0000000..0f5b765 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -0,0 +1,76 @@ +//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AArch64 { + +enum Fixups { + // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into + // an ADR instruction. + fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind, + + // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into + // an ADRP instruction. + fixup_aarch64_pcrel_adrp_imm21, + + // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions. + // No alignment adjustment. All value bits are encoded. + fixup_aarch64_add_imm12, + + // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and + // store instructions. + fixup_aarch64_ldst_imm12_scale1, + fixup_aarch64_ldst_imm12_scale2, + fixup_aarch64_ldst_imm12_scale4, + fixup_aarch64_ldst_imm12_scale8, + fixup_aarch64_ldst_imm12_scale16, + + // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative + // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by + // pc-relative loads and generates relocations directly when necessary. + fixup_aarch64_ldr_pcrel_imm19, + + // FIXME: comment + fixup_aarch64_movw, + + // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative + // immediate. + fixup_aarch64_pcrel_branch14, + + // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative + // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by + // b.cc and generates relocations directly when necessary. + fixup_aarch64_pcrel_branch19, + + // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative + // immediate. + fixup_aarch64_pcrel_branch26, + + // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative + // immediate. Distinguished from branch26 only on ELF. + fixup_aarch64_pcrel_call26, + + // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF + // R_AARCH64_TLSDESC_CALL relocation. 
+ fixup_aarch64_tlsdesc_call, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; + +} // end namespace AArch64 +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp new file mode 100644 index 0000000..fbce26e --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -0,0 +1,101 @@ +//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the AArch64MCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +enum AsmWriterVariantTy { + Default = -1, + Generic = 0, + Apple = 1 +}; + +static cl::opt<AsmWriterVariantTy> AsmWriterVariant( + "aarch64-neon-syntax", cl::init(Default), + cl::desc("Choose style of NEON code to emit from AArch64 backend:"), + cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), + clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"), + clEnumValEnd)); + +AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant; + + PrivateGlobalPrefix = "L"; + PrivateLabelPrefix = "L"; + SeparatorString = "%%"; + CommentString = ";"; + PointerSize = CalleeSaveStackSlotSize = 8; + + AlignmentIsInBytes = false; + UsesELFSectionDirectiveForBSS = true; + SupportsDebugInformation = true; + UseDataRegionDirectives = true; + + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( + const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. The default implementation + // won't reference using the GOT, so we need this target-specific + // version. + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Context); + MCSymbol *PCSym = Context.createTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context); + return MCBinaryExpr::createSub(Res, PC, Context); +} + +AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { + if (T.getArch() == Triple::aarch64_be) + IsLittleEndian = false; + + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + + PointerSize = 8; + + // ".comm align is in bytes but .align is pow-2." 
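+  // In other words: with AlignmentIsInBytes = false the streamer prints
+  // alignment requests as exponents, so ".align 3" asks for 2^3 = 8-byte
+  // alignment, while .comm still takes its alignment operand in bytes.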
+ AlignmentIsInBytes = false; + + CommentString = "//"; + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; + Code32Directive = ".code\t32"; + + Data16bitsDirective = "\t.hword\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = "\t.xword\t"; + + UseDataRegionDirectives = false; + + WeakRefDirective = "\t.weak\t"; + + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + UseIntegratedAssembler = true; + + HasIdentDirective = true; +} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h new file mode 100644 index 0000000..253cd30 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -0,0 +1,38 @@ +//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the AArch64MCAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H + +#include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCAsmInfoELF.h" + +namespace llvm { +class MCStreamer; +class Target; +class Triple; + +struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { + explicit AArch64MCAsmInfoDarwin(); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +struct AArch64MCAsmInfoELF : public MCAsmInfoELF { + explicit AArch64MCAsmInfoELF(const Triple &T); +}; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp new file mode 100644 index 0000000..7d8e79b --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -0,0 +1,637 @@ +//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64MCCodeEmitter class. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + +namespace { + +class AArch64MCCodeEmitter : public MCCodeEmitter { + MCContext &Ctx; + + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT +public: + AArch64MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : Ctx(ctx) {} + + ~AArch64MCCodeEmitter() override {} + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate + /// attached to a load, store or prfm instruction. If operand requires a + /// relocation, record it and return zero in that part of the encoding. + template <uint32_t FixupKind> + uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label + /// target. + uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and + /// the 2-bit shift field. + uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getCondBranchTargetOpValue - Return the encoded value for a conditional + /// branch target. + uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getLoadLiteralOpValue - Return the encoded value for a load-literal + /// pc-relative address. + uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store + /// instruction: bit 0 is whether a shift is present, bit 1 is whether the + /// operation is a sign extend (as opposed to a zero extend). + uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- + /// branch target. 
+ uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getBranchTargetOpValue - Return the encoded value for an unconditional + /// branch target. + uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMoveWideImmOpValue - Return the encoded value for the immediate operand + /// of a MOVZ or MOVK instruction. + uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getVecShifterOpValue - Return the encoded value for the vector shifter. + uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMoveVecShifterOpValue - Return the encoded value for the vector move + /// shifter (MSL). + uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getFixedPointScaleOpValue - Return the encoded value for the + // FP-to-fixed-point scale factor. + uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getSIMDShift64OpValue - Return the encoded value for the + // shift-by-immediate AdvSIMD instructions. 
+ uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + template<int hasRs, int hasRt2> unsigned + fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(MCII, Ctx); +} + +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. +unsigned +AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); + + assert(MO.isImm() && "did not expect relocated expression"); + return static_cast<unsigned>(MO.getImm()); +} + +template<unsigned FixupKind> uint32_t +AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + uint32_t ImmVal = 0; + + if (MO.isImm()) + ImmVal = static_cast<uint32_t>(MO.getImm()); + else { + assert(MO.isExpr() && "unable to encode load/store imm operand"); + MCFixupKind Kind = MCFixupKind(FixupKind); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc())); + ++MCNumFixups; + } + + return ImmVal; +} + +/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label +/// target. +uint32_t +AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); + const MCExpr *Expr = MO.getExpr(); + + MCFixupKind Kind = MI.getOpcode() == AArch64::ADR + ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21) + : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + + MCNumFixups += 1; + + // All of the information is in the fixup. + return 0; +} + +/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and +/// the 2-bit shift field. The shift field is stored in bits 13-14 of the +/// return value. 
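+//
+// Worked example (a sketch of the common case): for "add x0, x1, #0x123,
+// lsl #12" the two suboperands are [0x123, LSL #12], so the value returned
+// below is 0x123 | (1 << 12) = 0x1123; with no shift the immediate comes back
+// unchanged, and a symbolic operand instead records a fixup_aarch64_add_imm12
+// fixup and returns 0.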
+uint32_t +AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Suboperands are [imm, shifter]. + const MCOperand &MO = MI.getOperand(OpIdx); + const MCOperand &MO1 = MI.getOperand(OpIdx + 1); + assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL && + "unexpected shift type for add/sub immediate"); + unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm()); + assert((ShiftVal == 0 || ShiftVal == 12) && + "unexpected shift value for add/sub immediate"); + if (MO.isImm()) + return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12)); + assert(MO.isExpr() && "Unable to encode MCOperand!"); + const MCExpr *Expr = MO.getExpr(); + + // Encode the 12 bits of the fixup. + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12); + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + + ++MCNumFixups; + + return 0; +} + +/// getCondBranchTargetOpValue - Return the encoded value for a conditional +/// branch target. +uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + +/// getLoadLiteralOpValue - Return the encoded value for a load-literal +/// pc-relative address. +uint32_t +AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + +uint32_t +AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + unsigned SignExtend = MI.getOperand(OpIdx).getImm(); + unsigned DoShift = MI.getOperand(OpIdx + 1).getImm(); + return (SignExtend << 1) | DoShift; +} + +uint32_t +AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected movz/movk immediate"); + + Fixups.push_back(MCFixup::create( + 0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc())); + + ++MCNumFixups; + + return 0; +} + +/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- +/// branch target. +uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. 
+ if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + +/// getBranchTargetOpValue - Return the encoded value for an unconditional +/// branch target. +uint32_t +AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); + + MCFixupKind Kind = MI.getOpcode() == AArch64::BL + ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26) + : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + +/// getVecShifterOpValue - Return the encoded value for the vector shifter: +/// +/// 00 -> 0 +/// 01 -> 8 +/// 10 -> 16 +/// 11 -> 24 +uint32_t +AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + + switch (MO.getImm()) { + default: + break; + case 0: + return 0; + case 8: + return 1; + case 16: + return 2; + case 24: + return 3; + } + + llvm_unreachable("Invalid value for vector shift amount!"); +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm()); +} + +uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm() | 32); +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 32 - (MO.getImm() | 16); +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 16 - (MO.getImm() | 8); +} + +/// getFixedPointScaleOpValue - Return the encoded value for the +// FP-to-fixed-point scale factor. 
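+//
+// For example (illustrative only): "fcvtzs x0, d0, #3" asks for 3 fractional
+// bits and the scale field is encoded as 64 - 3 = 61; the vector shift-right
+// helpers below follow the same "element width minus immediate" pattern.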
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 32 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 16 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 8 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 64; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 32; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 16; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 8; +} + +/// getMoveVecShifterOpValue - Return the encoded value for the vector move +/// shifter (MSL). +uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && + "Expected an immediate value for the move shift amount!"); + unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm()); + assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); + return ShiftVal == 8 ? 
0 : 1; +} + +unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + // If one of the signed fixup kinds is applied to a MOVZ instruction, the + // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's + // job to ensure that any bits possibly affected by this are 0. This means we + // must zero out bit 30 (essentially emitting a MOVN). + MCOperand UImm16MO = MI.getOperand(1); + + // Nothing to do if there's no fixup. + if (UImm16MO.isImm()) + return EncodedValue; + + const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr()); + switch (A64E->getKind()) { + case AArch64MCExpr::VK_DTPREL_G2: + case AArch64MCExpr::VK_DTPREL_G1: + case AArch64MCExpr::VK_DTPREL_G0: + case AArch64MCExpr::VK_GOTTPREL_G1: + case AArch64MCExpr::VK_TPREL_G2: + case AArch64MCExpr::VK_TPREL_G1: + case AArch64MCExpr::VK_TPREL_G0: + return EncodedValue & ~(1u << 30); + default: + // Nothing to do for an unsigned fixup. + return EncodedValue; + } + + + return EncodedValue & ~(1u << 30); +} + +void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MI.getOpcode() == AArch64::TLSDESCCALL) { + // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the + // following (BLR) instruction. It doesn't emit any code itself so it + // doesn't go through the normal TableGenerated channels. + MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); + Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup)); + return; + } + + uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); + support::endian::Writer<support::little>(OS).write<uint32_t>(Binary); + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +unsigned +AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + EncodedValue |= 0x1f << 10; + return EncodedValue; +} + +template<int hasRs, int hasRt2> unsigned +AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + if (!hasRs) EncodedValue |= 0x001F0000; + if (!hasRt2) EncodedValue |= 0x00007C00; + + return EncodedValue; +} + +unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( + const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const { + // The Rm field of FCMP and friends is unused - it should be assembled + // as 0, but is ignored by the processor. + EncodedValue &= ~(0x1f << 16); + return EncodedValue; +} + +#include "AArch64GenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp new file mode 100644 index 0000000..a540f49 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -0,0 +1,145 @@ +//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the assembly expression modifiers +// accepted by the AArch64 architecture (e.g. 
":lo12:", ":gottprel_g1:", ...). +// +//===----------------------------------------------------------------------===// + +#include "AArch64MCExpr.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Object/ELF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64symbolrefexpr" + +const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx) { + return new (Ctx) AArch64MCExpr(Expr, Kind); +} + +StringRef AArch64MCExpr::getVariantKindName() const { + switch (static_cast<uint32_t>(getKind())) { + case VK_CALL: return ""; + case VK_LO12: return ":lo12:"; + case VK_ABS_G3: return ":abs_g3:"; + case VK_ABS_G2: return ":abs_g2:"; + case VK_ABS_G2_S: return ":abs_g2_s:"; + case VK_ABS_G2_NC: return ":abs_g2_nc:"; + case VK_ABS_G1: return ":abs_g1:"; + case VK_ABS_G1_S: return ":abs_g1_s:"; + case VK_ABS_G1_NC: return ":abs_g1_nc:"; + case VK_ABS_G0: return ":abs_g0:"; + case VK_ABS_G0_S: return ":abs_g0_s:"; + case VK_ABS_G0_NC: return ":abs_g0_nc:"; + case VK_DTPREL_G2: return ":dtprel_g2:"; + case VK_DTPREL_G1: return ":dtprel_g1:"; + case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; + case VK_DTPREL_G0: return ":dtprel_g0:"; + case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; + case VK_DTPREL_HI12: return ":dtprel_hi12:"; + case VK_DTPREL_LO12: return ":dtprel_lo12:"; + case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; + case VK_TPREL_G2: return ":tprel_g2:"; + case VK_TPREL_G1: return ":tprel_g1:"; + case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; + case VK_TPREL_G0: return ":tprel_g0:"; + case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; + case VK_TPREL_HI12: return ":tprel_hi12:"; + case VK_TPREL_LO12: return ":tprel_lo12:"; + case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; + case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; + case VK_ABS_PAGE: return ""; + case VK_GOT_PAGE: return ":got:"; + case VK_GOT_LO12: return ":got_lo12:"; + case VK_GOTTPREL_PAGE: return ":gottprel:"; + case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; + case VK_GOTTPREL_G1: return ":gottprel_g1:"; + case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; + case VK_TLSDESC: return ""; + case VK_TLSDESC_PAGE: return ":tlsdesc:"; + default: + llvm_unreachable("Invalid ELF symbol kind"); + } +} + +void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { + if (getKind() != VK_NONE) + OS << getVariantKindName(); + Expr->print(OS, MAI); +} + +void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); +} + +MCFragment *AArch64MCExpr::findAssociatedFragment() const { + llvm_unreachable("FIXME: what goes here?"); +} + +bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { + if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup)) + return false; + + Res = + MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); + + return true; +} + +static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { + switch (Expr->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expression"); + break; + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr); + fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm); + fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm); + break; + } + + case 
MCExpr::SymbolRef: { + // We're known to be under a TLS fixup, so any symbol should be + // modified. There should be only one. + const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr); + cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS); + break; + } + + case MCExpr::Unary: + fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm); + break; + } +} + +void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { + switch (getSymbolLoc(Kind)) { + default: + return; + case VK_DTPREL: + case VK_GOTTPREL: + case VK_TPREL: + case VK_TLSDESC: + break; + } + + fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); +} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h new file mode 100644 index 0000000..db36a65 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -0,0 +1,167 @@ +//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes AArch64-specific MCExprs, used for modifiers like +// ":lo12:" or ":gottprel_g1:". +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H + +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +class AArch64MCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_NONE = 0x000, + + // Symbol locations specifying (roughly speaking) what calculation should be + // performed to construct the final address for the relocated + // symbol. E.g. direct, via the GOT, ... + VK_ABS = 0x001, + VK_SABS = 0x002, + VK_GOT = 0x003, + VK_DTPREL = 0x004, + VK_GOTTPREL = 0x005, + VK_TPREL = 0x006, + VK_TLSDESC = 0x007, + VK_SymLocBits = 0x00f, + + // Variants specifying which part of the final address calculation is + // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a + // MOVZ/MOVK. + VK_PAGE = 0x010, + VK_PAGEOFF = 0x020, + VK_HI12 = 0x030, + VK_G0 = 0x040, + VK_G1 = 0x050, + VK_G2 = 0x060, + VK_G3 = 0x070, + VK_AddressFragBits = 0x0f0, + + // Whether the final relocation is a checked one (where a linker should + // perform a range-check on the final address) or not. Note that this field + // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: + // on its own is a non-checked relocation. We side with ELF on being + // explicit about this! + VK_NC = 0x100, + + // Convenience definitions for referring to specific textual representations + // of relocation specifiers. Note that this means the "_NC" is sometimes + // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC + // since a user would write ":lo12:"). 
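+    //
+    // For example, VK_DTPREL_G1_NC below is VK_DTPREL | VK_G1 | VK_NC =
+    // 0x004 | 0x050 | 0x100 = 0x154; getSymbolLoc() masks with VK_SymLocBits
+    // to recover VK_DTPREL, getAddressFrag() masks with VK_AddressFragBits to
+    // recover VK_G1, and isNotChecked() simply tests the VK_NC bit.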
+ VK_CALL = VK_ABS, + VK_ABS_PAGE = VK_ABS | VK_PAGE, + VK_ABS_G3 = VK_ABS | VK_G3, + VK_ABS_G2 = VK_ABS | VK_G2, + VK_ABS_G2_S = VK_SABS | VK_G2, + VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, + VK_ABS_G1 = VK_ABS | VK_G1, + VK_ABS_G1_S = VK_SABS | VK_G1, + VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, + VK_ABS_G0 = VK_ABS | VK_G0, + VK_ABS_G0_S = VK_SABS | VK_G0, + VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, + VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, + VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, + VK_GOT_PAGE = VK_GOT | VK_PAGE, + VK_DTPREL_G2 = VK_DTPREL | VK_G2, + VK_DTPREL_G1 = VK_DTPREL | VK_G1, + VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, + VK_DTPREL_G0 = VK_DTPREL | VK_G0, + VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, + VK_DTPREL_HI12 = VK_DTPREL | VK_HI12, + VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, + VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, + VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, + VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, + VK_TPREL_G2 = VK_TPREL | VK_G2, + VK_TPREL_G1 = VK_TPREL | VK_G1, + VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, + VK_TPREL_G0 = VK_TPREL | VK_G0, + VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, + VK_TPREL_HI12 = VK_TPREL | VK_HI12, + VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, + VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, + VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, + VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, + + VK_INVALID = 0xfff + }; + +private: + const MCExpr *Expr; + const VariantKind Kind; + + explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind) + : Expr(Expr), Kind(Kind) {} + +public: + /// @name Construction + /// @{ + + static const AArch64MCExpr *create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx); + + /// @} + /// @name Accessors + /// @{ + + /// Get the kind of this expression. + VariantKind getKind() const { return Kind; } + + /// Get the expression this modifier applies to. + const MCExpr *getSubExpr() const { return Expr; } + + /// @} + /// @name VariantKind information extractors. + /// @{ + + static VariantKind getSymbolLoc(VariantKind Kind) { + return static_cast<VariantKind>(Kind & VK_SymLocBits); + } + + static VariantKind getAddressFrag(VariantKind Kind) { + return static_cast<VariantKind>(Kind & VK_AddressFragBits); + } + + static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } + + /// @} + + /// Convert the variant kind into an ELF-appropriate modifier + /// (e.g. ":got:", ":lo12:"). 
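+  /// For instance, VK_TPREL_HI12 prints as ":tprel_hi12:" and VK_CALL (plain
+  /// VK_ABS) prints as an empty string, per the table in AArch64MCExpr.cpp.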
+ StringRef getVariantKindName() const; + + void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; + + void visitUsedExpr(MCStreamer &Streamer) const override; + + MCFragment *findAssociatedFragment() const override; + + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; + + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } + + static bool classof(const AArch64MCExpr *) { return true; } +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp new file mode 100644 index 0000000..9f7bed0 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -0,0 +1,177 @@ +//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AArch64 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MCTargetDesc.h" +#include "AArch64ELFStreamer.h" +#include "AArch64MCAsmInfo.h" +#include "InstPrinter/AArch64InstPrinter.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "AArch64GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AArch64GenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AArch64GenRegisterInfo.inc" + +static MCInstrInfo *createAArch64MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAArch64MCInstrInfo(X); + return X; +} + +static MCSubtargetInfo * +createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + if (CPU.empty()) + CPU = "generic"; + + return createAArch64MCSubtargetInfoImpl(TT, CPU, FS); +} + +static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAArch64MCRegisterInfo(X, AArch64::LR); + return X; +} + +static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, + const Triple &TheTriple) { + MCAsmInfo *MAI; + if (TheTriple.isOSBinFormatMachO()) + MAI = new AArch64MCAsmInfoDarwin(); + else { + assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); + MAI = new AArch64MCAsmInfoELF(TheTriple); + } + + // Initial state of the frame pointer is SP. 
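+  // In DWARF terms this seeds the CIE with "CFA = SP + 0": the unwinder
+  // assumes the canonical frame address is the stack pointer until a
+  // function's own prologue CFI says otherwise.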
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); + MAI->addInitialFrameState(Inst); + + return MAI; +} + +static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT, + Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) && + "Only expect Darwin and ELF targets"); + + if (CM == CodeModel::Default) + CM = CodeModel::Small; + // The default MCJIT memory managers make no guarantees about where they can + // find an executable page; JITed code needs to be able to refer to globals + // no matter how far away they are. + else if (CM == CodeModel::JITDefault) + CM = CodeModel::Large; + else if (CM != CodeModel::Small && CM != CodeModel::Large) + report_fatal_error( + "Only small and large code models are allowed on AArch64"); + + // AArch64 Darwin is always PIC. + if (TT.isOSDarwin()) + RM = Reloc::PIC_; + // On ELF platforms the default static relocation model has a smart enough + // linker to cope with referencing external symbols defined in a shared + // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. + else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) + RM = Reloc::Static; + + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + if (SyntaxVariant == 0) + return new AArch64InstPrinter(MAI, MII, MRI); + if (SyntaxVariant == 1) + return new AArch64AppleInstPrinter(MAI, MII, MRI); + + return nullptr; +} + +static MCStreamer *createELFStreamer(const Triple &T, MCContext &Ctx, + MCAsmBackend &TAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll); +} + +static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, + bool DWARFMustBeAtTheEnd) { + return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + DWARFMustBeAtTheEnd, + /*LabelSections*/ true); +} + +// Force static initialization. +extern "C" void LLVMInitializeAArch64TargetMC() { + for (Target *T : + {&TheAArch64leTarget, &TheAArch64beTarget, &TheARM64Target}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(*T, createAArch64MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createAArch64MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createAArch64MCSubtargetInfo); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(*T, createAArch64MCCodeEmitter); + + // Register the obj streamers. + TargetRegistry::RegisterELFStreamer(*T, createELFStreamer); + TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer); + + // Register the obj target streamer. + TargetRegistry::RegisterObjectTargetStreamer( + *T, createAArch64ObjectTargetStreamer); + + // Register the asm streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, + createAArch64AsmTargetStreamer); + // Register the MCInstPrinter. 
+ TargetRegistry::RegisterMCInstPrinter(*T, createAArch64MCInstPrinter); + } + + // Register the asm backend. + for (Target *T : {&TheAArch64leTarget, &TheARM64Target}) + TargetRegistry::RegisterMCAsmBackend(*T, createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, + createAArch64beAsmBackend); +} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h new file mode 100644 index 0000000..3423844 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -0,0 +1,84 @@ +//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AArch64 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include <string> + +namespace llvm { +class formatted_raw_ostream; +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCInstPrinter; +class MCRegisterInfo; +class MCObjectWriter; +class MCStreamer; +class MCSubtargetInfo; +class MCTargetStreamer; +class StringRef; +class Target; +class Triple; +class raw_ostream; +class raw_pwrite_stream; + +extern Target TheAArch64leTarget; +extern Target TheAArch64beTarget; +extern Target TheARM64Target; + +MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); +MCAsmBackend *createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); +MCAsmBackend *createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); + +MCObjectWriter *createAArch64ELFObjectWriter(raw_pwrite_stream &OS, + uint8_t OSABI, + bool IsLittleEndian); + +MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS, + uint32_t CPUType, + uint32_t CPUSubtype); + +MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm); + +MCTargetStreamer *createAArch64ObjectTargetStreamer(MCStreamer &S, + const MCSubtargetInfo &STI); + +} // End llvm namespace + +// Defines symbolic names for AArch64 registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "AArch64GenRegisterInfo.inc" + +// Defines symbolic names for the AArch64 instructions. 
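+// (For example AArch64::ADR and AArch64::BL, which AArch64MCCodeEmitter.cpp
+// checks when choosing fixup kinds.)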
+// +#define GET_INSTRINFO_ENUM +#include "AArch64GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AArch64GenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp new file mode 100644 index 0000000..61c96f1 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -0,0 +1,431 @@ +//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +using namespace llvm; + +namespace { +class AArch64MachObjectWriter : public MCMachObjectTargetWriter { + bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, + const MCSymbolRefExpr *Sym, + unsigned &Log2Size, const MCAssembler &Asm); + +public: + AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype) {} + + void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override; +}; +} + +bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( + const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, + unsigned &Log2Size, const MCAssembler &Asm) { + RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); + Log2Size = ~0U; + + switch ((unsigned)Fixup.getKind()) { + default: + return false; + + case FK_Data_1: + Log2Size = llvm::Log2_32(1); + return true; + case FK_Data_2: + Log2Size = llvm::Log2_32(2); + return true; + case FK_Data_4: + Log2Size = llvm::Log2_32(4); + if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); + return true; + case FK_Data_8: + Log2Size = llvm::Log2_32(8); + if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); + return true; + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + Log2Size = llvm::Log2_32(4); + switch (Sym->getKind()) { + default: + llvm_unreachable("Unexpected symbol reference variant kind!"); + case MCSymbolRefExpr::VK_PAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); + return true; + case MCSymbolRefExpr::VK_GOTPAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12); + return true; + case MCSymbolRefExpr::VK_TLVPPAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); + return true; + } + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + Log2Size = llvm::Log2_32(4); 
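+    // (A64 instructions are always 4 bytes wide, so instruction fixups use
+    // log2(4) == 2 for the Mach-O r_length field, as in the cases above.)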
+ // This encompasses the relocation for the whole 21-bit value. + switch (Sym->getKind()) { + default: { + Asm.getContext().reportError(Fixup.getLoc(), + "ADR/ADRP relocations must be GOT relative"); + return false; + } + case MCSymbolRefExpr::VK_PAGE: + RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); + return true; + case MCSymbolRefExpr::VK_GOTPAGE: + RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21); + return true; + case MCSymbolRefExpr::VK_TLVPPAGE: + RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21); + return true; + } + return true; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + Log2Size = llvm::Log2_32(4); + RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); + return true; + } +} + +static bool canUseLocalRelocation(const MCSectionMachO &Section, + const MCSymbol &Symbol, unsigned Log2Size) { + // Debug info sections can use local relocations. + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + return true; + + // Otherwise, only pointer sized relocations are supported. + if (Log2Size != 3) + return false; + + // But only if they don't point to a few forbidden sections. + if (!Symbol.isInSection()) + return true; + const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection()); + if (RefSec.getType() == MachO::S_CSTRING_LITERALS) + return false; + + if (RefSec.getSegmentName() == "__DATA" && + RefSec.getSectionName() == "__objc_classrefs") + return false; + + // FIXME: ld64 currently handles internal pointer-sized relocations + // incorrectly (applying the addend twice). We should be able to return true + // unconditionally by this point when that's fixed. + return false; +} + +void AArch64MachObjectWriter::recordRelocation( + MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + + // See <reloc.h>. + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment); + unsigned Log2Size = 0; + int64_t Value = 0; + unsigned Index = 0; + unsigned Type = 0; + unsigned Kind = Fixup.getKind(); + const MCSymbol *RelSymbol = nullptr; + + FixupOffset += Fixup.getOffset(); + + // AArch64 pcrel relocation addends do not include the section offset. + if (IsPCRel) + FixedValue += FixupOffset; + + // ADRP fixups use relocations for the whole symbol value and only + // put the addend in the instruction itself. Clear out any value the + // generic code figured out from the sybmol definition. + if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21) + FixedValue = 0; + + // imm19 relocations are for conditional branches, which require + // assembler local symbols. If we got here, that's not what we have, + // so complain loudly. + if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { + Asm.getContext().reportError(Fixup.getLoc(), + "conditional branch requires assembler-local" + " label. '" + + Target.getSymA()->getSymbol().getName() + + "' is external."); + return; + } + + // 14-bit branch relocations should only target internal labels, and so + // should never get here. 
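+  // (branch14 comes from TBZ/TBNZ; Mach-O for ARM64 defines no relocation
+  // type for that form, hence the hard error below.)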
+ if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { + Asm.getContext().reportError(Fixup.getLoc(), + "Invalid relocation on conditional branch!"); + return; + } + + if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, + Asm)) { + Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); + return; + } + + Value = Target.getConstant(); + + if (Target.isAbsolute()) { // constant + // FIXME: Should this always be extern? + // SymbolNum of 0 indicates the absolute section. + Type = MachO::ARM64_RELOC_UNSIGNED; + + if (IsPCRel) { + Asm.getContext().reportError(Fixup.getLoc(), + "PC relative absolute relocation!"); + return; + + // FIXME: x86_64 sets the type to a branch reloc here. Should we do + // something similar? + } + } else if (Target.getSymB()) { // A - B + constant + const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbol *A_Base = Asm.getAtom(*A); + + const MCSymbol *B = &Target.getSymB()->getSymbol(); + const MCSymbol *B_Base = Asm.getAtom(*B); + + // Check for "_foo@got - .", which comes through here as: + // Ltmp0: + // ... _foo@got - Ltmp0 + if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT && + Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None && + Layout.getSymbolOffset(*B) == + Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { + // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. + Type = MachO::ARM64_RELOC_POINTER_TO_GOT; + IsPCRel = 1; + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); + return; + } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + // Otherwise, neither symbol can be modified. + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } + + // We don't support PCrel relocations of differences. + if (IsPCRel) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported pc-relative relocation of " + "difference"); + return; + } + + // AArch64 always uses external relocations. If there is no symbol to use as + // a base address (a local symbol with no preceding non-local symbol), + // error out. + // + // FIXME: We should probably just synthesize an external symbol and use + // that. + if (!A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + A->getName() + + "'. Must have non-local symbol earlier in section."); + return; + } + if (!B_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + B->getName() + + "'. Must have non-local symbol earlier in section."); + return; + } + + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } + + Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) - + (!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress( + *A_Base, Layout)); + Value -= (!B->getFragment() ? 0 : Writer->getSymbolAddress(*B, Layout)) - + (!B_Base || !B_Base->getFragment() ? 
0 : Writer->getSymbolAddress( + *B_Base, Layout)); + + Type = MachO::ARM64_RELOC_UNSIGNED; + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); + + RelSymbol = B_Base; + Type = MachO::ARM64_RELOC_SUBTRACTOR; + } else { // A + constant + const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + const MCSectionMachO &Section = + static_cast<const MCSectionMachO &>(*Fragment->getParent()); + + bool CanUseLocalRelocation = + canUseLocalRelocation(Section, *Symbol, Log2Size); + if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) { + const MCSection &Sec = Symbol->getSection(); + if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) + Symbol->setUsedInReloc(); + } + + const MCSymbol *Base = Asm.getAtom(*Symbol); + + // If the symbol is a variable and we weren't able to get a Base for it + // (i.e., it's not in the symbol table associated with a section) resolve + // the relocation based its expansion instead. + if (Symbol->isVariable() && !Base) { + // If the evaluation is an absolute value, just use that directly + // to keep things easy. + int64_t Res; + if (Symbol->getVariableValue()->evaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + + // FIXME: Will the Target we already have ever have any data in it + // we need to preserve and merge with the new Target? How about + // the FixedValue? + if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout, + &Fixup)) { + Asm.getContext().reportError(Fixup.getLoc(), + "unable to resolve variable '" + + Symbol->getName() + "'"); + return; + } + return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } + + // Relocations inside debug sections always use local relocations when + // possible. This seems to be done because the debugger doesn't fully + // understand relocation entries and expects to find values that + // have already been fixed up. + if (Symbol->isInSection()) { + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + Base = nullptr; + } + + // AArch64 uses external relocations as much as possible. For debug + // sections, and for pointer-sized relocations (.quad), we allow section + // relocations. It's code sections that run into trouble. + if (Base) { + RelSymbol = Base; + + // Add the local offset, if needed. + if (Base != Symbol) + Value += + Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base); + } else if (Symbol->isInSection()) { + if (!CanUseLocalRelocation) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + Symbol->getName() + + "'. Must have non-local symbol earlier in section."); + return; + } + // Adjust the relocation to be section-relative. + // The index is the section ordinal (1-based). + const MCSection &Sec = Symbol->getSection(); + Index = Sec.getOrdinal() + 1; + Value += Writer->getSymbolAddress(*Symbol, Layout); + + if (IsPCRel) + Value -= Writer->getFragmentAddress(Fragment, Layout) + + Fixup.getOffset() + (1ULL << Log2Size); + } else { + // Resolve constant variables. 
+ if (Symbol->isVariable()) { + int64_t Res; + if (Symbol->getVariableValue()->evaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; + } + } + + // If the relocation kind is Branch26, Page21, or Pageoff12, any addend + // is represented via an Addend relocation, not encoded directly into + // the instruction. + if ((Type == MachO::ARM64_RELOC_BRANCH26 || + Type == MachO::ARM64_RELOC_PAGE21 || + Type == MachO::ARM64_RELOC_PAGEOFF12) && + Value) { + assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); + + // Now set up the Addend relocation. + Type = MachO::ARM64_RELOC_ADDEND; + Index = Value; + RelSymbol = nullptr; + IsPCRel = 0; + Log2Size = 2; + + // Put zero into the instruction itself. The addend is in the relocation. + Value = 0; + } + + // If there's any addend left to handle, encode it in the instruction. + FixedValue = Value; + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_pwrite_stream &OS, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter( + new AArch64MachObjectWriter(CPUType, CPUSubtype), OS, + /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp new file mode 100644 index 0000000..3e86a42 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -0,0 +1,41 @@ +//===- AArch64TargetStreamer.cpp - AArch64TargetStreamer class ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64TargetStreamer class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetStreamer.h" +#include "llvm/MC/ConstantPools.h" +using namespace llvm; + +// +// AArch64TargetStreamer Implemenation +// +AArch64TargetStreamer::AArch64TargetStreamer(MCStreamer &S) + : MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {} + +AArch64TargetStreamer::~AArch64TargetStreamer() {} + +// The constant pool handling is shared by all AArch64TargetStreamer +// implementations. +const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr, + unsigned Size, + SMLoc Loc) { + return ConstantPools->addEntry(Streamer, Expr, Size, Loc); +} + +void AArch64TargetStreamer::emitCurrentConstantPool() { + ConstantPools->emitForCurrentSection(Streamer); +} + +// finish() - write out any non-empty assembler constant pools. 
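The constant-pool callbacks above implement the `ldr Xn, =imm` pseudo and the `.ltorg` directive: each ldr= operand is added to a per-section pool and rewritten as a load from a pool label, and the pool is flushed at an explicit .ltorg or when assembly finishes. A minimal, self-contained model of that bookkeeping, deliberately independent of the MCStreamer/AssemblerConstantPools machinery used here:

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Toy per-section literal pool: addEntry() records a constant and hands back
// the label the ldr= pseudo should load from; emit() flushes all entries.
class ToyConstantPool {
  std::vector<std::pair<std::string, uint64_t>> Entries;
  unsigned NextLabel = 0;

public:
  std::string addEntry(uint64_t Value) {
    std::string Label = ".Lcp" + std::to_string(NextLabel++);
    Entries.emplace_back(Label, Value);
    return Label; // "ldr x0, =Value" becomes "ldr x0, Label"
  }

  // Called for ".ltorg" and at the end of assembly (cf. the finish()
  // callback that follows).
  void emit(std::vector<std::string> &Out) {
    for (const auto &E : Entries)
      Out.push_back(E.first + ": .quad " + std::to_string(E.second));
    Entries.clear();
  }
};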
+void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); } + +void AArch64TargetStreamer::emitInst(uint32_t Inst) {} diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h new file mode 100644 index 0000000..51432830 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -0,0 +1,42 @@ +//===-- AArch64TargetStreamer.h - AArch64 Target Streamer ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64TARGETSTREAMER_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64TARGETSTREAMER_H + +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class AArch64TargetStreamer : public MCTargetStreamer { +public: + AArch64TargetStreamer(MCStreamer &S); + ~AArch64TargetStreamer() override; + + void finish() override; + + /// Callback used to implement the ldr= pseudo. + /// Add a new entry to the constant pool for the current section and return an + /// MCExpr that can be used to refer to the constant pool location. + const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc); + + /// Callback used to implemnt the .ltorg directive. + /// Emit contents of constant pool for the current section. + void emitCurrentConstantPool(); + + /// Callback used to implement the .inst directive. + virtual void emitInst(uint32_t Inst); + +private: + std::unique_ptr<AssemblerConstantPools> ConstantPools; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp new file mode 100644 index 0000000..f42ecb1 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -0,0 +1,32 @@ +//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Triple.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +namespace llvm { +Target TheAArch64leTarget; +Target TheAArch64beTarget; +Target TheARM64Target; +} // end namespace llvm + +extern "C" void LLVMInitializeAArch64TargetInfo() { + // Now register the "arm64" name for use with "-march". We don't want it to + // take possession of the Triple::aarch64 tag though. 
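Once LLVMInitializeAArch64TargetInfo() has run, a front end or driver can resolve an AArch64 triple through the registry regardless of which spelling it was given. A hypothetical lookup sketch, assuming the usual TargetRegistry::lookupTarget(triple, error) entry point; the triples are illustrative only:

#include "llvm/Support/TargetRegistry.h"
#include <string>

extern "C" void LLVMInitializeAArch64TargetInfo();

// Returns the registered target for an AArch64-style triple, or null with
// Err populated if nothing matches.
static const llvm::Target *lookupAArch64(const std::string &TripleStr,
                                         std::string &Err) {
  LLVMInitializeAArch64TargetInfo(); // registers aarch64, aarch64_be, arm64
  return llvm::TargetRegistry::lookupTarget(TripleStr, Err);
}

// e.g. both lookupAArch64("aarch64--linux-gnu", Err) and
// lookupAArch64("arm64-apple-ios", Err) find an AArch64 target; the "arm64"
// name itself is only matched by -march-style name lookup, as the comment
// above explains.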
+ TargetRegistry::RegisterTarget(TheARM64Target, "arm64", + "ARM64 (little endian)", + [](Triple::ArchType) { return false; }, true); + + RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z( + TheAArch64leTarget, "aarch64", "AArch64 (little endian)"); + RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W( + TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)"); + +} diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp new file mode 100644 index 0000000..78f5289 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -0,0 +1,945 @@ +//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides basic encoding and assembly information for AArch64. +// +//===----------------------------------------------------------------------===// +#include "AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Regex.h" + +using namespace llvm; + +StringRef AArch64NamedImmMapper::toString(uint32_t Value, + const FeatureBitset& FeatureBits, bool &Valid) const { + for (unsigned i = 0; i < NumMappings; ++i) { + if (Mappings[i].isValueEqual(Value, FeatureBits)) { + Valid = true; + return Mappings[i].Name; + } + } + + Valid = false; + return StringRef(); +} + +uint32_t AArch64NamedImmMapper::fromString(StringRef Name, + const FeatureBitset& FeatureBits, bool &Valid) const { + std::string LowerCaseName = Name.lower(); + for (unsigned i = 0; i < NumMappings; ++i) { + if (Mappings[i].isNameEqual(LowerCaseName, FeatureBits)) { + Valid = true; + return Mappings[i].Value; + } + } + + Valid = false; + return -1; +} + +bool AArch64NamedImmMapper::validImm(uint32_t Value) const { + return Value < TooBigImm; +} + +const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATMappings[] = { + {"s1e1r", S1E1R, {}}, + {"s1e2r", S1E2R, {}}, + {"s1e3r", S1E3R, {}}, + {"s1e1w", S1E1W, {}}, + {"s1e2w", S1E2W, {}}, + {"s1e3w", S1E3W, {}}, + {"s1e0r", S1E0R, {}}, + {"s1e0w", S1E0W, {}}, + {"s12e1r", S12E1R, {}}, + {"s12e1w", S12E1W, {}}, + {"s12e0r", S12E0R, {}}, + {"s12e0w", S12E0W, {}}, +}; + +AArch64AT::ATMapper::ATMapper() + : AArch64NamedImmMapper(ATMappings, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierMappings[] = { + {"oshld", OSHLD, {}}, + {"oshst", OSHST, {}}, + {"osh", OSH, {}}, + {"nshld", NSHLD, {}}, + {"nshst", NSHST, {}}, + {"nsh", NSH, {}}, + {"ishld", ISHLD, {}}, + {"ishst", ISHST, {}}, + {"ish", ISH, {}}, + {"ld", LD, {}}, + {"st", ST, {}}, + {"sy", SY, {}} +}; + +AArch64DB::DBarrierMapper::DBarrierMapper() + : AArch64NamedImmMapper(DBarrierMappings, 16u) {} + +const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCMappings[] = { + {"zva", ZVA, {}}, + {"ivac", IVAC, {}}, + {"isw", ISW, {}}, + {"cvac", CVAC, {}}, + {"csw", CSW, {}}, + {"cvau", CVAU, {}}, + {"civac", CIVAC, {}}, + {"cisw", CISW, {}} +}; + +AArch64DC::DCMapper::DCMapper() + : AArch64NamedImmMapper(DCMappings, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICMappings[] = { + {"ialluis", IALLUIS, {}}, + {"iallu", IALLU, {}}, + {"ivau", IVAU, {}} +}; + +AArch64IC::ICMapper::ICMapper() + : 
AArch64NamedImmMapper(ICMappings, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBMappings[] = { + {"sy", SY, {}}, +}; + +AArch64ISB::ISBMapper::ISBMapper() + : AArch64NamedImmMapper(ISBMappings, 16) {} + +const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMMappings[] = { + {"pldl1keep", PLDL1KEEP, {}}, + {"pldl1strm", PLDL1STRM, {}}, + {"pldl2keep", PLDL2KEEP, {}}, + {"pldl2strm", PLDL2STRM, {}}, + {"pldl3keep", PLDL3KEEP, {}}, + {"pldl3strm", PLDL3STRM, {}}, + {"plil1keep", PLIL1KEEP, {}}, + {"plil1strm", PLIL1STRM, {}}, + {"plil2keep", PLIL2KEEP, {}}, + {"plil2strm", PLIL2STRM, {}}, + {"plil3keep", PLIL3KEEP, {}}, + {"plil3strm", PLIL3STRM, {}}, + {"pstl1keep", PSTL1KEEP, {}}, + {"pstl1strm", PSTL1STRM, {}}, + {"pstl2keep", PSTL2KEEP, {}}, + {"pstl2strm", PSTL2STRM, {}}, + {"pstl3keep", PSTL3KEEP, {}}, + {"pstl3strm", PSTL3STRM, {}} +}; + +AArch64PRFM::PRFMMapper::PRFMMapper() + : AArch64NamedImmMapper(PRFMMappings, 32) {} + +const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings[] = { + {"spsel", SPSel, {}}, + {"daifset", DAIFSet, {}}, + {"daifclr", DAIFClr, {}}, + + // v8.1a "Privileged Access Never" extension-specific PStates + {"pan", PAN, {AArch64::HasV8_1aOps}}, + + // v8.2a + {"uao", UAO, {AArch64::HasV8_2aOps}}, +}; + +AArch64PState::PStateMapper::PStateMapper() + : AArch64NamedImmMapper(PStateMappings, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { + // v8.2a "Statistical Profiling" extension-specific PSB operand + {"csync", CSync, {AArch64::FeatureSPE}}, +}; + +AArch64PSBHint::PSBHintMapper::PSBHintMapper() + : AArch64NamedImmMapper(PSBHintMappings, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { + {"mdccsr_el0", MDCCSR_EL0, {}}, + {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, + {"mdrar_el1", MDRAR_EL1, {}}, + {"oslsr_el1", OSLSR_EL1, {}}, + {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1, {}}, + {"pmceid0_el0", PMCEID0_EL0, {}}, + {"pmceid1_el0", PMCEID1_EL0, {}}, + {"midr_el1", MIDR_EL1, {}}, + {"ccsidr_el1", CCSIDR_EL1, {}}, + {"clidr_el1", CLIDR_EL1, {}}, + {"ctr_el0", CTR_EL0, {}}, + {"mpidr_el1", MPIDR_EL1, {}}, + {"revidr_el1", REVIDR_EL1, {}}, + {"aidr_el1", AIDR_EL1, {}}, + {"dczid_el0", DCZID_EL0, {}}, + {"id_pfr0_el1", ID_PFR0_EL1, {}}, + {"id_pfr1_el1", ID_PFR1_EL1, {}}, + {"id_dfr0_el1", ID_DFR0_EL1, {}}, + {"id_afr0_el1", ID_AFR0_EL1, {}}, + {"id_mmfr0_el1", ID_MMFR0_EL1, {}}, + {"id_mmfr1_el1", ID_MMFR1_EL1, {}}, + {"id_mmfr2_el1", ID_MMFR2_EL1, {}}, + {"id_mmfr3_el1", ID_MMFR3_EL1, {}}, + {"id_mmfr4_el1", ID_MMFR4_EL1, {}}, + {"id_isar0_el1", ID_ISAR0_EL1, {}}, + {"id_isar1_el1", ID_ISAR1_EL1, {}}, + {"id_isar2_el1", ID_ISAR2_EL1, {}}, + {"id_isar3_el1", ID_ISAR3_EL1, {}}, + {"id_isar4_el1", ID_ISAR4_EL1, {}}, + {"id_isar5_el1", ID_ISAR5_EL1, {}}, + {"id_aa64pfr0_el1", ID_A64PFR0_EL1, {}}, + {"id_aa64pfr1_el1", ID_A64PFR1_EL1, {}}, + {"id_aa64dfr0_el1", ID_A64DFR0_EL1, {}}, + {"id_aa64dfr1_el1", ID_A64DFR1_EL1, {}}, + {"id_aa64afr0_el1", ID_A64AFR0_EL1, {}}, + {"id_aa64afr1_el1", ID_A64AFR1_EL1, {}}, + {"id_aa64isar0_el1", ID_A64ISAR0_EL1, {}}, + {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}}, + {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}}, + {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}}, + {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}}, + {"mvfr0_el1", MVFR0_EL1, {}}, + {"mvfr1_el1", MVFR1_EL1, {}}, + {"mvfr2_el1", MVFR2_EL1, {}}, + {"rvbar_el1", RVBAR_EL1, {}}, + {"rvbar_el2", RVBAR_EL2, {}}, + {"rvbar_el3", RVBAR_EL3, 
{}}, + {"isr_el1", ISR_EL1, {}}, + {"cntpct_el0", CNTPCT_EL0, {}}, + {"cntvct_el0", CNTVCT_EL0, {}}, + + // Trace registers + {"trcstatr", TRCSTATR, {}}, + {"trcidr8", TRCIDR8, {}}, + {"trcidr9", TRCIDR9, {}}, + {"trcidr10", TRCIDR10, {}}, + {"trcidr11", TRCIDR11, {}}, + {"trcidr12", TRCIDR12, {}}, + {"trcidr13", TRCIDR13, {}}, + {"trcidr0", TRCIDR0, {}}, + {"trcidr1", TRCIDR1, {}}, + {"trcidr2", TRCIDR2, {}}, + {"trcidr3", TRCIDR3, {}}, + {"trcidr4", TRCIDR4, {}}, + {"trcidr5", TRCIDR5, {}}, + {"trcidr6", TRCIDR6, {}}, + {"trcidr7", TRCIDR7, {}}, + {"trcoslsr", TRCOSLSR, {}}, + {"trcpdsr", TRCPDSR, {}}, + {"trcdevaff0", TRCDEVAFF0, {}}, + {"trcdevaff1", TRCDEVAFF1, {}}, + {"trclsr", TRCLSR, {}}, + {"trcauthstatus", TRCAUTHSTATUS, {}}, + {"trcdevarch", TRCDEVARCH, {}}, + {"trcdevid", TRCDEVID, {}}, + {"trcdevtype", TRCDEVTYPE, {}}, + {"trcpidr4", TRCPIDR4, {}}, + {"trcpidr5", TRCPIDR5, {}}, + {"trcpidr6", TRCPIDR6, {}}, + {"trcpidr7", TRCPIDR7, {}}, + {"trcpidr0", TRCPIDR0, {}}, + {"trcpidr1", TRCPIDR1, {}}, + {"trcpidr2", TRCPIDR2, {}}, + {"trcpidr3", TRCPIDR3, {}}, + {"trccidr0", TRCCIDR0, {}}, + {"trccidr1", TRCCIDR1, {}}, + {"trccidr2", TRCCIDR2, {}}, + {"trccidr3", TRCCIDR3, {}}, + + // GICv3 registers + {"icc_iar1_el1", ICC_IAR1_EL1, {}}, + {"icc_iar0_el1", ICC_IAR0_EL1, {}}, + {"icc_hppir1_el1", ICC_HPPIR1_EL1, {}}, + {"icc_hppir0_el1", ICC_HPPIR0_EL1, {}}, + {"icc_rpr_el1", ICC_RPR_EL1, {}}, + {"ich_vtr_el2", ICH_VTR_EL2, {}}, + {"ich_eisr_el2", ICH_EISR_EL2, {}}, + {"ich_elsr_el2", ICH_ELSR_EL2, {}}, + + // v8.1a "Limited Ordering Regions" extension-specific system registers + {"lorid_el1", LORID_EL1, {AArch64::HasV8_1aOps}}, +}; + +AArch64SysReg::MRSMapper::MRSMapper() { + InstMappings = &MRSMappings[0]; + NumInstMappings = llvm::array_lengthof(MRSMappings); +} + +const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { + {"dbgdtrtx_el0", DBGDTRTX_EL0, {}}, + {"oslar_el1", OSLAR_EL1, {}}, + {"pmswinc_el0", PMSWINC_EL0, {}}, + + // Trace registers + {"trcoslar", TRCOSLAR, {}}, + {"trclar", TRCLAR, {}}, + + // GICv3 registers + {"icc_eoir1_el1", ICC_EOIR1_EL1, {}}, + {"icc_eoir0_el1", ICC_EOIR0_EL1, {}}, + {"icc_dir_el1", ICC_DIR_EL1, {}}, + {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}}, + {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}}, + {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}}, +}; + +AArch64SysReg::MSRMapper::MSRMapper() { + InstMappings = &MSRMappings[0]; + NumInstMappings = llvm::array_lengthof(MSRMappings); +} + + +const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings[] = { + {"osdtrrx_el1", OSDTRRX_EL1, {}}, + {"osdtrtx_el1", OSDTRTX_EL1, {}}, + {"teecr32_el1", TEECR32_EL1, {}}, + {"mdccint_el1", MDCCINT_EL1, {}}, + {"mdscr_el1", MDSCR_EL1, {}}, + {"dbgdtr_el0", DBGDTR_EL0, {}}, + {"oseccr_el1", OSECCR_EL1, {}}, + {"dbgvcr32_el2", DBGVCR32_EL2, {}}, + {"dbgbvr0_el1", DBGBVR0_EL1, {}}, + {"dbgbvr1_el1", DBGBVR1_EL1, {}}, + {"dbgbvr2_el1", DBGBVR2_EL1, {}}, + {"dbgbvr3_el1", DBGBVR3_EL1, {}}, + {"dbgbvr4_el1", DBGBVR4_EL1, {}}, + {"dbgbvr5_el1", DBGBVR5_EL1, {}}, + {"dbgbvr6_el1", DBGBVR6_EL1, {}}, + {"dbgbvr7_el1", DBGBVR7_EL1, {}}, + {"dbgbvr8_el1", DBGBVR8_EL1, {}}, + {"dbgbvr9_el1", DBGBVR9_EL1, {}}, + {"dbgbvr10_el1", DBGBVR10_EL1, {}}, + {"dbgbvr11_el1", DBGBVR11_EL1, {}}, + {"dbgbvr12_el1", DBGBVR12_EL1, {}}, + {"dbgbvr13_el1", DBGBVR13_EL1, {}}, + {"dbgbvr14_el1", DBGBVR14_EL1, {}}, + {"dbgbvr15_el1", DBGBVR15_EL1, {}}, + {"dbgbcr0_el1", DBGBCR0_EL1, {}}, + {"dbgbcr1_el1", DBGBCR1_EL1, {}}, + {"dbgbcr2_el1", DBGBCR2_EL1, {}}, + 
{"dbgbcr3_el1", DBGBCR3_EL1, {}}, + {"dbgbcr4_el1", DBGBCR4_EL1, {}}, + {"dbgbcr5_el1", DBGBCR5_EL1, {}}, + {"dbgbcr6_el1", DBGBCR6_EL1, {}}, + {"dbgbcr7_el1", DBGBCR7_EL1, {}}, + {"dbgbcr8_el1", DBGBCR8_EL1, {}}, + {"dbgbcr9_el1", DBGBCR9_EL1, {}}, + {"dbgbcr10_el1", DBGBCR10_EL1, {}}, + {"dbgbcr11_el1", DBGBCR11_EL1, {}}, + {"dbgbcr12_el1", DBGBCR12_EL1, {}}, + {"dbgbcr13_el1", DBGBCR13_EL1, {}}, + {"dbgbcr14_el1", DBGBCR14_EL1, {}}, + {"dbgbcr15_el1", DBGBCR15_EL1, {}}, + {"dbgwvr0_el1", DBGWVR0_EL1, {}}, + {"dbgwvr1_el1", DBGWVR1_EL1, {}}, + {"dbgwvr2_el1", DBGWVR2_EL1, {}}, + {"dbgwvr3_el1", DBGWVR3_EL1, {}}, + {"dbgwvr4_el1", DBGWVR4_EL1, {}}, + {"dbgwvr5_el1", DBGWVR5_EL1, {}}, + {"dbgwvr6_el1", DBGWVR6_EL1, {}}, + {"dbgwvr7_el1", DBGWVR7_EL1, {}}, + {"dbgwvr8_el1", DBGWVR8_EL1, {}}, + {"dbgwvr9_el1", DBGWVR9_EL1, {}}, + {"dbgwvr10_el1", DBGWVR10_EL1, {}}, + {"dbgwvr11_el1", DBGWVR11_EL1, {}}, + {"dbgwvr12_el1", DBGWVR12_EL1, {}}, + {"dbgwvr13_el1", DBGWVR13_EL1, {}}, + {"dbgwvr14_el1", DBGWVR14_EL1, {}}, + {"dbgwvr15_el1", DBGWVR15_EL1, {}}, + {"dbgwcr0_el1", DBGWCR0_EL1, {}}, + {"dbgwcr1_el1", DBGWCR1_EL1, {}}, + {"dbgwcr2_el1", DBGWCR2_EL1, {}}, + {"dbgwcr3_el1", DBGWCR3_EL1, {}}, + {"dbgwcr4_el1", DBGWCR4_EL1, {}}, + {"dbgwcr5_el1", DBGWCR5_EL1, {}}, + {"dbgwcr6_el1", DBGWCR6_EL1, {}}, + {"dbgwcr7_el1", DBGWCR7_EL1, {}}, + {"dbgwcr8_el1", DBGWCR8_EL1, {}}, + {"dbgwcr9_el1", DBGWCR9_EL1, {}}, + {"dbgwcr10_el1", DBGWCR10_EL1, {}}, + {"dbgwcr11_el1", DBGWCR11_EL1, {}}, + {"dbgwcr12_el1", DBGWCR12_EL1, {}}, + {"dbgwcr13_el1", DBGWCR13_EL1, {}}, + {"dbgwcr14_el1", DBGWCR14_EL1, {}}, + {"dbgwcr15_el1", DBGWCR15_EL1, {}}, + {"teehbr32_el1", TEEHBR32_EL1, {}}, + {"osdlr_el1", OSDLR_EL1, {}}, + {"dbgprcr_el1", DBGPRCR_EL1, {}}, + {"dbgclaimset_el1", DBGCLAIMSET_EL1, {}}, + {"dbgclaimclr_el1", DBGCLAIMCLR_EL1, {}}, + {"csselr_el1", CSSELR_EL1, {}}, + {"vpidr_el2", VPIDR_EL2, {}}, + {"vmpidr_el2", VMPIDR_EL2, {}}, + {"sctlr_el1", SCTLR_EL1, {}}, + {"sctlr_el2", SCTLR_EL2, {}}, + {"sctlr_el3", SCTLR_EL3, {}}, + {"actlr_el1", ACTLR_EL1, {}}, + {"actlr_el2", ACTLR_EL2, {}}, + {"actlr_el3", ACTLR_EL3, {}}, + {"cpacr_el1", CPACR_EL1, {}}, + {"hcr_el2", HCR_EL2, {}}, + {"scr_el3", SCR_EL3, {}}, + {"mdcr_el2", MDCR_EL2, {}}, + {"sder32_el3", SDER32_EL3, {}}, + {"cptr_el2", CPTR_EL2, {}}, + {"cptr_el3", CPTR_EL3, {}}, + {"hstr_el2", HSTR_EL2, {}}, + {"hacr_el2", HACR_EL2, {}}, + {"mdcr_el3", MDCR_EL3, {}}, + {"ttbr0_el1", TTBR0_EL1, {}}, + {"ttbr0_el2", TTBR0_EL2, {}}, + {"ttbr0_el3", TTBR0_EL3, {}}, + {"ttbr1_el1", TTBR1_EL1, {}}, + {"tcr_el1", TCR_EL1, {}}, + {"tcr_el2", TCR_EL2, {}}, + {"tcr_el3", TCR_EL3, {}}, + {"vttbr_el2", VTTBR_EL2, {}}, + {"vtcr_el2", VTCR_EL2, {}}, + {"dacr32_el2", DACR32_EL2, {}}, + {"spsr_el1", SPSR_EL1, {}}, + {"spsr_el2", SPSR_EL2, {}}, + {"spsr_el3", SPSR_EL3, {}}, + {"elr_el1", ELR_EL1, {}}, + {"elr_el2", ELR_EL2, {}}, + {"elr_el3", ELR_EL3, {}}, + {"sp_el0", SP_EL0, {}}, + {"sp_el1", SP_EL1, {}}, + {"sp_el2", SP_EL2, {}}, + {"spsel", SPSel, {}}, + {"nzcv", NZCV, {}}, + {"daif", DAIF, {}}, + {"currentel", CurrentEL, {}}, + {"spsr_irq", SPSR_irq, {}}, + {"spsr_abt", SPSR_abt, {}}, + {"spsr_und", SPSR_und, {}}, + {"spsr_fiq", SPSR_fiq, {}}, + {"fpcr", FPCR, {}}, + {"fpsr", FPSR, {}}, + {"dspsr_el0", DSPSR_EL0, {}}, + {"dlr_el0", DLR_EL0, {}}, + {"ifsr32_el2", IFSR32_EL2, {}}, + {"afsr0_el1", AFSR0_EL1, {}}, + {"afsr0_el2", AFSR0_EL2, {}}, + {"afsr0_el3", AFSR0_EL3, {}}, + {"afsr1_el1", AFSR1_EL1, {}}, + {"afsr1_el2", AFSR1_EL2, {}}, + {"afsr1_el3", AFSR1_EL3, {}}, + 
{"esr_el1", ESR_EL1, {}}, + {"esr_el2", ESR_EL2, {}}, + {"esr_el3", ESR_EL3, {}}, + {"fpexc32_el2", FPEXC32_EL2, {}}, + {"far_el1", FAR_EL1, {}}, + {"far_el2", FAR_EL2, {}}, + {"far_el3", FAR_EL3, {}}, + {"hpfar_el2", HPFAR_EL2, {}}, + {"par_el1", PAR_EL1, {}}, + {"pmcr_el0", PMCR_EL0, {}}, + {"pmcntenset_el0", PMCNTENSET_EL0, {}}, + {"pmcntenclr_el0", PMCNTENCLR_EL0, {}}, + {"pmovsclr_el0", PMOVSCLR_EL0, {}}, + {"pmselr_el0", PMSELR_EL0, {}}, + {"pmccntr_el0", PMCCNTR_EL0, {}}, + {"pmxevtyper_el0", PMXEVTYPER_EL0, {}}, + {"pmxevcntr_el0", PMXEVCNTR_EL0, {}}, + {"pmuserenr_el0", PMUSERENR_EL0, {}}, + {"pmintenset_el1", PMINTENSET_EL1, {}}, + {"pmintenclr_el1", PMINTENCLR_EL1, {}}, + {"pmovsset_el0", PMOVSSET_EL0, {}}, + {"mair_el1", MAIR_EL1, {}}, + {"mair_el2", MAIR_EL2, {}}, + {"mair_el3", MAIR_EL3, {}}, + {"amair_el1", AMAIR_EL1, {}}, + {"amair_el2", AMAIR_EL2, {}}, + {"amair_el3", AMAIR_EL3, {}}, + {"vbar_el1", VBAR_EL1, {}}, + {"vbar_el2", VBAR_EL2, {}}, + {"vbar_el3", VBAR_EL3, {}}, + {"rmr_el1", RMR_EL1, {}}, + {"rmr_el2", RMR_EL2, {}}, + {"rmr_el3", RMR_EL3, {}}, + {"contextidr_el1", CONTEXTIDR_EL1, {}}, + {"tpidr_el0", TPIDR_EL0, {}}, + {"tpidr_el2", TPIDR_EL2, {}}, + {"tpidr_el3", TPIDR_EL3, {}}, + {"tpidrro_el0", TPIDRRO_EL0, {}}, + {"tpidr_el1", TPIDR_EL1, {}}, + {"cntfrq_el0", CNTFRQ_EL0, {}}, + {"cntvoff_el2", CNTVOFF_EL2, {}}, + {"cntkctl_el1", CNTKCTL_EL1, {}}, + {"cnthctl_el2", CNTHCTL_EL2, {}}, + {"cntp_tval_el0", CNTP_TVAL_EL0, {}}, + {"cnthp_tval_el2", CNTHP_TVAL_EL2, {}}, + {"cntps_tval_el1", CNTPS_TVAL_EL1, {}}, + {"cntp_ctl_el0", CNTP_CTL_EL0, {}}, + {"cnthp_ctl_el2", CNTHP_CTL_EL2, {}}, + {"cntps_ctl_el1", CNTPS_CTL_EL1, {}}, + {"cntp_cval_el0", CNTP_CVAL_EL0, {}}, + {"cnthp_cval_el2", CNTHP_CVAL_EL2, {}}, + {"cntps_cval_el1", CNTPS_CVAL_EL1, {}}, + {"cntv_tval_el0", CNTV_TVAL_EL0, {}}, + {"cntv_ctl_el0", CNTV_CTL_EL0, {}}, + {"cntv_cval_el0", CNTV_CVAL_EL0, {}}, + {"pmevcntr0_el0", PMEVCNTR0_EL0, {}}, + {"pmevcntr1_el0", PMEVCNTR1_EL0, {}}, + {"pmevcntr2_el0", PMEVCNTR2_EL0, {}}, + {"pmevcntr3_el0", PMEVCNTR3_EL0, {}}, + {"pmevcntr4_el0", PMEVCNTR4_EL0, {}}, + {"pmevcntr5_el0", PMEVCNTR5_EL0, {}}, + {"pmevcntr6_el0", PMEVCNTR6_EL0, {}}, + {"pmevcntr7_el0", PMEVCNTR7_EL0, {}}, + {"pmevcntr8_el0", PMEVCNTR8_EL0, {}}, + {"pmevcntr9_el0", PMEVCNTR9_EL0, {}}, + {"pmevcntr10_el0", PMEVCNTR10_EL0, {}}, + {"pmevcntr11_el0", PMEVCNTR11_EL0, {}}, + {"pmevcntr12_el0", PMEVCNTR12_EL0, {}}, + {"pmevcntr13_el0", PMEVCNTR13_EL0, {}}, + {"pmevcntr14_el0", PMEVCNTR14_EL0, {}}, + {"pmevcntr15_el0", PMEVCNTR15_EL0, {}}, + {"pmevcntr16_el0", PMEVCNTR16_EL0, {}}, + {"pmevcntr17_el0", PMEVCNTR17_EL0, {}}, + {"pmevcntr18_el0", PMEVCNTR18_EL0, {}}, + {"pmevcntr19_el0", PMEVCNTR19_EL0, {}}, + {"pmevcntr20_el0", PMEVCNTR20_EL0, {}}, + {"pmevcntr21_el0", PMEVCNTR21_EL0, {}}, + {"pmevcntr22_el0", PMEVCNTR22_EL0, {}}, + {"pmevcntr23_el0", PMEVCNTR23_EL0, {}}, + {"pmevcntr24_el0", PMEVCNTR24_EL0, {}}, + {"pmevcntr25_el0", PMEVCNTR25_EL0, {}}, + {"pmevcntr26_el0", PMEVCNTR26_EL0, {}}, + {"pmevcntr27_el0", PMEVCNTR27_EL0, {}}, + {"pmevcntr28_el0", PMEVCNTR28_EL0, {}}, + {"pmevcntr29_el0", PMEVCNTR29_EL0, {}}, + {"pmevcntr30_el0", PMEVCNTR30_EL0, {}}, + {"pmccfiltr_el0", PMCCFILTR_EL0, {}}, + {"pmevtyper0_el0", PMEVTYPER0_EL0, {}}, + {"pmevtyper1_el0", PMEVTYPER1_EL0, {}}, + {"pmevtyper2_el0", PMEVTYPER2_EL0, {}}, + {"pmevtyper3_el0", PMEVTYPER3_EL0, {}}, + {"pmevtyper4_el0", PMEVTYPER4_EL0, {}}, + {"pmevtyper5_el0", PMEVTYPER5_EL0, {}}, + {"pmevtyper6_el0", PMEVTYPER6_EL0, {}}, + 
{"pmevtyper7_el0", PMEVTYPER7_EL0, {}}, + {"pmevtyper8_el0", PMEVTYPER8_EL0, {}}, + {"pmevtyper9_el0", PMEVTYPER9_EL0, {}}, + {"pmevtyper10_el0", PMEVTYPER10_EL0, {}}, + {"pmevtyper11_el0", PMEVTYPER11_EL0, {}}, + {"pmevtyper12_el0", PMEVTYPER12_EL0, {}}, + {"pmevtyper13_el0", PMEVTYPER13_EL0, {}}, + {"pmevtyper14_el0", PMEVTYPER14_EL0, {}}, + {"pmevtyper15_el0", PMEVTYPER15_EL0, {}}, + {"pmevtyper16_el0", PMEVTYPER16_EL0, {}}, + {"pmevtyper17_el0", PMEVTYPER17_EL0, {}}, + {"pmevtyper18_el0", PMEVTYPER18_EL0, {}}, + {"pmevtyper19_el0", PMEVTYPER19_EL0, {}}, + {"pmevtyper20_el0", PMEVTYPER20_EL0, {}}, + {"pmevtyper21_el0", PMEVTYPER21_EL0, {}}, + {"pmevtyper22_el0", PMEVTYPER22_EL0, {}}, + {"pmevtyper23_el0", PMEVTYPER23_EL0, {}}, + {"pmevtyper24_el0", PMEVTYPER24_EL0, {}}, + {"pmevtyper25_el0", PMEVTYPER25_EL0, {}}, + {"pmevtyper26_el0", PMEVTYPER26_EL0, {}}, + {"pmevtyper27_el0", PMEVTYPER27_EL0, {}}, + {"pmevtyper28_el0", PMEVTYPER28_EL0, {}}, + {"pmevtyper29_el0", PMEVTYPER29_EL0, {}}, + {"pmevtyper30_el0", PMEVTYPER30_EL0, {}}, + + // Trace registers + {"trcprgctlr", TRCPRGCTLR, {}}, + {"trcprocselr", TRCPROCSELR, {}}, + {"trcconfigr", TRCCONFIGR, {}}, + {"trcauxctlr", TRCAUXCTLR, {}}, + {"trceventctl0r", TRCEVENTCTL0R, {}}, + {"trceventctl1r", TRCEVENTCTL1R, {}}, + {"trcstallctlr", TRCSTALLCTLR, {}}, + {"trctsctlr", TRCTSCTLR, {}}, + {"trcsyncpr", TRCSYNCPR, {}}, + {"trcccctlr", TRCCCCTLR, {}}, + {"trcbbctlr", TRCBBCTLR, {}}, + {"trctraceidr", TRCTRACEIDR, {}}, + {"trcqctlr", TRCQCTLR, {}}, + {"trcvictlr", TRCVICTLR, {}}, + {"trcviiectlr", TRCVIIECTLR, {}}, + {"trcvissctlr", TRCVISSCTLR, {}}, + {"trcvipcssctlr", TRCVIPCSSCTLR, {}}, + {"trcvdctlr", TRCVDCTLR, {}}, + {"trcvdsacctlr", TRCVDSACCTLR, {}}, + {"trcvdarcctlr", TRCVDARCCTLR, {}}, + {"trcseqevr0", TRCSEQEVR0, {}}, + {"trcseqevr1", TRCSEQEVR1, {}}, + {"trcseqevr2", TRCSEQEVR2, {}}, + {"trcseqrstevr", TRCSEQRSTEVR, {}}, + {"trcseqstr", TRCSEQSTR, {}}, + {"trcextinselr", TRCEXTINSELR, {}}, + {"trccntrldvr0", TRCCNTRLDVR0, {}}, + {"trccntrldvr1", TRCCNTRLDVR1, {}}, + {"trccntrldvr2", TRCCNTRLDVR2, {}}, + {"trccntrldvr3", TRCCNTRLDVR3, {}}, + {"trccntctlr0", TRCCNTCTLR0, {}}, + {"trccntctlr1", TRCCNTCTLR1, {}}, + {"trccntctlr2", TRCCNTCTLR2, {}}, + {"trccntctlr3", TRCCNTCTLR3, {}}, + {"trccntvr0", TRCCNTVR0, {}}, + {"trccntvr1", TRCCNTVR1, {}}, + {"trccntvr2", TRCCNTVR2, {}}, + {"trccntvr3", TRCCNTVR3, {}}, + {"trcimspec0", TRCIMSPEC0, {}}, + {"trcimspec1", TRCIMSPEC1, {}}, + {"trcimspec2", TRCIMSPEC2, {}}, + {"trcimspec3", TRCIMSPEC3, {}}, + {"trcimspec4", TRCIMSPEC4, {}}, + {"trcimspec5", TRCIMSPEC5, {}}, + {"trcimspec6", TRCIMSPEC6, {}}, + {"trcimspec7", TRCIMSPEC7, {}}, + {"trcrsctlr2", TRCRSCTLR2, {}}, + {"trcrsctlr3", TRCRSCTLR3, {}}, + {"trcrsctlr4", TRCRSCTLR4, {}}, + {"trcrsctlr5", TRCRSCTLR5, {}}, + {"trcrsctlr6", TRCRSCTLR6, {}}, + {"trcrsctlr7", TRCRSCTLR7, {}}, + {"trcrsctlr8", TRCRSCTLR8, {}}, + {"trcrsctlr9", TRCRSCTLR9, {}}, + {"trcrsctlr10", TRCRSCTLR10, {}}, + {"trcrsctlr11", TRCRSCTLR11, {}}, + {"trcrsctlr12", TRCRSCTLR12, {}}, + {"trcrsctlr13", TRCRSCTLR13, {}}, + {"trcrsctlr14", TRCRSCTLR14, {}}, + {"trcrsctlr15", TRCRSCTLR15, {}}, + {"trcrsctlr16", TRCRSCTLR16, {}}, + {"trcrsctlr17", TRCRSCTLR17, {}}, + {"trcrsctlr18", TRCRSCTLR18, {}}, + {"trcrsctlr19", TRCRSCTLR19, {}}, + {"trcrsctlr20", TRCRSCTLR20, {}}, + {"trcrsctlr21", TRCRSCTLR21, {}}, + {"trcrsctlr22", TRCRSCTLR22, {}}, + {"trcrsctlr23", TRCRSCTLR23, {}}, + {"trcrsctlr24", TRCRSCTLR24, {}}, + {"trcrsctlr25", TRCRSCTLR25, {}}, + {"trcrsctlr26", 
TRCRSCTLR26, {}}, + {"trcrsctlr27", TRCRSCTLR27, {}}, + {"trcrsctlr28", TRCRSCTLR28, {}}, + {"trcrsctlr29", TRCRSCTLR29, {}}, + {"trcrsctlr30", TRCRSCTLR30, {}}, + {"trcrsctlr31", TRCRSCTLR31, {}}, + {"trcssccr0", TRCSSCCR0, {}}, + {"trcssccr1", TRCSSCCR1, {}}, + {"trcssccr2", TRCSSCCR2, {}}, + {"trcssccr3", TRCSSCCR3, {}}, + {"trcssccr4", TRCSSCCR4, {}}, + {"trcssccr5", TRCSSCCR5, {}}, + {"trcssccr6", TRCSSCCR6, {}}, + {"trcssccr7", TRCSSCCR7, {}}, + {"trcsscsr0", TRCSSCSR0, {}}, + {"trcsscsr1", TRCSSCSR1, {}}, + {"trcsscsr2", TRCSSCSR2, {}}, + {"trcsscsr3", TRCSSCSR3, {}}, + {"trcsscsr4", TRCSSCSR4, {}}, + {"trcsscsr5", TRCSSCSR5, {}}, + {"trcsscsr6", TRCSSCSR6, {}}, + {"trcsscsr7", TRCSSCSR7, {}}, + {"trcsspcicr0", TRCSSPCICR0, {}}, + {"trcsspcicr1", TRCSSPCICR1, {}}, + {"trcsspcicr2", TRCSSPCICR2, {}}, + {"trcsspcicr3", TRCSSPCICR3, {}}, + {"trcsspcicr4", TRCSSPCICR4, {}}, + {"trcsspcicr5", TRCSSPCICR5, {}}, + {"trcsspcicr6", TRCSSPCICR6, {}}, + {"trcsspcicr7", TRCSSPCICR7, {}}, + {"trcpdcr", TRCPDCR, {}}, + {"trcacvr0", TRCACVR0, {}}, + {"trcacvr1", TRCACVR1, {}}, + {"trcacvr2", TRCACVR2, {}}, + {"trcacvr3", TRCACVR3, {}}, + {"trcacvr4", TRCACVR4, {}}, + {"trcacvr5", TRCACVR5, {}}, + {"trcacvr6", TRCACVR6, {}}, + {"trcacvr7", TRCACVR7, {}}, + {"trcacvr8", TRCACVR8, {}}, + {"trcacvr9", TRCACVR9, {}}, + {"trcacvr10", TRCACVR10, {}}, + {"trcacvr11", TRCACVR11, {}}, + {"trcacvr12", TRCACVR12, {}}, + {"trcacvr13", TRCACVR13, {}}, + {"trcacvr14", TRCACVR14, {}}, + {"trcacvr15", TRCACVR15, {}}, + {"trcacatr0", TRCACATR0, {}}, + {"trcacatr1", TRCACATR1, {}}, + {"trcacatr2", TRCACATR2, {}}, + {"trcacatr3", TRCACATR3, {}}, + {"trcacatr4", TRCACATR4, {}}, + {"trcacatr5", TRCACATR5, {}}, + {"trcacatr6", TRCACATR6, {}}, + {"trcacatr7", TRCACATR7, {}}, + {"trcacatr8", TRCACATR8, {}}, + {"trcacatr9", TRCACATR9, {}}, + {"trcacatr10", TRCACATR10, {}}, + {"trcacatr11", TRCACATR11, {}}, + {"trcacatr12", TRCACATR12, {}}, + {"trcacatr13", TRCACATR13, {}}, + {"trcacatr14", TRCACATR14, {}}, + {"trcacatr15", TRCACATR15, {}}, + {"trcdvcvr0", TRCDVCVR0, {}}, + {"trcdvcvr1", TRCDVCVR1, {}}, + {"trcdvcvr2", TRCDVCVR2, {}}, + {"trcdvcvr3", TRCDVCVR3, {}}, + {"trcdvcvr4", TRCDVCVR4, {}}, + {"trcdvcvr5", TRCDVCVR5, {}}, + {"trcdvcvr6", TRCDVCVR6, {}}, + {"trcdvcvr7", TRCDVCVR7, {}}, + {"trcdvcmr0", TRCDVCMR0, {}}, + {"trcdvcmr1", TRCDVCMR1, {}}, + {"trcdvcmr2", TRCDVCMR2, {}}, + {"trcdvcmr3", TRCDVCMR3, {}}, + {"trcdvcmr4", TRCDVCMR4, {}}, + {"trcdvcmr5", TRCDVCMR5, {}}, + {"trcdvcmr6", TRCDVCMR6, {}}, + {"trcdvcmr7", TRCDVCMR7, {}}, + {"trccidcvr0", TRCCIDCVR0, {}}, + {"trccidcvr1", TRCCIDCVR1, {}}, + {"trccidcvr2", TRCCIDCVR2, {}}, + {"trccidcvr3", TRCCIDCVR3, {}}, + {"trccidcvr4", TRCCIDCVR4, {}}, + {"trccidcvr5", TRCCIDCVR5, {}}, + {"trccidcvr6", TRCCIDCVR6, {}}, + {"trccidcvr7", TRCCIDCVR7, {}}, + {"trcvmidcvr0", TRCVMIDCVR0, {}}, + {"trcvmidcvr1", TRCVMIDCVR1, {}}, + {"trcvmidcvr2", TRCVMIDCVR2, {}}, + {"trcvmidcvr3", TRCVMIDCVR3, {}}, + {"trcvmidcvr4", TRCVMIDCVR4, {}}, + {"trcvmidcvr5", TRCVMIDCVR5, {}}, + {"trcvmidcvr6", TRCVMIDCVR6, {}}, + {"trcvmidcvr7", TRCVMIDCVR7, {}}, + {"trccidcctlr0", TRCCIDCCTLR0, {}}, + {"trccidcctlr1", TRCCIDCCTLR1, {}}, + {"trcvmidcctlr0", TRCVMIDCCTLR0, {}}, + {"trcvmidcctlr1", TRCVMIDCCTLR1, {}}, + {"trcitctrl", TRCITCTRL, {}}, + {"trcclaimset", TRCCLAIMSET, {}}, + {"trcclaimclr", TRCCLAIMCLR, {}}, + + // GICv3 registers + {"icc_bpr1_el1", ICC_BPR1_EL1, {}}, + {"icc_bpr0_el1", ICC_BPR0_EL1, {}}, + {"icc_pmr_el1", ICC_PMR_EL1, {}}, + {"icc_ctlr_el1", ICC_CTLR_EL1, {}}, + 
{"icc_ctlr_el3", ICC_CTLR_EL3, {}}, + {"icc_sre_el1", ICC_SRE_EL1, {}}, + {"icc_sre_el2", ICC_SRE_EL2, {}}, + {"icc_sre_el3", ICC_SRE_EL3, {}}, + {"icc_igrpen0_el1", ICC_IGRPEN0_EL1, {}}, + {"icc_igrpen1_el1", ICC_IGRPEN1_EL1, {}}, + {"icc_igrpen1_el3", ICC_IGRPEN1_EL3, {}}, + {"icc_seien_el1", ICC_SEIEN_EL1, {}}, + {"icc_ap0r0_el1", ICC_AP0R0_EL1, {}}, + {"icc_ap0r1_el1", ICC_AP0R1_EL1, {}}, + {"icc_ap0r2_el1", ICC_AP0R2_EL1, {}}, + {"icc_ap0r3_el1", ICC_AP0R3_EL1, {}}, + {"icc_ap1r0_el1", ICC_AP1R0_EL1, {}}, + {"icc_ap1r1_el1", ICC_AP1R1_EL1, {}}, + {"icc_ap1r2_el1", ICC_AP1R2_EL1, {}}, + {"icc_ap1r3_el1", ICC_AP1R3_EL1, {}}, + {"ich_ap0r0_el2", ICH_AP0R0_EL2, {}}, + {"ich_ap0r1_el2", ICH_AP0R1_EL2, {}}, + {"ich_ap0r2_el2", ICH_AP0R2_EL2, {}}, + {"ich_ap0r3_el2", ICH_AP0R3_EL2, {}}, + {"ich_ap1r0_el2", ICH_AP1R0_EL2, {}}, + {"ich_ap1r1_el2", ICH_AP1R1_EL2, {}}, + {"ich_ap1r2_el2", ICH_AP1R2_EL2, {}}, + {"ich_ap1r3_el2", ICH_AP1R3_EL2, {}}, + {"ich_hcr_el2", ICH_HCR_EL2, {}}, + {"ich_misr_el2", ICH_MISR_EL2, {}}, + {"ich_vmcr_el2", ICH_VMCR_EL2, {}}, + {"ich_vseir_el2", ICH_VSEIR_EL2, {}}, + {"ich_lr0_el2", ICH_LR0_EL2, {}}, + {"ich_lr1_el2", ICH_LR1_EL2, {}}, + {"ich_lr2_el2", ICH_LR2_EL2, {}}, + {"ich_lr3_el2", ICH_LR3_EL2, {}}, + {"ich_lr4_el2", ICH_LR4_EL2, {}}, + {"ich_lr5_el2", ICH_LR5_EL2, {}}, + {"ich_lr6_el2", ICH_LR6_EL2, {}}, + {"ich_lr7_el2", ICH_LR7_EL2, {}}, + {"ich_lr8_el2", ICH_LR8_EL2, {}}, + {"ich_lr9_el2", ICH_LR9_EL2, {}}, + {"ich_lr10_el2", ICH_LR10_EL2, {}}, + {"ich_lr11_el2", ICH_LR11_EL2, {}}, + {"ich_lr12_el2", ICH_LR12_EL2, {}}, + {"ich_lr13_el2", ICH_LR13_EL2, {}}, + {"ich_lr14_el2", ICH_LR14_EL2, {}}, + {"ich_lr15_el2", ICH_LR15_EL2, {}}, + + // Cyclone registers + {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3, {AArch64::ProcCyclone}}, + + // v8.1a "Privileged Access Never" extension-specific system registers + {"pan", PAN, {AArch64::HasV8_1aOps}}, + + // v8.1a "Limited Ordering Regions" extension-specific system registers + {"lorsa_el1", LORSA_EL1, {AArch64::HasV8_1aOps}}, + {"lorea_el1", LOREA_EL1, {AArch64::HasV8_1aOps}}, + {"lorn_el1", LORN_EL1, {AArch64::HasV8_1aOps}}, + {"lorc_el1", LORC_EL1, {AArch64::HasV8_1aOps}}, + + // v8.1a "Virtualization host extensions" system registers + {"ttbr1_el2", TTBR1_EL2, {AArch64::HasV8_1aOps}}, + {"contextidr_el2", CONTEXTIDR_EL2, {AArch64::HasV8_1aOps}}, + {"cnthv_tval_el2", CNTHV_TVAL_EL2, {AArch64::HasV8_1aOps}}, + {"cnthv_cval_el2", CNTHV_CVAL_EL2, {AArch64::HasV8_1aOps}}, + {"cnthv_ctl_el2", CNTHV_CTL_EL2, {AArch64::HasV8_1aOps}}, + {"sctlr_el12", SCTLR_EL12, {AArch64::HasV8_1aOps}}, + {"cpacr_el12", CPACR_EL12, {AArch64::HasV8_1aOps}}, + {"ttbr0_el12", TTBR0_EL12, {AArch64::HasV8_1aOps}}, + {"ttbr1_el12", TTBR1_EL12, {AArch64::HasV8_1aOps}}, + {"tcr_el12", TCR_EL12, {AArch64::HasV8_1aOps}}, + {"afsr0_el12", AFSR0_EL12, {AArch64::HasV8_1aOps}}, + {"afsr1_el12", AFSR1_EL12, {AArch64::HasV8_1aOps}}, + {"esr_el12", ESR_EL12, {AArch64::HasV8_1aOps}}, + {"far_el12", FAR_EL12, {AArch64::HasV8_1aOps}}, + {"mair_el12", MAIR_EL12, {AArch64::HasV8_1aOps}}, + {"amair_el12", AMAIR_EL12, {AArch64::HasV8_1aOps}}, + {"vbar_el12", VBAR_EL12, {AArch64::HasV8_1aOps}}, + {"contextidr_el12", CONTEXTIDR_EL12, {AArch64::HasV8_1aOps}}, + {"cntkctl_el12", CNTKCTL_EL12, {AArch64::HasV8_1aOps}}, + {"cntp_tval_el02", CNTP_TVAL_EL02, {AArch64::HasV8_1aOps}}, + {"cntp_ctl_el02", CNTP_CTL_EL02, {AArch64::HasV8_1aOps}}, + {"cntp_cval_el02", CNTP_CVAL_EL02, {AArch64::HasV8_1aOps}}, + {"cntv_tval_el02", CNTV_TVAL_EL02, {AArch64::HasV8_1aOps}}, + 
{"cntv_ctl_el02", CNTV_CTL_EL02, {AArch64::HasV8_1aOps}}, + {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}}, + {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}}, + {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}}, + + // v8.2a registers + {"uao", UAO, {AArch64::HasV8_2aOps}}, + + // v8.2a "Statistical Profiling extension" registers + {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, + {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, + {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, + {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, + {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, + {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, + {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, + {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, + {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, + {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, + {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, + {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, + {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, +}; + +uint32_t +AArch64SysReg::SysRegMapper::fromString(StringRef Name, + const FeatureBitset& FeatureBits, bool &Valid) const { + std::string NameLower = Name.lower(); + + // First search the registers shared by all + for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { + if (SysRegMappings[i].isNameEqual(NameLower, FeatureBits)) { + Valid = true; + return SysRegMappings[i].Value; + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). + for (unsigned i = 0; i < NumInstMappings; ++i) { + if (InstMappings[i].isNameEqual(NameLower, FeatureBits)) { + Valid = true; + return InstMappings[i].Value; + } + } + + // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name + Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$"); + + SmallVector<StringRef, 5> Ops; + if (!GenericRegPattern.match(NameLower, &Ops)) { + Valid = false; + return -1; + } + + uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; + uint32_t Bits; + Ops[1].getAsInteger(10, Op0); + Ops[2].getAsInteger(10, Op1); + Ops[3].getAsInteger(10, CRn); + Ops[4].getAsInteger(10, CRm); + Ops[5].getAsInteger(10, Op2); + Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; + + Valid = true; + return Bits; +} + +std::string +AArch64SysReg::SysRegMapper::toString(uint32_t Bits, + const FeatureBitset& FeatureBits) const { + // First search the registers shared by all + for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { + if (SysRegMappings[i].isValueEqual(Bits, FeatureBits)) { + return SysRegMappings[i].Name; + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). 
+ for (unsigned i = 0; i < NumInstMappings; ++i) { + if (InstMappings[i].isValueEqual(Bits, FeatureBits)) { + return InstMappings[i].Name; + } + } + + assert(Bits < 0x10000); + uint32_t Op0 = (Bits >> 14) & 0x3; + uint32_t Op1 = (Bits >> 11) & 0x7; + uint32_t CRn = (Bits >> 7) & 0xf; + uint32_t CRm = (Bits >> 3) & 0xf; + uint32_t Op2 = Bits & 0x7; + + return "s" + utostr(Op0)+ "_" + utostr(Op1) + "_c" + utostr(CRn) + + "_c" + utostr(CRm) + "_" + utostr(Op2); +} + +const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIMappings[] = { + {"ipas2e1is", IPAS2E1IS, {}}, + {"ipas2le1is", IPAS2LE1IS, {}}, + {"vmalle1is", VMALLE1IS, {}}, + {"alle2is", ALLE2IS, {}}, + {"alle3is", ALLE3IS, {}}, + {"vae1is", VAE1IS, {}}, + {"vae2is", VAE2IS, {}}, + {"vae3is", VAE3IS, {}}, + {"aside1is", ASIDE1IS, {}}, + {"vaae1is", VAAE1IS, {}}, + {"alle1is", ALLE1IS, {}}, + {"vale1is", VALE1IS, {}}, + {"vale2is", VALE2IS, {}}, + {"vale3is", VALE3IS, {}}, + {"vmalls12e1is", VMALLS12E1IS, {}}, + {"vaale1is", VAALE1IS, {}}, + {"ipas2e1", IPAS2E1, {}}, + {"ipas2le1", IPAS2LE1, {}}, + {"vmalle1", VMALLE1, {}}, + {"alle2", ALLE2, {}}, + {"alle3", ALLE3, {}}, + {"vae1", VAE1, {}}, + {"vae2", VAE2, {}}, + {"vae3", VAE3, {}}, + {"aside1", ASIDE1, {}}, + {"vaae1", VAAE1, {}}, + {"alle1", ALLE1, {}}, + {"vale1", VALE1, {}}, + {"vale2", VALE2, {}}, + {"vale3", VALE3, {}}, + {"vmalls12e1", VMALLS12E1, {}}, + {"vaale1", VAALE1, {}} +}; + +AArch64TLBI::TLBIMapper::TLBIMapper() + : AArch64NamedImmMapper(TLBIMappings, 0) {} diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h new file mode 100644 index 0000000..f649cb9 --- /dev/null +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -0,0 +1,1393 @@ +//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the AArch64 target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H +#define LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H + +// FIXME: Is it easiest to fix this layering violation by moving the .inc +// #includes from AArch64MCTargetDesc.h to here? +#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends. 
+#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +inline static unsigned getWRegFromXReg(unsigned Reg) { + switch (Reg) { + case AArch64::X0: return AArch64::W0; + case AArch64::X1: return AArch64::W1; + case AArch64::X2: return AArch64::W2; + case AArch64::X3: return AArch64::W3; + case AArch64::X4: return AArch64::W4; + case AArch64::X5: return AArch64::W5; + case AArch64::X6: return AArch64::W6; + case AArch64::X7: return AArch64::W7; + case AArch64::X8: return AArch64::W8; + case AArch64::X9: return AArch64::W9; + case AArch64::X10: return AArch64::W10; + case AArch64::X11: return AArch64::W11; + case AArch64::X12: return AArch64::W12; + case AArch64::X13: return AArch64::W13; + case AArch64::X14: return AArch64::W14; + case AArch64::X15: return AArch64::W15; + case AArch64::X16: return AArch64::W16; + case AArch64::X17: return AArch64::W17; + case AArch64::X18: return AArch64::W18; + case AArch64::X19: return AArch64::W19; + case AArch64::X20: return AArch64::W20; + case AArch64::X21: return AArch64::W21; + case AArch64::X22: return AArch64::W22; + case AArch64::X23: return AArch64::W23; + case AArch64::X24: return AArch64::W24; + case AArch64::X25: return AArch64::W25; + case AArch64::X26: return AArch64::W26; + case AArch64::X27: return AArch64::W27; + case AArch64::X28: return AArch64::W28; + case AArch64::FP: return AArch64::W29; + case AArch64::LR: return AArch64::W30; + case AArch64::SP: return AArch64::WSP; + case AArch64::XZR: return AArch64::WZR; + } + // For anything else, return it unchanged. + return Reg; +} + +inline static unsigned getXRegFromWReg(unsigned Reg) { + switch (Reg) { + case AArch64::W0: return AArch64::X0; + case AArch64::W1: return AArch64::X1; + case AArch64::W2: return AArch64::X2; + case AArch64::W3: return AArch64::X3; + case AArch64::W4: return AArch64::X4; + case AArch64::W5: return AArch64::X5; + case AArch64::W6: return AArch64::X6; + case AArch64::W7: return AArch64::X7; + case AArch64::W8: return AArch64::X8; + case AArch64::W9: return AArch64::X9; + case AArch64::W10: return AArch64::X10; + case AArch64::W11: return AArch64::X11; + case AArch64::W12: return AArch64::X12; + case AArch64::W13: return AArch64::X13; + case AArch64::W14: return AArch64::X14; + case AArch64::W15: return AArch64::X15; + case AArch64::W16: return AArch64::X16; + case AArch64::W17: return AArch64::X17; + case AArch64::W18: return AArch64::X18; + case AArch64::W19: return AArch64::X19; + case AArch64::W20: return AArch64::X20; + case AArch64::W21: return AArch64::X21; + case AArch64::W22: return AArch64::X22; + case AArch64::W23: return AArch64::X23; + case AArch64::W24: return AArch64::X24; + case AArch64::W25: return AArch64::X25; + case AArch64::W26: return AArch64::X26; + case AArch64::W27: return AArch64::X27; + case AArch64::W28: return AArch64::X28; + case AArch64::W29: return AArch64::FP; + case AArch64::W30: return AArch64::LR; + case AArch64::WSP: return AArch64::SP; + case AArch64::WZR: return AArch64::XZR; + } + // For anything else, return it unchanged. 
+ return Reg; +} + +static inline unsigned getBRegFromDReg(unsigned Reg) { + switch (Reg) { + case AArch64::D0: return AArch64::B0; + case AArch64::D1: return AArch64::B1; + case AArch64::D2: return AArch64::B2; + case AArch64::D3: return AArch64::B3; + case AArch64::D4: return AArch64::B4; + case AArch64::D5: return AArch64::B5; + case AArch64::D6: return AArch64::B6; + case AArch64::D7: return AArch64::B7; + case AArch64::D8: return AArch64::B8; + case AArch64::D9: return AArch64::B9; + case AArch64::D10: return AArch64::B10; + case AArch64::D11: return AArch64::B11; + case AArch64::D12: return AArch64::B12; + case AArch64::D13: return AArch64::B13; + case AArch64::D14: return AArch64::B14; + case AArch64::D15: return AArch64::B15; + case AArch64::D16: return AArch64::B16; + case AArch64::D17: return AArch64::B17; + case AArch64::D18: return AArch64::B18; + case AArch64::D19: return AArch64::B19; + case AArch64::D20: return AArch64::B20; + case AArch64::D21: return AArch64::B21; + case AArch64::D22: return AArch64::B22; + case AArch64::D23: return AArch64::B23; + case AArch64::D24: return AArch64::B24; + case AArch64::D25: return AArch64::B25; + case AArch64::D26: return AArch64::B26; + case AArch64::D27: return AArch64::B27; + case AArch64::D28: return AArch64::B28; + case AArch64::D29: return AArch64::B29; + case AArch64::D30: return AArch64::B30; + case AArch64::D31: return AArch64::B31; + } + // For anything else, return it unchanged. + return Reg; +} + + +static inline unsigned getDRegFromBReg(unsigned Reg) { + switch (Reg) { + case AArch64::B0: return AArch64::D0; + case AArch64::B1: return AArch64::D1; + case AArch64::B2: return AArch64::D2; + case AArch64::B3: return AArch64::D3; + case AArch64::B4: return AArch64::D4; + case AArch64::B5: return AArch64::D5; + case AArch64::B6: return AArch64::D6; + case AArch64::B7: return AArch64::D7; + case AArch64::B8: return AArch64::D8; + case AArch64::B9: return AArch64::D9; + case AArch64::B10: return AArch64::D10; + case AArch64::B11: return AArch64::D11; + case AArch64::B12: return AArch64::D12; + case AArch64::B13: return AArch64::D13; + case AArch64::B14: return AArch64::D14; + case AArch64::B15: return AArch64::D15; + case AArch64::B16: return AArch64::D16; + case AArch64::B17: return AArch64::D17; + case AArch64::B18: return AArch64::D18; + case AArch64::B19: return AArch64::D19; + case AArch64::B20: return AArch64::D20; + case AArch64::B21: return AArch64::D21; + case AArch64::B22: return AArch64::D22; + case AArch64::B23: return AArch64::D23; + case AArch64::B24: return AArch64::D24; + case AArch64::B25: return AArch64::D25; + case AArch64::B26: return AArch64::D26; + case AArch64::B27: return AArch64::D27; + case AArch64::B28: return AArch64::D28; + case AArch64::B29: return AArch64::D29; + case AArch64::B30: return AArch64::D30; + case AArch64::B31: return AArch64::D31; + } + // For anything else, return it unchanged. + return Reg; +} + +namespace AArch64CC { + +// The CondCodes constants map directly to the 4-bit encoding of the condition +// field for predicated instructions. 
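The W/X and B/D helpers above are total: any register without a counterpart in the other class is returned unchanged, so callers can apply them unconditionally. A small usage sketch, assuming the generated AArch64 register enums and these helpers are in scope:

// Illustrative only: pick the 32-bit alias of a 64-bit GPR when the value is
// known to fit in 32 bits; non-GPR inputs simply pass through.
static unsigned pickRegForWidth(unsigned Reg, bool FitsIn32Bits) {
  return FitsIn32Bits ? getWRegFromXReg(Reg) : Reg;
}

// getWRegFromXReg(AArch64::X0)  == AArch64::W0
// getXRegFromWReg(AArch64::WZR) == AArch64::XZR
// getWRegFromXReg(AArch64::D0)  == AArch64::D0   (no 32-bit GPR counterpart)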
+enum CondCode { // Meaning (integer) Meaning (floating-point) + EQ = 0x0, // Equal Equal + NE = 0x1, // Not equal Not equal, or unordered + HS = 0x2, // Unsigned higher or same >, ==, or unordered + LO = 0x3, // Unsigned lower Less than + MI = 0x4, // Minus, negative Less than + PL = 0x5, // Plus, positive or zero >, ==, or unordered + VS = 0x6, // Overflow Unordered + VC = 0x7, // No overflow Not unordered + HI = 0x8, // Unsigned higher Greater than, or unordered + LS = 0x9, // Unsigned lower or same Less than or equal + GE = 0xa, // Greater than or equal Greater than or equal + LT = 0xb, // Less than Less than, or unordered + GT = 0xc, // Greater than Greater than + LE = 0xd, // Less than or equal <, ==, or unordered + AL = 0xe, // Always (unconditional) Always (unconditional) + NV = 0xf, // Always (unconditional) Always (unconditional) + // Note the NV exists purely to disassemble 0b1111. Execution is "always". + Invalid +}; + +inline static const char *getCondCodeName(CondCode Code) { + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return "eq"; + case NE: return "ne"; + case HS: return "hs"; + case LO: return "lo"; + case MI: return "mi"; + case PL: return "pl"; + case VS: return "vs"; + case VC: return "vc"; + case HI: return "hi"; + case LS: return "ls"; + case GE: return "ge"; + case LT: return "lt"; + case GT: return "gt"; + case LE: return "le"; + case AL: return "al"; + case NV: return "nv"; + } +} + +inline static CondCode getInvertedCondCode(CondCode Code) { + // To reverse a condition it's necessary to only invert the low bit: + + return static_cast<CondCode>(static_cast<unsigned>(Code) ^ 0x1); +} + +/// Given a condition code, return NZCV flags that would satisfy that condition. +/// The flag bits are in the format expected by the ccmp instructions. +/// Note that many different flag settings can satisfy a given condition code, +/// this function just returns one of them. +inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { + // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. + enum { N = 8, Z = 4, C = 2, V = 1 }; + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return Z; // Z == 1 + case NE: return 0; // Z == 0 + case HS: return C; // C == 1 + case LO: return 0; // C == 0 + case MI: return N; // N == 1 + case PL: return 0; // N == 0 + case VS: return V; // V == 1 + case VC: return 0; // V == 0 + case HI: return C; // C == 1 && Z == 0 + case LS: return 0; // C == 0 || Z == 1 + case GE: return 0; // N == V + case LT: return N; // N != V + case GT: return 0; // Z == 0 && N == V + case LE: return Z; // Z == 1 || N != V + } +} +} // end namespace AArch64CC + +/// Instances of this class can perform bidirectional mapping from random +/// identifier strings to operand encodings. For example "MSR" takes a named +/// system-register which must be encoded somehow and decoded for printing. This +/// central location means that the information for those transformations is not +/// duplicated and remains in sync. +/// +/// FIXME: currently the algorithm is a completely unoptimised linear +/// search. Obviously this could be improved, but we would probably want to work +/// out just how often these instructions are emitted before working on it. It +/// might even be optimal to just reorder the tables for the common instructions +/// rather than changing the algorithm. 
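The FIXME above notes that both lookup directions are plain linear scans over the mapping tables. One way that could be addressed, if profiling ever shows these paths are hot, is to keep a copy of each table sorted by name and binary-search it; a minimal sketch with standard-library types rather than the Mapping struct below, ignoring the per-entry feature gating for brevity:

#include <algorithm>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Hypothetical name->value index built once from a mapping table and then
// queried with binary search instead of a linear scan.
class SortedNameIndex {
  std::vector<std::pair<std::string, uint32_t>> Table; // sorted by name

public:
  explicit SortedNameIndex(std::vector<std::pair<std::string, uint32_t>> T)
      : Table(std::move(T)) {
    std::sort(Table.begin(), Table.end());
  }

  bool lookup(const std::string &Name, uint32_t &Value) const {
    auto It = std::lower_bound(
        Table.begin(), Table.end(), Name,
        [](const std::pair<std::string, uint32_t> &E, const std::string &N) {
          return E.first < N;
        });
    if (It == Table.end() || It->first != Name)
      return false;
    Value = It->second;
    return true;
  }
};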
+struct AArch64NamedImmMapper { + struct Mapping { + const char *Name; + uint32_t Value; + // Set of features this mapping is available for + // Zero value of FeatureBitSet means the mapping is always available + FeatureBitset FeatureBitSet; + + bool isNameEqual(std::string Other, + const FeatureBitset& FeatureBits) const { + if (FeatureBitSet.any() && + (FeatureBitSet & FeatureBits).none()) + return false; + return Name == Other; + } + + bool isValueEqual(uint32_t Other, + const FeatureBitset& FeatureBits) const { + if (FeatureBitSet.any() && + (FeatureBitSet & FeatureBits).none()) + return false; + return Value == Other; + } + }; + + template<int N> + AArch64NamedImmMapper(const Mapping (&Mappings)[N], uint32_t TooBigImm) + : Mappings(&Mappings[0]), NumMappings(N), TooBigImm(TooBigImm) {} + + // Maps value to string, depending on availability for FeatureBits given + StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits, + bool &Valid) const; + // Maps string to value, depending on availability for FeatureBits given + uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, + bool &Valid) const; + + /// Many of the instructions allow an alternative assembly form consisting of + /// a simple immediate. Currently the only valid forms are ranges [0, N) where + /// N being 0 indicates no immediate syntax-form is allowed. + bool validImm(uint32_t Value) const; +protected: + const Mapping *Mappings; + size_t NumMappings; + uint32_t TooBigImm; +}; + +namespace AArch64AT { + enum ATValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + S1E1R = 0x43c0, // 01 000 0111 1000 000 + S1E2R = 0x63c0, // 01 100 0111 1000 000 + S1E3R = 0x73c0, // 01 110 0111 1000 000 + S1E1W = 0x43c1, // 01 000 0111 1000 001 + S1E2W = 0x63c1, // 01 100 0111 1000 001 + S1E3W = 0x73c1, // 01 110 0111 1000 001 + S1E0R = 0x43c2, // 01 000 0111 1000 010 + S1E0W = 0x43c3, // 01 000 0111 1000 011 + S12E1R = 0x63c4, // 01 100 0111 1000 100 + S12E1W = 0x63c5, // 01 100 0111 1000 101 + S12E0R = 0x63c6, // 01 100 0111 1000 110 + S12E0W = 0x63c7, // 01 100 0111 1000 111 + S1E1RP = 0x43c8, // 01 000 0111 1001 000 + S1E1WP = 0x43c9 // 01 000 0111 1001 001 + }; + + struct ATMapper : AArch64NamedImmMapper { + const static Mapping ATMappings[]; + + ATMapper(); + }; + +} +namespace AArch64DB { + enum DBValues { + Invalid = -1, + OSHLD = 0x1, + OSHST = 0x2, + OSH = 0x3, + NSHLD = 0x5, + NSHST = 0x6, + NSH = 0x7, + ISHLD = 0x9, + ISHST = 0xa, + ISH = 0xb, + LD = 0xd, + ST = 0xe, + SY = 0xf + }; + + struct DBarrierMapper : AArch64NamedImmMapper { + const static Mapping DBarrierMappings[]; + + DBarrierMapper(); + }; +} + +namespace AArch64DC { + enum DCValues { + Invalid = -1, // Op1 CRn CRm Op2 + ZVA = 0x5ba1, // 01 011 0111 0100 001 + IVAC = 0x43b1, // 01 000 0111 0110 001 + ISW = 0x43b2, // 01 000 0111 0110 010 + CVAC = 0x5bd1, // 01 011 0111 1010 001 + CSW = 0x43d2, // 01 000 0111 1010 010 + CVAU = 0x5bd9, // 01 011 0111 1011 001 + CIVAC = 0x5bf1, // 01 011 0111 1110 001 + CISW = 0x43f2 // 01 000 0111 1110 010 + }; + + struct DCMapper : AArch64NamedImmMapper { + const static Mapping DCMappings[]; + + DCMapper(); + }; + +} + +namespace AArch64IC { + enum ICValues { + Invalid = -1, // Op1 CRn CRm Op2 + IALLUIS = 0x0388, // 000 0111 0001 000 + IALLU = 0x03a8, // 000 0111 0101 000 + IVAU = 0x1ba9 // 011 0111 0101 001 + }; + + + struct ICMapper : AArch64NamedImmMapper { + const static Mapping ICMappings[]; + + ICMapper(); + }; + + static inline bool NeedsRegister(ICValues Val) { + return Val == IVAU; + } +} + +namespace 
AArch64ISB { + enum ISBValues { + Invalid = -1, + SY = 0xf + }; + struct ISBMapper : AArch64NamedImmMapper { + const static Mapping ISBMappings[]; + + ISBMapper(); + }; +} + +namespace AArch64PRFM { + enum PRFMValues { + Invalid = -1, + PLDL1KEEP = 0x00, + PLDL1STRM = 0x01, + PLDL2KEEP = 0x02, + PLDL2STRM = 0x03, + PLDL3KEEP = 0x04, + PLDL3STRM = 0x05, + PLIL1KEEP = 0x08, + PLIL1STRM = 0x09, + PLIL2KEEP = 0x0a, + PLIL2STRM = 0x0b, + PLIL3KEEP = 0x0c, + PLIL3STRM = 0x0d, + PSTL1KEEP = 0x10, + PSTL1STRM = 0x11, + PSTL2KEEP = 0x12, + PSTL2STRM = 0x13, + PSTL3KEEP = 0x14, + PSTL3STRM = 0x15 + }; + + struct PRFMMapper : AArch64NamedImmMapper { + const static Mapping PRFMMappings[]; + + PRFMMapper(); + }; +} + +namespace AArch64PState { + enum PStateValues { + Invalid = -1, + SPSel = 0x05, + DAIFSet = 0x1e, + DAIFClr = 0x1f, + + // v8.1a "Privileged Access Never" extension-specific PStates + PAN = 0x04, + + // v8.2a "User Access Override" extension-specific PStates + UAO = 0x03 + }; + + struct PStateMapper : AArch64NamedImmMapper { + const static Mapping PStateMappings[]; + + PStateMapper(); + }; + +} + +namespace AArch64PSBHint { + enum PSBHintValues { + Invalid = -1, + // v8.2a "Statistical Profiling" extension-specific PSB operands + CSync = 0x11, // psb csync = hint #0x11 + }; + + struct PSBHintMapper : AArch64NamedImmMapper { + const static Mapping PSBHintMappings[]; + + PSBHintMapper(); + }; + +} + +namespace AArch64SE { + enum ShiftExtSpecifiers { + Invalid = -1, + LSL, + MSL, + LSR, + ASR, + ROR, + + UXTB, + UXTH, + UXTW, + UXTX, + + SXTB, + SXTH, + SXTW, + SXTX + }; +} + +namespace AArch64Layout { + enum VectorLayout { + Invalid = -1, + VL_8B, + VL_4H, + VL_2S, + VL_1D, + + VL_16B, + VL_8H, + VL_4S, + VL_2D, + + // Bare layout for the 128-bit vector + // (only show ".b", ".h", ".s", ".d" without vector number) + VL_B, + VL_H, + VL_S, + VL_D + }; +} + +inline static const char * +AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) { + switch (Layout) { + case AArch64Layout::VL_8B: return ".8b"; + case AArch64Layout::VL_4H: return ".4h"; + case AArch64Layout::VL_2S: return ".2s"; + case AArch64Layout::VL_1D: return ".1d"; + case AArch64Layout::VL_16B: return ".16b"; + case AArch64Layout::VL_8H: return ".8h"; + case AArch64Layout::VL_4S: return ".4s"; + case AArch64Layout::VL_2D: return ".2d"; + case AArch64Layout::VL_B: return ".b"; + case AArch64Layout::VL_H: return ".h"; + case AArch64Layout::VL_S: return ".s"; + case AArch64Layout::VL_D: return ".d"; + default: llvm_unreachable("Unknown Vector Layout"); + } +} + +inline static AArch64Layout::VectorLayout +AArch64StringToVectorLayout(StringRef LayoutStr) { + return StringSwitch<AArch64Layout::VectorLayout>(LayoutStr) + .Case(".8b", AArch64Layout::VL_8B) + .Case(".4h", AArch64Layout::VL_4H) + .Case(".2s", AArch64Layout::VL_2S) + .Case(".1d", AArch64Layout::VL_1D) + .Case(".16b", AArch64Layout::VL_16B) + .Case(".8h", AArch64Layout::VL_8H) + .Case(".4s", AArch64Layout::VL_4S) + .Case(".2d", AArch64Layout::VL_2D) + .Case(".b", AArch64Layout::VL_B) + .Case(".h", AArch64Layout::VL_H) + .Case(".s", AArch64Layout::VL_S) + .Case(".d", AArch64Layout::VL_D) + .Default(AArch64Layout::Invalid); +} + +namespace AArch64SysReg { + enum SysRegROValues { + MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 + DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 + MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 + OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 + DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 + PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 + 
PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 + MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 + CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 + CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 + CTR_EL0 = 0xd801, // 11 011 0000 0000 001 + MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 + REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 + AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 + DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 + ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 + ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 + ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 + ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 + ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 + ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 + ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 + ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 + ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 + ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 + ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 + ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 + ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 + ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 + ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 + ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 + ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 + ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 + ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 + ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 + ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 + ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 + ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 + ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010 + MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 + MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 + MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 + RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 + RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 + RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 + ISR_EL1 = 0xc608, // 11 000 1100 0001 000 + CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001 + CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 + ID_MMFR4_EL1 = 0xc016, // 11 000 0000 0010 110 + + // Trace registers + TRCSTATR = 0x8818, // 10 001 0000 0011 000 + TRCIDR8 = 0x8806, // 10 001 0000 0000 110 + TRCIDR9 = 0x880e, // 10 001 0000 0001 110 + TRCIDR10 = 0x8816, // 10 001 0000 0010 110 + TRCIDR11 = 0x881e, // 10 001 0000 0011 110 + TRCIDR12 = 0x8826, // 10 001 0000 0100 110 + TRCIDR13 = 0x882e, // 10 001 0000 0101 110 + TRCIDR0 = 0x8847, // 10 001 0000 1000 111 + TRCIDR1 = 0x884f, // 10 001 0000 1001 111 + TRCIDR2 = 0x8857, // 10 001 0000 1010 111 + TRCIDR3 = 0x885f, // 10 001 0000 1011 111 + TRCIDR4 = 0x8867, // 10 001 0000 1100 111 + TRCIDR5 = 0x886f, // 10 001 0000 1101 111 + TRCIDR6 = 0x8877, // 10 001 0000 1110 111 + TRCIDR7 = 0x887f, // 10 001 0000 1111 111 + TRCOSLSR = 0x888c, // 10 001 0001 0001 100 + TRCPDSR = 0x88ac, // 10 001 0001 0101 100 + TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 + TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 + TRCLSR = 0x8bee, // 10 001 0111 1101 110 + TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 + TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 + TRCDEVID = 0x8b97, // 10 001 0111 0010 111 + TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 + TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 + TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 + TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 + TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 + TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 + TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 + TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 + TRCPIDR3 = 
0x8bdf, // 10 001 0111 1011 111 + TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 + TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 + TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 + TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 + + // GICv3 registers + ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 + ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 + ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 + ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 + ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 + ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 + ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 + ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101 + }; + + enum SysRegWOValues { + DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 + OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 + PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 + + // Trace Registers + TRCOSLAR = 0x8884, // 10 001 0001 0000 100 + TRCLAR = 0x8be6, // 10 001 0111 1100 110 + + // GICv3 registers + ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 + ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 + ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 + ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 + ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 + ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 + }; + + enum SysRegValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 + OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 + TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000 + MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 + MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 + DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 + OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 + DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 + DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 + DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 + DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 + DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 + DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 + DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 + DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 + DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 + DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 + DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 + DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 + DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 + DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 + DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 + DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 + DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 + DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 + DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 + DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 + DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 + DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 + DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 + DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 + DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 + DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 + DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 + DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 + DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 + DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 + DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 + DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 + DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 + DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 + DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 + DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 + DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 + DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 + DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 + DBGWVR6_EL1 = 0x8036, // 10 000 
0000 0110 110 + DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 + DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 + DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 + DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 + DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 + DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 + DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 + DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 + DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 + DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 + DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 + DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 + DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 + DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 + DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 + DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 + DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 + DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 + DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 + DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 + DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 + DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 + DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 + DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 + DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 + TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 + OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 + DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 + DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 + DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110 + CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 + VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 + VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 + CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 + SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 + SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 + SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 + ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 + ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 + ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 + HCR_EL2 = 0xe088, // 11 100 0001 0001 000 + SCR_EL3 = 0xf088, // 11 110 0001 0001 000 + MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 + SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 + CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 + CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 + HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 + HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 + MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 + TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 + TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 + TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 + TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 + TCR_EL1 = 0xc102, // 11 000 0010 0000 010 + TCR_EL2 = 0xe102, // 11 100 0010 0000 010 + TCR_EL3 = 0xf102, // 11 110 0010 0000 010 + VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 + VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 + DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 + SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 + SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 + SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 + ELR_EL1 = 0xc201, // 11 000 0100 0000 001 + ELR_EL2 = 0xe201, // 11 100 0100 0000 001 + ELR_EL3 = 0xf201, // 11 110 0100 0000 001 + SP_EL0 = 0xc208, // 11 000 0100 0001 000 + SP_EL1 = 0xe208, // 11 100 0100 0001 000 + SP_EL2 = 0xf208, // 11 110 0100 0001 000 + SPSel = 0xc210, // 11 000 0100 0010 000 + NZCV = 0xda10, // 11 011 0100 0010 000 + DAIF = 0xda11, // 11 011 0100 0010 001 + CurrentEL = 0xc212, // 11 000 0100 0010 010 + SPSR_irq = 0xe218, // 11 100 0100 0011 000 + SPSR_abt = 0xe219, // 11 100 0100 0011 001 + SPSR_und = 0xe21a, // 11 100 0100 0011 010 + SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 + FPCR = 
0xda20, // 11 011 0100 0100 000 + FPSR = 0xda21, // 11 011 0100 0100 001 + DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 + DLR_EL0 = 0xda29, // 11 011 0100 0101 001 + IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 + AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 + AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 + AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 + AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 + AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 + AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 + ESR_EL1 = 0xc290, // 11 000 0101 0010 000 + ESR_EL2 = 0xe290, // 11 100 0101 0010 000 + ESR_EL3 = 0xf290, // 11 110 0101 0010 000 + FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 + FAR_EL1 = 0xc300, // 11 000 0110 0000 000 + FAR_EL2 = 0xe300, // 11 100 0110 0000 000 + FAR_EL3 = 0xf300, // 11 110 0110 0000 000 + HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 + PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 + PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 + PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 + PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 + PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 + PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 + PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 + PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 + PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 + PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 + PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 + PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 + PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 + MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 + MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 + MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 + AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 + AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 + AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 + VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 + VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 + VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 + RMR_EL1 = 0xc602, // 11 000 1100 0000 010 + RMR_EL2 = 0xe602, // 11 100 1100 0000 010 + RMR_EL3 = 0xf602, // 11 110 1100 0000 010 + CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 + TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 + TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 + TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 + TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 + TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 + CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 + CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 + CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 + CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 + CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 + CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 + CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 + CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 + CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 + CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 + CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 + CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 + CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 + CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 + CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 + CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 + PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 + PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 + PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 + PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 + PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 + PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 + PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 + PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 + PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 + 
PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 + PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 + PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 + PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 + PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 + PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 + PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 + PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 + PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 + PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 + PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 + PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 + PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 + PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 + PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 + PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 + PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 + PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 + PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 + PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 + PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 + PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 + PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 + PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 + PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 + PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 + PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 + PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 + PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 + PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 + PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 + PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 + PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 + PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 + PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 + PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 + PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 + PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 + PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 + PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 + PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 + PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 + PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 + PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 + PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 + PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 + PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 + PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 + PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 + PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 + PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 + PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 + PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 + PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 + + // Trace registers + TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 + TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 + TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 + TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 + TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 + TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 + TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 + TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 + TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 + TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 + TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 + TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 + TRCQCTLR = 0x8809, // 10 001 0000 0001 001 + TRCVICTLR = 0x8802, // 10 001 0000 0000 010 + TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 + TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 + 
TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 + TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 + TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 + TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 + TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 + TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 + TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 + TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 + TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 + TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 + TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 + TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 + TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101 + TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 + TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 + TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 + TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 + TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 + TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 + TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 + TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 + TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 + TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 + TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 + TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 + TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 + TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 + TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 + TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 + TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 + TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 + TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 + TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 + TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 + TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 + TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 + TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 + TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 + TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 + TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 + TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 + TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 + TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 + TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 + TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 + TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 + TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 + TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 + TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 + TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 + TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 + TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 + TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 + TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 + TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 + TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 + TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 + TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 + TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 + TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 + TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 + TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 + TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 + TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 + TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 + TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 + TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 + TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 + TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 + TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 + TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 + TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 + TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 + TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 + TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 + 
TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 + TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 + TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 + TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 + TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 + TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 + TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 + TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 + TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 + TRCPDCR = 0x88a4, // 10 001 0001 0100 100 + TRCACVR0 = 0x8900, // 10 001 0010 0000 000 + TRCACVR1 = 0x8910, // 10 001 0010 0010 000 + TRCACVR2 = 0x8920, // 10 001 0010 0100 000 + TRCACVR3 = 0x8930, // 10 001 0010 0110 000 + TRCACVR4 = 0x8940, // 10 001 0010 1000 000 + TRCACVR5 = 0x8950, // 10 001 0010 1010 000 + TRCACVR6 = 0x8960, // 10 001 0010 1100 000 + TRCACVR7 = 0x8970, // 10 001 0010 1110 000 + TRCACVR8 = 0x8901, // 10 001 0010 0000 001 + TRCACVR9 = 0x8911, // 10 001 0010 0010 001 + TRCACVR10 = 0x8921, // 10 001 0010 0100 001 + TRCACVR11 = 0x8931, // 10 001 0010 0110 001 + TRCACVR12 = 0x8941, // 10 001 0010 1000 001 + TRCACVR13 = 0x8951, // 10 001 0010 1010 001 + TRCACVR14 = 0x8961, // 10 001 0010 1100 001 + TRCACVR15 = 0x8971, // 10 001 0010 1110 001 + TRCACATR0 = 0x8902, // 10 001 0010 0000 010 + TRCACATR1 = 0x8912, // 10 001 0010 0010 010 + TRCACATR2 = 0x8922, // 10 001 0010 0100 010 + TRCACATR3 = 0x8932, // 10 001 0010 0110 010 + TRCACATR4 = 0x8942, // 10 001 0010 1000 010 + TRCACATR5 = 0x8952, // 10 001 0010 1010 010 + TRCACATR6 = 0x8962, // 10 001 0010 1100 010 + TRCACATR7 = 0x8972, // 10 001 0010 1110 010 + TRCACATR8 = 0x8903, // 10 001 0010 0000 011 + TRCACATR9 = 0x8913, // 10 001 0010 0010 011 + TRCACATR10 = 0x8923, // 10 001 0010 0100 011 + TRCACATR11 = 0x8933, // 10 001 0010 0110 011 + TRCACATR12 = 0x8943, // 10 001 0010 1000 011 + TRCACATR13 = 0x8953, // 10 001 0010 1010 011 + TRCACATR14 = 0x8963, // 10 001 0010 1100 011 + TRCACATR15 = 0x8973, // 10 001 0010 1110 011 + TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 + TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 + TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100 + TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 + TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 + TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 + TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 + TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 + TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 + TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 + TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 + TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 + TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 + TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 + TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 + TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 + TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 + TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 + TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 + TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 + TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 + TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 + TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 + TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 + TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 + TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 + TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 + TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 + TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 + TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 + TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 + TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 + TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 + TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 + TRCVMIDCCTLR0 = 
0x8992, // 10 001 0011 0010 010 + TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 + TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 + TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 + TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 + + // GICv3 registers + ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 + ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 + ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 + ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 + ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 + ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 + ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 + ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101 + ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 + ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 + ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 + ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 + ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 + ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 + ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 + ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 + ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 + ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 + ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 + ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 + ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 + ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 + ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 + ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011 + ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 + ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 + ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 + ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 + ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 + ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 + ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 + ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 + ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 + ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 + ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 + ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 + ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100 + ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 + ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 + ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 + ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 + ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 + ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 + ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 + ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 + ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 + ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 + ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 + + // v8.1a "Privileged Access Never" extension-specific system registers + PAN = 0xc213, // 11 000 0100 0010 011 + + // v8.1a "Limited Ordering Regions" extension-specific system registers + LORSA_EL1 = 0xc520, // 11 000 1010 0100 000 + LOREA_EL1 = 0xc521, // 11 000 1010 0100 001 + LORN_EL1 = 0xc522, // 11 000 1010 0100 010 + LORC_EL1 = 0xc523, // 11 000 1010 0100 011 + LORID_EL1 = 0xc527, // 11 000 1010 0100 111 + + // v8.1a "Virtualization host extensions" system registers + TTBR1_EL2 = 0xe101, // 11 100 0010 0000 001 + CONTEXTIDR_EL2 = 0xe681, // 11 100 1101 0000 001 + CNTHV_TVAL_EL2 = 0xe718, // 11 100 1110 0011 000 + CNTHV_CVAL_EL2 = 0xe71a, // 11 100 1110 0011 010 + CNTHV_CTL_EL2 = 0xe719, // 11 100 1110 0011 001 + SCTLR_EL12 = 0xe880, // 11 101 0001 0000 000 + CPACR_EL12 = 0xe882, // 11 101 0001 0000 010 + TTBR0_EL12 = 0xe900, // 11 101 0010 0000 000 + TTBR1_EL12 = 0xe901, // 11 101 0010 0000 001 + TCR_EL12 = 0xe902, // 
11 101 0010 0000 010 + AFSR0_EL12 = 0xea88, // 11 101 0101 0001 000 + AFSR1_EL12 = 0xea89, // 11 101 0101 0001 001 + ESR_EL12 = 0xea90, // 11 101 0101 0010 000 + FAR_EL12 = 0xeb00, // 11 101 0110 0000 000 + MAIR_EL12 = 0xed10, // 11 101 1010 0010 000 + AMAIR_EL12 = 0xed18, // 11 101 1010 0011 000 + VBAR_EL12 = 0xee00, // 11 101 1100 0000 000 + CONTEXTIDR_EL12 = 0xee81, // 11 101 1101 0000 001 + CNTKCTL_EL12 = 0xef08, // 11 101 1110 0001 000 + CNTP_TVAL_EL02 = 0xef10, // 11 101 1110 0010 000 + CNTP_CTL_EL02 = 0xef11, // 11 101 1110 0010 001 + CNTP_CVAL_EL02 = 0xef12, // 11 101 1110 0010 010 + CNTV_TVAL_EL02 = 0xef18, // 11 101 1110 0011 000 + CNTV_CTL_EL02 = 0xef19, // 11 101 1110 0011 001 + CNTV_CVAL_EL02 = 0xef1a, // 11 101 1110 0011 010 + SPSR_EL12 = 0xea00, // 11 101 0100 0000 000 + ELR_EL12 = 0xea01, // 11 101 0100 0000 001 + + // v8.2a registers + UAO = 0xc214, // 11 000 0100 0010 100 + + // v8.2a "Statistical Profiling extension" registers + PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000 + PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001 + PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011 + PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111 + PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000 + PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000 + PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000 + PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010 + PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011 + PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100 + PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101 + PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110 + PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111 + + // Cyclone specific system registers + CPM_IOACC_CTL_EL3 = 0xff90, + }; + + // Note that these do not inherit from AArch64NamedImmMapper. This class is + // sufficiently different in its behaviour that I don't believe it's worth + // burdening the common AArch64NamedImmMapper with abstractions only needed in + // this one case. 
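
Throughout the register tables above, the trailing comments list the Op0, Op1, CRn, CRm and Op2 fields of each system register, and the enumerator value is simply those fields packed into 16 bits (Op0 in bits 15:14, Op1 in 13:11, CRn in 10:7, CRm in 6:3, Op2 in 2:0); the SysRegMapper below then maps between these packed encodings and register names. A small standalone check of that packing against two entries from the tables (packSysReg is an illustrative helper, not part of this header):

    #include <cassert>
    #include <cstdint>

    // Pack the five system-register operand fields the way the tables above do:
    // Op0[15:14] Op1[13:11] CRn[10:7] CRm[6:3] Op2[2:0].
    static uint32_t packSysReg(unsigned Op0, unsigned Op1, unsigned CRn,
                               unsigned CRm, unsigned Op2) {
      return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
    }

    int main() {
      assert(packSysReg(3, 0, 1, 0, 0) == 0xc080); // SCTLR_EL1: 11 000 0001 0000 000
      assert(packSysReg(3, 3, 4, 2, 0) == 0xda10); // NZCV:      11 011 0100 0010 000
      return 0;
    }
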
+ struct SysRegMapper { + static const AArch64NamedImmMapper::Mapping SysRegMappings[]; + + const AArch64NamedImmMapper::Mapping *InstMappings; + size_t NumInstMappings; + + SysRegMapper() { } + uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, + bool &Valid) const; + std::string toString(uint32_t Bits, const FeatureBitset& FeatureBits) const; + }; + + struct MSRMapper : SysRegMapper { + static const AArch64NamedImmMapper::Mapping MSRMappings[]; + MSRMapper(); + }; + + struct MRSMapper : SysRegMapper { + static const AArch64NamedImmMapper::Mapping MRSMappings[]; + MRSMapper(); + }; + + uint32_t ParseGenericRegister(StringRef Name, bool &Valid); +} + +namespace AArch64TLBI { + enum TLBIValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 + IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101 + VMALLE1IS = 0x4418, // 01 000 1000 0011 000 + ALLE2IS = 0x6418, // 01 100 1000 0011 000 + ALLE3IS = 0x7418, // 01 110 1000 0011 000 + VAE1IS = 0x4419, // 01 000 1000 0011 001 + VAE2IS = 0x6419, // 01 100 1000 0011 001 + VAE3IS = 0x7419, // 01 110 1000 0011 001 + ASIDE1IS = 0x441a, // 01 000 1000 0011 010 + VAAE1IS = 0x441b, // 01 000 1000 0011 011 + ALLE1IS = 0x641c, // 01 100 1000 0011 100 + VALE1IS = 0x441d, // 01 000 1000 0011 101 + VALE2IS = 0x641d, // 01 100 1000 0011 101 + VALE3IS = 0x741d, // 01 110 1000 0011 101 + VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110 + VAALE1IS = 0x441f, // 01 000 1000 0011 111 + IPAS2E1 = 0x6421, // 01 100 1000 0100 001 + IPAS2LE1 = 0x6425, // 01 100 1000 0100 101 + VMALLE1 = 0x4438, // 01 000 1000 0111 000 + ALLE2 = 0x6438, // 01 100 1000 0111 000 + ALLE3 = 0x7438, // 01 110 1000 0111 000 + VAE1 = 0x4439, // 01 000 1000 0111 001 + VAE2 = 0x6439, // 01 100 1000 0111 001 + VAE3 = 0x7439, // 01 110 1000 0111 001 + ASIDE1 = 0x443a, // 01 000 1000 0111 010 + VAAE1 = 0x443b, // 01 000 1000 0111 011 + ALLE1 = 0x643c, // 01 100 1000 0111 100 + VALE1 = 0x443d, // 01 000 1000 0111 101 + VALE2 = 0x643d, // 01 100 1000 0111 101 + VALE3 = 0x743d, // 01 110 1000 0111 101 + VMALLS12E1 = 0x643e, // 01 100 1000 0111 110 + VAALE1 = 0x443f // 01 000 1000 0111 111 + }; + + struct TLBIMapper : AArch64NamedImmMapper { + const static Mapping TLBIMappings[]; + + TLBIMapper(); + }; + + static inline bool NeedsRegister(TLBIValues Val) { + switch (Val) { + case VMALLE1IS: + case ALLE2IS: + case ALLE3IS: + case ALLE1IS: + case VMALLS12E1IS: + case VMALLE1: + case ALLE2: + case ALLE3: + case ALLE1: + case VMALLS12E1: + return false; + default: + return true; + } + } +} + +namespace AArch64II { + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // AArch64 Specific MachineOperand flags. + + MO_NO_FLAG, + + MO_FRAGMENT = 0xf, + + /// MO_PAGE - A symbol operand with this flag represents the pc-relative + /// offset of the 4K page containing the symbol. This is used with the + /// ADRP instruction. + MO_PAGE = 1, + + /// MO_PAGEOFF - A symbol operand with this flag represents the offset of + /// that symbol within a 4K page. This offset is added to the page address + /// to produce the complete address. 
+    MO_PAGEOFF = 2,
+
+    /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
+    /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G3 = 3,
+
+    /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
+    /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G2 = 4,
+
+    /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
+    /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G1 = 5,
+
+    /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
+    /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G0 = 6,
+
+    /// MO_HI12 - This flag indicates that a symbol operand represents the bits
+    /// 13-24 of a 64-bit address, used in an arithmetic immediate-shifted-left-
+    /// by-12-bits instruction.
+    MO_HI12 = 7,
+
+    /// MO_GOT - This flag indicates that a symbol operand represents the
+    /// address of the GOT entry for the symbol, rather than the address of
+    /// the symbol itself.
+    MO_GOT = 0x10,
+
+    /// MO_NC - Indicates whether the linker is expected to check the symbol
+    /// reference for overflow. For example in an ADRP/ADD pair of relocations
+    /// the ADRP usually does check, but not the ADD.
+    MO_NC = 0x20,
+
+    /// MO_TLS - Indicates that the operand being accessed is some kind of
+    /// thread-local symbol. On Darwin, only one type of thread-local access
+    /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
+    /// referee will affect interpretation.
+    MO_TLS = 0x40,
+
+    /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
+    /// the address of a constant pool entry for the symbol, rather than the
+    /// address of the symbol itself.
+    MO_CONSTPOOL = 0x80
+  };
+} // end namespace AArch64II
+
+} // end namespace llvm
+
+#endif
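
As a closing note on the AArch64II flags: MO_FRAGMENT is a mask over the low nibble that selects which slice of an address a symbol operand refers to (MO_PAGE through MO_HI12), while MO_GOT, MO_NC, MO_TLS and MO_CONSTPOOL are independent modifier bits OR'd on top. A minimal sketch of how such flags combine and are queried, using local constants that mirror the values above (not code from this header):

    #include <cassert>

    // Local mirror of a few AArch64II target-operand flag values shown above.
    enum {
      MO_FRAGMENT = 0xf, // low nibble: which slice of the address
      MO_PAGE     = 1,
      MO_PAGEOFF  = 2,
      MO_NC       = 0x20 // modifier: linker overflow check not expected
    };

    int main() {
      // The ADD of an ADRP/ADD pair typically carries MO_PAGEOFF | MO_NC.
      unsigned AddFlags = MO_PAGEOFF | MO_NC;
      assert((AddFlags & MO_FRAGMENT) == MO_PAGEOFF); // address fragment selector
      assert((AddFlags & MO_NC) != 0);                // overflow check suppressed
      return 0;
    }
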