author     dim <dim@FreeBSD.org>                      2015-01-18 16:17:27 +0000
committer  dim <dim@FreeBSD.org>                      2015-01-18 16:17:27 +0000
commit     081af4da16b9046c019ca40f64b1fb7ee8c6dca1
tree       4abb9cbeecc7901726dd0b4a37369596c852e9ef   /lib/Target/AArch64
parent     3c7e7a1538a873b0d3b012ef8811969ac4552c2a
Vendor import of llvm RELEASE_360/rc1 tag r226102 (effectively, 3.6.0 RC1):
https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_360/rc1@226102
Diffstat (limited to 'lib/Target/AArch64')
76 files changed, 9622 insertions, 1836 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 1c022aa..e96d18b 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -12,13 +12,13 @@ // //===----------------------------------------------------------------------===// -#ifndef TARGET_AArch64_H -#define TARGET_AArch64_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64_H -#include "Utils/AArch64BaseInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/Target/TargetMachine.h" +#include "Utils/AArch64BaseInfo.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Target/TargetMachine.h" namespace llvm { @@ -36,7 +36,10 @@ FunctionPass *createAArch64StorePairSuppressPass(); FunctionPass *createAArch64ExpandPseudoPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); ModulePass *createAArch64PromoteConstantPass(); +FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); +FunctionPass *createAArch64A57FPLoadBalancing(); +FunctionPass *createAArch64A53Fix835769(); /// \brief Creates an ARM-specific Target Transformation Info pass. ImmutablePass * createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM); diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp new file mode 100644 index 0000000..852a635 --- /dev/null +++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -0,0 +1,240 @@ +//===-- AArch64A53Fix835769.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass changes code to work around Cortex-A53 erratum 835769. +// It works around it by inserting a nop instruction in code sequences that +// in some circumstances may trigger the erratum. +// It inserts a nop instruction between a sequence of the following 2 classes +// of instructions: +// instr 1: mem-instr (including loads, stores and prefetches). +// instr 2: non-SIMD integer multiply-accumulate writing 64-bit X registers. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-fix-cortex-a53-835769" + +STATISTIC(NumNopsAdded, "Number of Nops added to work around erratum 835769"); + +//===----------------------------------------------------------------------===// +// Helper functions + +// Is the instruction a match for the instruction that comes first in the +// sequence of instructions that can trigger the erratum? +static bool isFirstInstructionInSequence(MachineInstr *MI) { + // Must return true if this instruction is a load, a store or a prefetch. 
+ switch (MI->getOpcode()) { + case AArch64::PRFMl: + case AArch64::PRFMroW: + case AArch64::PRFMroX: + case AArch64::PRFMui: + case AArch64::PRFUMi: + return true; + default: + return (MI->mayLoad() || MI->mayStore()); + } +} + +// Is the instruction a match for the instruction that comes second in the +// sequence that can trigger the erratum? +static bool isSecondInstructionInSequence(MachineInstr *MI) { + // Must return true for non-SIMD integer multiply-accumulates, writing + // to a 64-bit register. + switch (MI->getOpcode()) { + // Erratum cannot be triggered when the destination register is 32 bits, + // therefore only include the following. + case AArch64::MSUBXrrr: + case AArch64::MADDXrrr: + case AArch64::SMADDLrrr: + case AArch64::SMSUBLrrr: + case AArch64::UMADDLrrr: + case AArch64::UMSUBLrrr: + // Erratum can only be triggered by multiply-adds, not by regular + // non-accumulating multiplies, i.e. when Ra=XZR='11111' + return MI->getOperand(3).getReg() != AArch64::XZR; + default: + return false; + } +} + + +//===----------------------------------------------------------------------===// + +namespace { +class AArch64A53Fix835769 : public MachineFunctionPass { + const AArch64InstrInfo *TII; + +public: + static char ID; + explicit AArch64A53Fix835769() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { + return "Workaround A53 erratum 835769 pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool runOnBasicBlock(MachineBasicBlock &MBB); +}; +char AArch64A53Fix835769::ID = 0; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// + +bool +AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { + const TargetMachine &TM = F.getTarget(); + + bool Changed = false; + DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n"); + + TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo(); + + for (auto &MBB : F) { + Changed |= runOnBasicBlock(MBB); + } + + return Changed; +} + +// Return the block that was fallen through to get to MBB, if any, +// otherwise nullptr. +static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + // Get the previous machine basic block in the function. + MachineFunction::iterator MBBI = *MBB; + + // Can't go off top of function. + if (MBBI == MBB->getParent()->begin()) + return nullptr; + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 2> Cond; + + MachineBasicBlock *PrevBB = std::prev(MBBI); + for (MachineBasicBlock *S : MBB->predecessors()) + if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && + !TBB && !FBB) + return S; + + return nullptr; +} + +// Iterate through fallen through blocks trying to find a previous non-pseudo if +// there is one, otherwise return nullptr. Only look for instructions in +// previous blocks, not the current block, since we only use this to look at +// previous blocks. +static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB, + const TargetInstrInfo *TII) { + MachineBasicBlock *FMBB = &MBB; + + // If there is no non-pseudo in the current block, loop back around and try + // the previous block (if there is one). 
+ while ((FMBB = getBBFallenThrough(FMBB, TII))) { + for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) { + if (!I->isPseudo()) + return &*I; + } + } + + // There was no previous non-pseudo in the fallen through blocks + return nullptr; +} + +static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI, + const TargetInstrInfo *TII) { + // If we are the first instruction of the block, put the NOP at the end of + // the previous fallthrough block + if (MI == &MBB.front()) { + MachineInstr *I = getLastNonPseudo(MBB, TII); + assert(I && "Expected instruction"); + DebugLoc DL = I->getDebugLoc(); + BuildMI(I->getParent(), DL, TII->get(AArch64::HINT)).addImm(0); + } + else { + DebugLoc DL = MI->getDebugLoc(); + BuildMI(MBB, MI, DL, TII->get(AArch64::HINT)).addImm(0); + } + + ++NumNopsAdded; +} + +bool +AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n"); + + // First, scan the basic block, looking for a sequence of 2 instructions + // that match the conditions under which the erratum may trigger. + + // List of terminating instructions in matching sequences + std::vector<MachineInstr*> Sequences; + unsigned Idx = 0; + MachineInstr *PrevInstr = nullptr; + + // Try and find the last non-pseudo instruction in any fallen through blocks, + // if there isn't one, then we use nullptr to represent that. + PrevInstr = getLastNonPseudo(MBB, TII); + + for (auto &MI : MBB) { + MachineInstr *CurrInstr = &MI; + DEBUG(dbgs() << " Examining: " << MI); + if (PrevInstr) { + DEBUG(dbgs() << " PrevInstr: " << *PrevInstr + << " CurrInstr: " << *CurrInstr + << " isFirstInstructionInSequence(PrevInstr): " + << isFirstInstructionInSequence(PrevInstr) << "\n" + << " isSecondInstructionInSequence(CurrInstr): " + << isSecondInstructionInSequence(CurrInstr) << "\n"); + if (isFirstInstructionInSequence(PrevInstr) && + isSecondInstructionInSequence(CurrInstr)) { + DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n"); + Sequences.push_back(CurrInstr); + } + } + if (!CurrInstr->isPseudo()) + PrevInstr = CurrInstr; + ++Idx; + } + + DEBUG(dbgs() << "Scan complete, "<< Sequences.size() + << " occurences of pattern found.\n"); + + // Then update the basic block, inserting nops between the detected sequences. + for (auto &MI : Sequences) { + Changed = true; + insertNopBeforeInstruction(MBB, MI, TII); + } + + return Changed; +} + +// Factory function used by AArch64TargetMachine to add the pass to +// the passmanager. +FunctionPass *llvm::createAArch64A53Fix835769() { + return new AArch64A53Fix835769(); +} diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp new file mode 100644 index 0000000..dd1a1ea --- /dev/null +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -0,0 +1,706 @@ +//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// For best-case performance on Cortex-A57, we should try to use a balanced +// mix of odd and even D-registers when performing a critical sequence of +// independent, non-quadword FP/ASIMD floating-point multiply or +// multiply-accumulate operations. 
+// +// This pass attempts to detect situations where the register allocation may +// adversely affect this load balancing and to change the registers used so as +// to better utilize the CPU. +// +// Ideally we'd just take each multiply or multiply-accumulate in turn and +// allocate it alternating even or odd registers. However, multiply-accumulates +// are most efficiently performed in the same functional unit as their +// accumulation operand. Therefore this pass tries to find maximal sequences +// ("Chains") of multiply-accumulates linked via their accumulation operand, +// and assign them all the same "color" (oddness/evenness). +// +// This optimization affects S-register and D-register floating point +// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and +// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are +// not affected. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <list> +using namespace llvm; + +#define DEBUG_TYPE "aarch64-a57-fp-load-balancing" + +// Enforce the algorithm to use the scavenged register even when the original +// destination register is the correct color. Used for testing. +static cl::opt<bool> +TransformAll("aarch64-a57-fp-load-balancing-force-all", + cl::desc("Always modify dest registers regardless of color"), + cl::init(false), cl::Hidden); + +// Never use the balance information obtained from chains - return a specific +// color always. Used for testing. +static cl::opt<unsigned> +OverrideBalance("aarch64-a57-fp-load-balancing-override", + cl::desc("Ignore balance information, always return " + "(1: Even, 2: Odd)."), + cl::init(0), cl::Hidden); + +//===----------------------------------------------------------------------===// +// Helper functions + +// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs? +static bool isMul(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AArch64::FMULSrr: + case AArch64::FNMULSrr: + case AArch64::FMULDrr: + case AArch64::FNMULDrr: + return true; + default: + return false; + } +} + +// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs? +static bool isMla(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AArch64::FMSUBSrrr: + case AArch64::FMADDSrrr: + case AArch64::FNMSUBSrrr: + case AArch64::FNMADDSrrr: + case AArch64::FMSUBDrrr: + case AArch64::FMADDDrrr: + case AArch64::FNMSUBDrrr: + case AArch64::FNMADDDrrr: + return true; + default: + return false; + } +} + +//===----------------------------------------------------------------------===// + +namespace { +/// A "color", which is either even or odd. Yes, these aren't really colors +/// but the algorithm is conceptually doing two-color graph coloring. 
+enum class Color { Even, Odd }; +#ifndef NDEBUG +static const char *ColorNames[2] = { "Even", "Odd" }; +#endif + +class Chain; + +class AArch64A57FPLoadBalancing : public MachineFunctionPass { + const AArch64InstrInfo *TII; + MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI; + RegisterClassInfo RCI; + +public: + static char ID; + explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { + return "A57 FP Anti-dependency breaker"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool runOnBasicBlock(MachineBasicBlock &MBB); + bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB, + int &Balance); + bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB); + int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB); + void scanInstruction(MachineInstr *MI, unsigned Idx, + std::map<unsigned, Chain*> &Active, + std::set<std::unique_ptr<Chain>> &AllChains); + void maybeKillChain(MachineOperand &MO, unsigned Idx, + std::map<unsigned, Chain*> &RegChains); + Color getColor(unsigned Register); + Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L); +}; +char AArch64A57FPLoadBalancing::ID = 0; + +/// A Chain is a sequence of instructions that are linked together by +/// an accumulation operand. For example: +/// +/// fmul d0<def>, ? +/// fmla d1<def>, ?, ?, d0<kill> +/// fmla d2<def>, ?, ?, d1<kill> +/// +/// There may be other instructions interleaved in the sequence that +/// do not belong to the chain. These other instructions must not use +/// the "chain" register at any point. +/// +/// We currently only support chains where the "chain" operand is killed +/// at each link in the chain for simplicity. +/// A chain has three important instructions - Start, Last and Kill. +/// * The start instruction is the first instruction in the chain. +/// * Last is the final instruction in the chain. +/// * Kill may or may not be defined. If defined, Kill is the instruction +/// where the outgoing value of the Last instruction is killed. +/// This information is important as if we know the outgoing value is +/// killed with no intervening uses, we can safely change its register. +/// +/// Without a kill instruction, we must assume the outgoing value escapes +/// beyond our model and either must not change its register or must +/// create a fixup FMOV to keep the old register value consistent. +/// +class Chain { +public: + /// The important (marker) instructions. + MachineInstr *StartInst, *LastInst, *KillInst; + /// The index, from the start of the basic block, that each marker + /// appears. These are stored so we can do quick interval tests. + unsigned StartInstIdx, LastInstIdx, KillInstIdx; + /// All instructions in the chain. + std::set<MachineInstr*> Insts; + /// True if KillInst cannot be modified. If this is true, + /// we cannot change LastInst's outgoing register. + /// This will be true for tied values and regmasks. + bool KillIsImmutable; + /// The "color" of LastInst. This will be the preferred chain color, + /// as changing intermediate nodes is easy but changing the last + /// instruction can be more tricky. 
+ Color LastColor; + + Chain(MachineInstr *MI, unsigned Idx, Color C) + : StartInst(MI), LastInst(MI), KillInst(nullptr), + StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0), + LastColor(C) { + Insts.insert(MI); + } + + /// Add a new instruction into the chain. The instruction's dest operand + /// has the given color. + void add(MachineInstr *MI, unsigned Idx, Color C) { + LastInst = MI; + LastInstIdx = Idx; + LastColor = C; + assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && + "Chain: broken invariant. A Chain can only be killed after its last " + "def"); + + Insts.insert(MI); + } + + /// Return true if MI is a member of the chain. + bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; } + + /// Return the number of instructions in the chain. + unsigned size() const { + return Insts.size(); + } + + /// Inform the chain that its last active register (the dest register of + /// LastInst) is killed by MI with no intervening uses or defs. + void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) { + KillInst = MI; + KillInstIdx = Idx; + KillIsImmutable = Immutable; + assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && + "Chain: broken invariant. A Chain can only be killed after its last " + "def"); + } + + /// Return the first instruction in the chain. + MachineInstr *getStart() const { return StartInst; } + /// Return the last instruction in the chain. + MachineInstr *getLast() const { return LastInst; } + /// Return the "kill" instruction (as set with setKill()) or NULL. + MachineInstr *getKill() const { return KillInst; } + /// Return an instruction that can be used as an iterator for the end + /// of the chain. This is the maximum of KillInst (if set) and LastInst. + MachineBasicBlock::iterator getEnd() const { + return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst); + } + + /// Can the Kill instruction (assuming one exists) be modified? + bool isKillImmutable() const { return KillIsImmutable; } + + /// Return the preferred color of this chain. + Color getPreferredColor() { + if (OverrideBalance != 0) + return OverrideBalance == 1 ? Color::Even : Color::Odd; + return LastColor; + } + + /// Return true if this chain (StartInst..KillInst) overlaps with Other. + bool rangeOverlapsWith(const Chain &Other) const { + unsigned End = KillInst ? KillInstIdx : LastInstIdx; + unsigned OtherEnd = Other.KillInst ? + Other.KillInstIdx : Other.LastInstIdx; + + return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End; + } + + /// Return true if this chain starts before Other. + bool startsBefore(Chain *Other) { + return StartInstIdx < Other->StartInstIdx; + } + + /// Return true if the group will require a fixup MOV at the end. + bool requiresFixup() const { + return (getKill() && isKillImmutable()) || !getKill(); + } + + /// Return a simple string representation of the chain. 
+ std::string str() const { + std::string S; + raw_string_ostream OS(S); + + OS << "{"; + StartInst->print(OS, NULL, true); + OS << " -> "; + LastInst->print(OS, NULL, true); + if (KillInst) { + OS << " (kill @ "; + KillInst->print(OS, NULL, true); + OS << ")"; + } + OS << "}"; + + return OS.str(); + } + +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// + +bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { + bool Changed = false; + DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n"); + + const TargetMachine &TM = F.getTarget(); + MRI = &F.getRegInfo(); + TRI = F.getRegInfo().getTargetRegisterInfo(); + TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo(); + RCI.runOnMachineFunction(F); + + for (auto &MBB : F) { + Changed |= runOnBasicBlock(MBB); + } + + return Changed; +} + +bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n"); + + // First, scan the basic block producing a set of chains. + + // The currently "active" chains - chains that can be added to and haven't + // been killed yet. This is keyed by register - all chains can only have one + // "link" register between each inst in the chain. + std::map<unsigned, Chain*> ActiveChains; + std::set<std::unique_ptr<Chain>> AllChains; + unsigned Idx = 0; + for (auto &MI : MBB) + scanInstruction(&MI, Idx++, ActiveChains, AllChains); + + DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n"); + + // Group the chains into disjoint sets based on their liveness range. This is + // a poor-man's version of graph coloring. Ideally we'd create an interference + // graph and perform full-on graph coloring on that, but; + // (a) That's rather heavyweight for only two colors. + // (b) We expect multiple disjoint interference regions - in practice the live + // range of chains is quite small and they are clustered between loads + // and stores. + EquivalenceClasses<Chain*> EC; + for (auto &I : AllChains) + EC.insert(I.get()); + + for (auto &I : AllChains) + for (auto &J : AllChains) + if (I != J && I->rangeOverlapsWith(*J)) + EC.unionSets(I.get(), J.get()); + DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n"); + + // Now we assume that every member of an equivalence class interferes + // with every other member of that class, and with no members of other classes. + + // Convert the EquivalenceClasses to a simpler set of sets. + std::vector<std::vector<Chain*> > V; + for (auto I = EC.begin(), E = EC.end(); I != E; ++I) { + std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end()); + if (Cs.empty()) continue; + V.push_back(std::move(Cs)); + } + + // Now we have a set of sets, order them by start address so + // we can iterate over them sequentially. + std::sort(V.begin(), V.end(), + [](const std::vector<Chain*> &A, + const std::vector<Chain*> &B) { + return A.front()->startsBefore(B.front()); + }); + + // As we only have two colors, we can track the global (BB-level) balance of + // odds versus evens. We aim to keep this near zero to keep both execution + // units fed. + // Positive means we're even-heavy, negative we're odd-heavy. + // + // FIXME: If chains have interdependencies, for example: + // mul r0, r1, r2 + // mul r3, r0, r1 + // We do not model this and may color each one differently, assuming we'll + // get ILP when we obviously can't. 
This hasn't been seen to be a problem + // in practice so far, so we simplify the algorithm by ignoring it. + int Parity = 0; + + for (auto &I : V) + Changed |= colorChainSet(std::move(I), MBB, Parity); + + return Changed; +} + +Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor, + std::vector<Chain*> &L) { + if (L.empty()) + return nullptr; + + // We try and get the best candidate from L to color next, given that our + // preferred color is "PreferredColor". L is ordered from larger to smaller + // chains. It is beneficial to color the large chains before the small chains, + // but if we can't find a chain of the maximum length with the preferred color, + // we fuzz the size and look for slightly smaller chains before giving up and + // returning a chain that must be recolored. + + // FIXME: Does this need to be configurable? + const unsigned SizeFuzz = 1; + unsigned MinSize = L.front()->size() - SizeFuzz; + for (auto I = L.begin(), E = L.end(); I != E; ++I) { + if ((*I)->size() <= MinSize) { + // We've gone past the size limit. Return the previous item. + Chain *Ch = *--I; + L.erase(I); + return Ch; + } + + if ((*I)->getPreferredColor() == PreferredColor) { + Chain *Ch = *I; + L.erase(I); + return Ch; + } + } + + // Bailout case - just return the first item. + Chain *Ch = L.front(); + L.erase(L.begin()); + return Ch; +} + +bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, + MachineBasicBlock &MBB, + int &Parity) { + bool Changed = false; + DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n"); + + // Sort by descending size order so that we allocate the most important + // sets first. + // Tie-break equivalent sizes by sorting chains requiring fixups before + // those without fixups. The logic here is that we should look at the + // chains that we cannot change before we look at those we can, + // so the parity counter is updated and we know what color we should + // change them to! + std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) { + if (G1->size() != G2->size()) + return G1->size() > G2->size(); + return G1->requiresFixup() > G2->requiresFixup(); + }); + + Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd; + while (Chain *G = getAndEraseNext(PreferredColor, GV)) { + // Start off by assuming we'll color to our own preferred color. + Color C = PreferredColor; + if (Parity == 0) + // But if we really don't care, use the chain's preferred color. + C = G->getPreferredColor(); + + DEBUG(dbgs() << " - Parity=" << Parity << ", Color=" + << ColorNames[(int)C] << "\n"); + + // If we'll need a fixup FMOV, don't bother. Testing has shown that this + // happens infrequently and when it does it has at least a 50% chance of + // slowing code down instead of speeding it up. + if (G->requiresFixup() && C != G->getPreferredColor()) { + C = G->getPreferredColor(); + DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; " + "color remains " << ColorNames[(int)C] << "\n"); + } + + Changed |= colorChain(G, C, MBB); + + Parity += (C == Color::Even) ? G->size() : -G->size(); + PreferredColor = Parity < 0 ? Color::Even : Color::Odd; + } + + return Changed; +} + +int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, + MachineBasicBlock &MBB) { + RegScavenger RS; + RS.enterBasicBlock(&MBB); + RS.forward(MachineBasicBlock::iterator(G->getStart())); + + // Can we find an appropriate register that is available throughout the life + // of the chain? 
+ unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; + BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); + for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); + I != E; ++I) { + RS.forward(I); + AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID)); + + // Remove any registers clobbered by a regmask or any def register that is + // immediately dead. + for (auto J : I->operands()) { + if (J.isRegMask()) + AvailableRegs.clearBitsNotInMask(J.getRegMask()); + + if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) { + assert(J.isDead() && "Non-dead def should have been removed by now!"); + AvailableRegs.reset(J.getReg()); + } + } + } + + // Make sure we allocate in-order, to get the cheapest registers first. + auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID)); + for (auto Reg : Ord) { + if (!AvailableRegs[Reg]) + continue; + if ((C == Color::Even && (Reg % 2) == 0) || + (C == Color::Odd && (Reg % 2) == 1)) + return Reg; + } + + return -1; +} + +bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, + MachineBasicBlock &MBB) { + bool Changed = false; + DEBUG(dbgs() << " - colorChain(" << G->str() << ", " + << ColorNames[(int)C] << ")\n"); + + // Try and obtain a free register of the right class. Without a register + // to play with we cannot continue. + int Reg = scavengeRegister(G, C, MBB); + if (Reg == -1) { + DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n"); + return false; + } + DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n"); + + std::map<unsigned, unsigned> Substs; + for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); + I != E; ++I) { + if (!G->contains(I) && + (&*I != G->getKill() || G->isKillImmutable())) + continue; + + // I is a member of G, or I is a mutable instruction that kills G. + + std::vector<unsigned> ToErase; + for (auto &U : I->operands()) { + if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) { + unsigned OrigReg = U.getReg(); + U.setReg(Substs[OrigReg]); + if (U.isKill()) + // Don't erase straight away, because there may be other operands + // that also reference this substitution! + ToErase.push_back(OrigReg); + } else if (U.isRegMask()) { + for (auto J : Substs) { + if (U.clobbersPhysReg(J.first)) + ToErase.push_back(J.first); + } + } + } + // Now it's safe to remove the substs identified earlier. + for (auto J : ToErase) + Substs.erase(J); + + // Only change the def if this isn't the last instruction. + if (&*I != G->getKill()) { + MachineOperand &MO = I->getOperand(0); + + bool Change = TransformAll || getColor(MO.getReg()) != C; + if (G->requiresFixup() && &*I == G->getLast()) + Change = false; + + if (Change) { + Substs[MO.getReg()] = Reg; + MO.setReg(Reg); + MRI->setPhysRegUsed(Reg); + + Changed = true; + } + } + } + assert(Substs.size() == 0 && "No substitutions should be left active!"); + + if (G->getKill()) { + DEBUG(dbgs() << " - Kill instruction seen.\n"); + } else { + // We didn't have a kill instruction, but we didn't seem to need to change + // the destination register anyway. + DEBUG(dbgs() << " - Destination register not changed.\n"); + } + return Changed; +} + +void AArch64A57FPLoadBalancing:: +scanInstruction(MachineInstr *MI, unsigned Idx, + std::map<unsigned, Chain*> &ActiveChains, + std::set<std::unique_ptr<Chain>> &AllChains) { + // Inspect "MI", updating ActiveChains and AllChains. 
+ + if (isMul(MI)) { + + for (auto &I : MI->uses()) + maybeKillChain(I, Idx, ActiveChains); + for (auto &I : MI->defs()) + maybeKillChain(I, Idx, ActiveChains); + + // Create a new chain. Multiplies don't require forwarding so can go on any + // unit. + unsigned DestReg = MI->getOperand(0).getReg(); + + DEBUG(dbgs() << "New chain started for register " + << TRI->getName(DestReg) << " at " << *MI); + + auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); + ActiveChains[DestReg] = G.get(); + AllChains.insert(std::move(G)); + + } else if (isMla(MI)) { + + // It is beneficial to keep MLAs on the same functional unit as their + // accumulator operand. + unsigned DestReg = MI->getOperand(0).getReg(); + unsigned AccumReg = MI->getOperand(3).getReg(); + + maybeKillChain(MI->getOperand(1), Idx, ActiveChains); + maybeKillChain(MI->getOperand(2), Idx, ActiveChains); + if (DestReg != AccumReg) + maybeKillChain(MI->getOperand(0), Idx, ActiveChains); + + if (ActiveChains.find(AccumReg) != ActiveChains.end()) { + DEBUG(dbgs() << "Chain found for accumulator register " + << TRI->getName(AccumReg) << " in MI " << *MI); + + // For simplicity we only chain together sequences of MULs/MLAs where the + // accumulator register is killed on each instruction. This means we don't + // need to track other uses of the registers we want to rewrite. + // + // FIXME: We could extend to handle the non-kill cases for more coverage. + if (MI->getOperand(3).isKill()) { + // Add to chain. + DEBUG(dbgs() << "Instruction was successfully added to chain.\n"); + ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg)); + // Handle cases where the destination is not the same as the accumulator. + if (DestReg != AccumReg) { + ActiveChains[DestReg] = ActiveChains[AccumReg]; + ActiveChains.erase(AccumReg); + } + return; + } + + DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't " + << "marked <kill>!\n"); + maybeKillChain(MI->getOperand(3), Idx, ActiveChains); + } + + DEBUG(dbgs() << "Creating new chain for dest register " + << TRI->getName(DestReg) << "\n"); + auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); + ActiveChains[DestReg] = G.get(); + AllChains.insert(std::move(G)); + + } else { + + // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs + // lists. + for (auto &I : MI->uses()) + maybeKillChain(I, Idx, ActiveChains); + for (auto &I : MI->defs()) + maybeKillChain(I, Idx, ActiveChains); + + } +} + +void AArch64A57FPLoadBalancing:: +maybeKillChain(MachineOperand &MO, unsigned Idx, + std::map<unsigned, Chain*> &ActiveChains) { + // Given an operand and the set of active chains (keyed by register), + // determine if a chain should be ended and remove from ActiveChains. + MachineInstr *MI = MO.getParent(); + + if (MO.isReg()) { + + // If this is a KILL of a current chain, record it. 
+ if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) { + DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg()) + << "\n"); + ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied()); + } + ActiveChains.erase(MO.getReg()); + + } else if (MO.isRegMask()) { + + for (auto I = ActiveChains.begin(), E = ActiveChains.end(); + I != E;) { + if (MO.clobbersPhysReg(I->first)) { + DEBUG(dbgs() << "Kill (regmask) seen for chain " + << TRI->getName(I->first) << "\n"); + I->second->setKill(MI, Idx, /*Immutable=*/true); + ActiveChains.erase(I++); + } else + ++I; + } + + } +} + +Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) { + if ((TRI->getEncodingValue(Reg) % 2) == 0) + return Color::Even; + else + return Color::Odd; +} + +// Factory function used by AArch64TargetMachine to add the pass to the passmanager. +FunctionPass *llvm::createAArch64A57FPLoadBalancing() { + return new AArch64A57FPLoadBalancing(); +} diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index ab2c4b7..287989f 100644 --- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -19,7 +19,7 @@ // a = add nsw i64 f, 3 // e = getelementptr ..., i64 a // -// This is legal to do so if the computations are markers with either nsw or nuw +// This is legal to do if the computations are marked with either nsw or nuw // markers. // Moreover, the current heuristic is simple: it does not create new sext // operations, i.e., it gives up when a sext would have forked (e.g., if @@ -223,7 +223,7 @@ AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { } // Input: -// - SExtInsts contains all the sext instructions that are use direclty in +// - SExtInsts contains all the sext instructions that are used directly in // GetElementPtrInst, i.e., access to memory. // Algorithm: // - For each sext operation in SExtInsts: @@ -353,7 +353,7 @@ AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { // If the use is already of the right type, connect its uses to its argument // and delete it. - // This can happen for an Instruction which all uses are sign extended. + // This can happen for an Instruction all uses of which are sign extended. if (!ToRemove.count(SExt) && SExt->getType() == SExt->getOperand(0)->getType()) { DEBUG(dbgs() << "Sign extension is useless, attach its use to " diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 734fb21..5afe0f4 100644 --- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -36,9 +36,10 @@ #include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -166,6 +167,12 @@ static int getTransformOpcode(unsigned Opc) { return AArch64::ADDv1i64; case AArch64::SUBXrr: return AArch64::SUBv1i64; + case AArch64::ANDXrr: + return AArch64::ANDv8i8; + case AArch64::EORXrr: + return AArch64::EORv8i8; + case AArch64::ORRXrr: + return AArch64::ORRv8i8; } // No AdvSIMD equivalent, so just return the original opcode. 
return Opc; @@ -371,7 +378,8 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { const TargetMachine &TM = mf.getTarget(); MRI = &mf.getRegInfo(); - TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo()); + TII = static_cast<const AArch64InstrInfo *>( + TM.getSubtargetImpl()->getInstrInfo()); // Just check things on a one-block-at-a-time basis. for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index cd94e24..08ee687 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64MachineFunctionInfo.h" #include "AArch64MCInstLower.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "InstPrinter/AArch64InstPrinter.h" @@ -23,8 +23,8 @@ #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -54,7 +54,7 @@ public: AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : AsmPrinter(TM, Streamer), Subtarget(&TM.getSubtarget<AArch64Subtarget>()), - MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr), + MCInstLowering(OutContext, *this), SM(*this), AArch64FI(nullptr), LOHLabelCounter(0) {} const char *getPassName() const override { @@ -145,7 +145,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getDataLayout(); + const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout(); for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer.EmitLabel(Stubs[i].first); @@ -252,8 +252,8 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, bool isVector, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); - const AArch64RegisterInfo *RI = - static_cast<const AArch64RegisterInfo *>(TM.getRegisterInfo()); + const AArch64RegisterInfo *RI = static_cast<const AArch64RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); unsigned Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); @@ -381,8 +381,23 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, unsigned NumNOPBytes = MI.getOperand(1).getImm(); SM.recordStackMap(MI); - // Emit padding. assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + + // Scan ahead to trim the shadow. + const MachineBasicBlock &MBB = *MI.getParent(); + MachineBasicBlock::const_iterator MII(MI); + ++MII; + while (NumNOPBytes > 0) { + if (MII == MBB.end() || MII->isCall() || + MII->getOpcode() == AArch64::DBG_VALUE || + MII->getOpcode() == TargetOpcode::PATCHPOINT || + MII->getOpcode() == TargetOpcode::STACKMAP) + break; + ++MII; + NumNOPBytes -= 4; + } + + // Emit nops. 
for (unsigned i = 0; i < NumNOPBytes; i += 4) EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } @@ -518,7 +533,5 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { extern "C" void LLVMInitializeAArch64AsmPrinter() { RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget); RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget); - - RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64leTarget); - RegisterAsmPrinter<AArch64AsmPrinter> W(TheARM64beTarget); + RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64Target); } diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp index 484e7e8..e2b6367 100644 --- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -12,15 +12,16 @@ #include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" using namespace llvm; #define DEBUG_TYPE "aarch64-branch-relax" @@ -136,7 +137,7 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) { if (NextBB == MBB->getParent()->end()) return false; - for (MachineBasicBlock *S : MBB->successors()) + for (MachineBasicBlock *S : MBB->successors()) if (S == NextBB) return true; @@ -475,7 +476,9 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n"); - TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo(); + TII = (const AArch64InstrInfo *)MF->getTarget() + .getSubtargetImpl() + ->getInstrInfo(); // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h new file mode 100644 index 0000000..baf80bc --- /dev/null +++ b/lib/Target/AArch64/AArch64CallingConvention.h @@ -0,0 +1,141 @@ +//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the AArch64 Calling Convention +// that aren't done by tablegen. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace { +using namespace llvm; + +static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; + +static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, + MVT LocVT, ISD::ArgFlagsTy &ArgFlags, + CCState &State, unsigned SlotAlign) { + unsigned Size = LocVT.getSizeInBits() / 8; + unsigned StackAlign = State.getMachineFunction() + .getSubtarget() + .getDataLayout() + ->getStackAlignment(); + unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); + + for (auto &It : PendingMembers) { + It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign))); + State.addLoc(It); + SlotAlign = 1; + } + + // All pending members have now been allocated + PendingMembers.clear(); + return true; +} + +/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An +/// [N x Ty] type must still be contiguous in memory though. +static bool CC_AArch64_Custom_Stack_Block( + unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // Add the argument to the list to be allocated once we know the size of the + // block. + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8); +} + +/// Given an [N x Ty] block, it should be passed in a consecutive sequence of +/// registers. If no such sequence is available, mark the rest of the registers +/// of that type as used and place the argument on the stack. +static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + // Try to allocate a contiguous block of registers, each of the correct + // size to hold one member. + ArrayRef<uint16_t> RegList; + if (LocVT.SimpleTy == MVT::i64) + RegList = XRegList; + else if (LocVT.SimpleTy == MVT::f16) + RegList = HRegList; + else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector()) + RegList = SRegList; + else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector()) + RegList = DRegList; + else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector()) + RegList = QRegList; + else { + // Not an array we want to split up after all. 
+ return false; + } + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // Add the argument to the list to be allocated once we know the size of the + // block. + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); + if (RegResult) { + for (auto &It : PendingMembers) { + It.convertToReg(RegResult); + State.addLoc(It); + ++RegResult; + } + PendingMembers.clear(); + return true; + } + + // Mark all regs in the class as unavailable + for (auto Reg : RegList) + State.AllocateReg(Reg); + + const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>( + State.getMachineFunction().getSubtarget()); + unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; + + return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); +} + +} + +#endif diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 1fe5138..1a80402 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> : CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; /// CCIfBigEndian - Match only if we're in big endian mode. class CCIfBigEndian<CCAction A> : - CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>; + CCIf<"State.getMachineFunction().getSubtarget().getDataLayout()->isBigEndian()", A>; //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention @@ -40,6 +40,8 @@ def CC_AArch64_AAPCS : CallingConv<[ // slot is 64-bit. CCIfByVal<CCPassByVal<8, 8>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -60,18 +62,18 @@ def CC_AArch64_AAPCS : CallingConv<[ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. 
CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>, CCIfType<[i32, f32], CCAssignToStack<8, 8>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], CCAssignToStack<8, 8>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], CCAssignToStack<16, 16>> ]>; @@ -96,10 +98,10 @@ def RetCC_AArch64_AAPCS : CallingConv<[ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> ]>; @@ -119,6 +121,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[ // slot is 64-bit. CCIfByVal<CCPassByVal<8, 8>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -139,25 +143,28 @@ def CC_AArch64_DarwinPCS : CallingConv<[ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToStack<16, 16>> ]>; def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType<v2i32>>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>, + // Handle all scalar types as either i64 or f64. CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, CCIfType<[f16, f32], CCPromoteToType<f64>>, @@ -165,8 +172,10 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // Everything is on the stack. // i128 is split to two i64s, and its stack alignment is 16 bytes. 
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToStack<16, 16>> ]>; // The WebKit_JS calling convention only passes the first argument (the callee) diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 4d23dc5..aab8e38 100644 --- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -94,7 +94,7 @@ struct LDTLSCleanup : public MachineFunctionPass { MachineFunction *MF = I->getParent()->getParent(); const AArch64TargetMachine *TM = static_cast<const AArch64TargetMachine *>(&MF->getTarget()); - const AArch64InstrInfo *TII = TM->getInstrInfo(); + const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the // code sequence assumes the address will be. @@ -114,7 +114,7 @@ struct LDTLSCleanup : public MachineFunctionPass { MachineFunction *MF = I->getParent()->getParent(); const AArch64TargetMachine *TM = static_cast<const AArch64TargetMachine *>(&MF->getTarget()); - const AArch64InstrInfo *TII = TM->getInstrInfo(); + const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); // Create a virtual register for the TLS base address. MachineRegisterInfo &RegInfo = MF->getRegInfo(); diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 6b1f096..87b545b 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -101,25 +101,26 @@ #include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "aarch64-collect-loh" @@ -194,12 +195,14 @@ typedef SetVector<const MachineInstr *> SetOfMachineInstr; /// Map a basic block to a set of instructions per register. /// This is used to represent the exposed uses of a basic block /// per register. -typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *> +typedef MapVector<const MachineBasicBlock *, + std::unique_ptr<SetOfMachineInstr[]>> BlockToSetOfInstrsPerColor; /// Map a basic block to an instruction per register. /// This is used to represent the live-out definitions of a basic block /// per register. 
-typedef MapVector<const MachineBasicBlock *, const MachineInstr **> +typedef MapVector<const MachineBasicBlock *, + std::unique_ptr<const MachineInstr *[]>> BlockToInstrPerColor; /// Map an instruction to a set of instructions. Used to represent the /// mapping def to reachable uses or use to definitions. @@ -236,9 +239,9 @@ static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, SetOfMachineInstr *result; BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); if (it != sets.end()) - result = it->second; + result = it->second.get(); else - result = sets[&MBB] = new SetOfMachineInstr[nbRegs]; + result = (sets[&MBB] = make_unique<SetOfMachineInstr[]>(nbRegs)).get(); return result[reg]; } @@ -283,14 +286,14 @@ static void initReachingDef(MachineFunction &MF, const MapRegToId &RegToId, const MachineInstr *DummyOp, bool ADRPMode) { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo(); unsigned NbReg = RegToId.size(); for (MachineBasicBlock &MBB : MF) { - const MachineInstr **&BBGen = Gen[&MBB]; - BBGen = new const MachineInstr *[NbReg]; - memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg); + auto &BBGen = Gen[&MBB]; + BBGen = make_unique<const MachineInstr *[]>(NbReg); + std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr); BitVector &BBKillSet = Kill[&MBB]; BBKillSet.resize(NbReg); @@ -421,22 +424,6 @@ static void reachingDefAlgorithm(MachineFunction &MF, } while (HasChanged); } -/// Release all memory dynamically allocated during the reaching -/// definition algorithm. -static void finitReachingDef(BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, - BlockToSetOfInstrsPerColor &ReachableUses) { - for (auto &IT : Out) - delete[] IT.second; - for (auto &IT : In) - delete[] IT.second; - for (auto &IT : ReachableUses) - delete[] IT.second; - for (auto &IT : Gen) - delete[] IT.second; -} - /// Reaching definition algorithm. /// \param MF function on which the algorithm will operate. /// \param[out] ColorOpToReachedUses will contain the result of the reaching @@ -473,9 +460,6 @@ static void reachingDef(MachineFunction &MF, if (!DummyOp) reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, ReachableUses, RegToId.size()); - - // finit. - finitReachingDef(In, Out, Gen, ReachableUses); } #ifndef NDEBUG @@ -1043,7 +1027,7 @@ static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo(); const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); MapRegToId RegToId; @@ -1059,8 +1043,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { MachineInstr *DummyOp = nullptr; if (BasicBlockScopeOnly) { - const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(TM.getInstrInfo()); + const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>( + TM.getSubtargetImpl()->getInstrInfo()); // For local analysis, create a dummy operation to record uses that are not // local. 
DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp new file mode 100644 index 0000000..0fbd3c6 --- /dev/null +++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -0,0 +1,422 @@ +//=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to make consecutive compares of values use same operands to +// allow CSE pass to remove duplicated instructions. For this it analyzes +// branches and adjusts comparisons with immediate values by converting: +// * GE -> GT +// * GT -> GE +// * LT -> LE +// * LE -> LT +// and adjusting immediate values appropriately. It basically corrects two +// immediate values towards each other to make them equal. +// +// Consider the following example in C: +// +// if ((a < 5 && ...) || (a > 5 && ...)) { +// ~~~~~ ~~~~~ +// ^ ^ +// x y +// +// Here both "x" and "y" expressions compare "a" with "5". When "x" evaluates +// to "false", "y" can just check flags set by the first comparison. As a +// result of the canonicalization employed by +// SelectionDAGBuilder::visitSwitchCase, DAGCombine, and other target-specific +// code, assembly ends up in the form that is not CSE friendly: +// +// ... +// cmp w8, #4 +// b.gt .LBB0_3 +// ... +// .LBB0_3: +// cmp w8, #6 +// b.lt .LBB0_6 +// ... +// +// Same assembly after the pass: +// +// ... +// cmp w8, #5 +// b.ge .LBB0_3 +// ... +// .LBB0_3: +// cmp w8, #5 // <-- CSE pass removes this instruction +// b.le .LBB0_6 +// ... +// +// Currently only SUBS and ADDS followed by b.?? are supported. +// +// TODO: maybe handle TBNZ/TBZ the same way as CMP when used instead for "a < 0" +// TODO: handle other conditional instructions (e.g. CSET) +// TODO: allow second branching to be anything if it doesn't require adjusting +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <cstdlib> +#include <tuple> + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-condopt" + +STATISTIC(NumConditionsAdjusted, "Number of conditions adjusted"); + +namespace { +class AArch64ConditionOptimizer : public MachineFunctionPass { + const TargetInstrInfo *TII; + MachineDominatorTree *DomTree; + +public: + // Stores immediate, compare instruction opcode and branch condition (in this + // order) of adjusted comparison. 
+ typedef std::tuple<int, int, AArch64CC::CondCode> CmpInfo; + + static char ID; + AArch64ConditionOptimizer() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + MachineInstr *findSuitableCompare(MachineBasicBlock *MBB); + CmpInfo adjustCmp(MachineInstr *CmpMI, AArch64CC::CondCode Cmp); + void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info); + bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To, + int ToImm); + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { + return "AArch64 Condition Optimizer"; + } +}; +} // end anonymous namespace + +char AArch64ConditionOptimizer::ID = 0; + +namespace llvm { +void initializeAArch64ConditionOptimizerPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt", + "AArch64 CondOpt Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt", + "AArch64 CondOpt Pass", false, false) + +FunctionPass *llvm::createAArch64ConditionOptimizerPass() { + return new AArch64ConditionOptimizer(); +} + +void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +// Finds compare instruction that corresponds to supported types of branching. +// Returns the instruction or nullptr on failures or detecting unsupported +// instructions. +MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( + MachineBasicBlock *MBB) { + MachineBasicBlock::iterator I = MBB->getFirstTerminator(); + if (I == MBB->end()) + return nullptr; + + if (I->getOpcode() != AArch64::Bcc) + return nullptr; + + // Now find the instruction controlling the terminator. + for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { + --I; + assert(!I->isTerminator() && "Spurious terminator"); + switch (I->getOpcode()) { + // cmp is an alias for subs with a dead destination register. + case AArch64::SUBSWri: + case AArch64::SUBSXri: + // cmn is an alias for adds with a dead destination register. + case AArch64::ADDSWri: + case AArch64::ADDSXri: + if (I->getOperand(0).isDead()) + return I; + + DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); + return nullptr; + + // Prevent false positive case like: + // cmp w19, #0 + // cinc w0, w19, gt + // ... + // fcmp d8, #0.0 + // b.gt .LBB0_5 + case AArch64::FCMPDri: + case AArch64::FCMPSri: + case AArch64::FCMPESri: + case AArch64::FCMPEDri: + + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + case AArch64::ADDSWrr: + case AArch64::ADDSXrr: + case AArch64::FCMPSrr: + case AArch64::FCMPDrr: + case AArch64::FCMPESrr: + case AArch64::FCMPEDrr: + // Skip comparison instructions without immediate operands. + return nullptr; + } + } + DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); + return nullptr; +} + +// Changes opcode adds <-> subs considering register operand width. 
+static int getComplementOpc(int Opc) { + switch (Opc) { + case AArch64::ADDSWri: return AArch64::SUBSWri; + case AArch64::ADDSXri: return AArch64::SUBSXri; + case AArch64::SUBSWri: return AArch64::ADDSWri; + case AArch64::SUBSXri: return AArch64::ADDSXri; + default: + llvm_unreachable("Unexpected opcode"); + } +} + +// Changes form of comparison inclusive <-> exclusive. +static AArch64CC::CondCode getAdjustedCmp(AArch64CC::CondCode Cmp) { + switch (Cmp) { + case AArch64CC::GT: return AArch64CC::GE; + case AArch64CC::GE: return AArch64CC::GT; + case AArch64CC::LT: return AArch64CC::LE; + case AArch64CC::LE: return AArch64CC::LT; + default: + llvm_unreachable("Unexpected condition code"); + } +} + +// Transforms GT -> GE, GE -> GT, LT -> LE, LE -> LT by updating comparison +// operator and condition code. +AArch64ConditionOptimizer::CmpInfo AArch64ConditionOptimizer::adjustCmp( + MachineInstr *CmpMI, AArch64CC::CondCode Cmp) { + int Opc = CmpMI->getOpcode(); + + // CMN (compare with negative immediate) is an alias to ADDS (as + // "operand - negative" == "operand + positive") + bool Negative = (Opc == AArch64::ADDSWri || Opc == AArch64::ADDSXri); + + int Correction = (Cmp == AArch64CC::GT) ? 1 : -1; + // Negate Correction value for comparison with negative immediate (CMN). + if (Negative) { + Correction = -Correction; + } + + const int OldImm = (int)CmpMI->getOperand(2).getImm(); + const int NewImm = std::abs(OldImm + Correction); + + // Handle +0 -> -1 and -0 -> +1 (CMN with 0 immediate) transitions by + // adjusting compare instruction opcode. + if (OldImm == 0 && ((Negative && Correction == 1) || + (!Negative && Correction == -1))) { + Opc = getComplementOpc(Opc); + } + + return CmpInfo(NewImm, Opc, getAdjustedCmp(Cmp)); +} + +// Applies changes to comparison instruction suggested by adjustCmp(). +void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, + const CmpInfo &Info) { + int Imm; + int Opc; + AArch64CC::CondCode Cmp; + std::tie(Imm, Opc, Cmp) = Info; + + MachineBasicBlock *const MBB = CmpMI->getParent(); + + // Change immediate in comparison instruction (ADDS or SUBS). + BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc)) + .addOperand(CmpMI->getOperand(0)) + .addOperand(CmpMI->getOperand(1)) + .addImm(Imm) + .addOperand(CmpMI->getOperand(3)); + CmpMI->eraseFromParent(); + + // The fact that this comparison was picked ensures that it's related to the + // first terminator instruction. + MachineInstr *BrMI = MBB->getFirstTerminator(); + + // Change condition in branch instruction. + BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(Cmp) + .addOperand(BrMI->getOperand(1)); + BrMI->eraseFromParent(); + + MBB->updateTerminator(); + + ++NumConditionsAdjusted; +} + +// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// corresponding to TBB. +// Returns true if parsing was successful, otherwise false is returned. +static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) { + // A normal br.cond simply has the condition code. + if (Cond[0].getImm() != -1) { + assert(Cond.size() == 1 && "Unknown Cond array format"); + CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + return true; + } + return false; +} + +// Adjusts one cmp instruction to another one if result of adjustment will allow +// CSE. Returns true if compare instruction was changed, otherwise false is +// returned. 
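// The rewrite performed by adjustCmp()/getAdjustedCmp() above is plain
// arithmetic on a (condition, immediate) pair.  A minimal standalone sketch of
// that arithmetic for the two inputs the pass actually produces (GT and LT on
// a CMP with a non-negative immediate), leaving out the CMN/abs() handling and
// the getComplementOpc() opcode flip:

#include <cassert>
#include <utility>

enum class Cond { GT, GE, LT, LE };

// GT <-> GE and LT <-> LE, as in getAdjustedCmp().
static Cond adjusted(Cond C) {
  switch (C) {
  case Cond::GT: return Cond::GE;
  case Cond::GE: return Cond::GT;
  case Cond::LT: return Cond::LE;
  case Cond::LE: return Cond::LT;
  }
  return C; // not reached
}

// "a > N" becomes "a >= N+1"; "a < N" becomes "a <= N-1".
static std::pair<Cond, int> adjustCmp(Cond C, int Imm) {
  int Correction = (C == Cond::GT) ? 1 : -1;
  return {adjusted(C), Imm + Correction};
}

int main() {
  auto A = adjustCmp(Cond::GT, 4); // cmp #4; b.gt  ==>  cmp #5; b.ge
  auto B = adjustCmp(Cond::LT, 6); // cmp #6; b.lt  ==>  cmp #5; b.le
  assert(A.second == B.second && "both compares now share the immediate #5");
  (void)A; (void)B;
  return 0;
}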
+bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI, + AArch64CC::CondCode Cmp, MachineInstr *To, int ToImm) +{ + CmpInfo Info = adjustCmp(CmpMI, Cmp); + if (std::get<0>(Info) == ToImm && std::get<1>(Info) == To->getOpcode()) { + modifyCmp(CmpMI, Info); + return true; + } + return false; +} + +bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" + << "********** Function: " << MF.getName() << '\n'); + TII = MF.getTarget().getSubtargetImpl()->getInstrInfo(); + DomTree = &getAnalysis<MachineDominatorTree>(); + + bool Changed = false; + + // Visit blocks in dominator tree pre-order. The pre-order enables multiple + // cmp-conversions from the same head block. + // Note that updateDomTree() modifies the children of the DomTree node + // currently being visited. The df_iterator supports that; it doesn't look at + // child_begin() / child_end() until after a node has been visited. + for (MachineDomTreeNode *I : depth_first(DomTree)) { + MachineBasicBlock *HBB = I->getBlock(); + + SmallVector<MachineOperand, 4> HeadCond; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) { + continue; + } + + // Equivalence check is to skip loops. + if (!TBB || TBB == HBB) { + continue; + } + + SmallVector<MachineOperand, 4> TrueCond; + MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr; + if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { + continue; + } + + MachineInstr *HeadCmpMI = findSuitableCompare(HBB); + if (!HeadCmpMI) { + continue; + } + + MachineInstr *TrueCmpMI = findSuitableCompare(TBB); + if (!TrueCmpMI) { + continue; + } + + AArch64CC::CondCode HeadCmp; + if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) { + continue; + } + + AArch64CC::CondCode TrueCmp; + if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) { + continue; + } + + const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm(); + const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm(); + + DEBUG(dbgs() << "Head branch:\n"); + DEBUG(dbgs() << "\tcondition: " + << AArch64CC::getCondCodeName(HeadCmp) << '\n'); + DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n'); + + DEBUG(dbgs() << "True branch:\n"); + DEBUG(dbgs() << "\tcondition: " + << AArch64CC::getCondCodeName(TrueCmp) << '\n'); + DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n'); + + if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) || + (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) && + std::abs(TrueImm - HeadImm) == 2) { + // This branch transforms machine instructions that correspond to + // + // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...) + // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...) + // + // into + // + // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...) + // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...) + + CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp); + CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp); + if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) && + std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) { + modifyCmp(HeadCmpMI, HeadCmpInfo); + modifyCmp(TrueCmpMI, TrueCmpInfo); + Changed = true; + } + } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) || + (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) && + std::abs(TrueImm - HeadImm) == 1) { + // This branch transforms machine instructions that correspond to + // + // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...) 
+ // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...) + // + // into + // + // 1) (a <= {NewImm} && ...) || (a > {NewImm} && ...) + // 2) (a < {NewImm} && ...) || (a >= {NewImm} && ...) + + // GT -> GE transformation increases immediate value, so picking the + // smaller one; LT -> LE decreases immediate value so invert the choice. + bool adjustHeadCond = (HeadImm < TrueImm); + if (HeadCmp == AArch64CC::LT) { + adjustHeadCond = !adjustHeadCond; + } + + if (adjustHeadCond) { + Changed |= adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm); + } else { + Changed |= adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm); + } + } + // Other transformation cases almost never occur due to generation of < or > + // comparisons instead of <= and >=. + } + + return Changed; +} diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 452cdec..54f53dc 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -191,8 +191,8 @@ public: /// runOnMachineFunction - Initialize per-function data structures. void runOnMachineFunction(MachineFunction &MF) { this->MF = &MF; - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); } @@ -723,7 +723,7 @@ namespace { class AArch64ConditionalCompares : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; - const MCSchedModel *SchedModel; + MCSchedModel SchedModel; // Does the proceeded function has Oz attribute. bool MinSize; MachineRegisterInfo *MRI; @@ -845,7 +845,7 @@ bool AArch64ConditionalCompares::shouldConvert() { // the cost of a misprediction. // // Set a limit on the delay we will accept. - unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4; + unsigned DelayLimit = SchedModel.MispredictPenalty * 3 / 4; // Instruction depths can be computed for all trace instructions above CmpBB. unsigned HeadDepth = @@ -891,8 +891,8 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); + TII = MF.getSubtarget().getInstrInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); SchedModel = MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel(); MRI = &MF.getRegInfo(); diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index a2d853c..74fc167 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -14,11 +14,12 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "aarch64-dead-defs" @@ -36,11 +37,11 @@ public: static char ID; // Pass identification, replacement for typeid. 
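// Stepping back to the AArch64ConditionOptimizer driver that ends just above:
// it only fires when the two immediates differ by 2 (opposite comparison
// directions) or by 1 (same direction), and in the latter case it must pick
// which compare to rewrite.  A minimal sketch of that selection step, with a
// hypothetical two-value condition enum in place of AArch64CC::CondCode:

#include <cstdlib>
#include <iostream>

enum class Cond { GT, LT };

// Returns 0 = adjust both compares, 1 = adjust the head compare,
// 2 = adjust the true-successor compare, -1 = nothing profitable.
static int pickAdjustment(Cond Head, int HeadImm, Cond True, int TrueImm) {
  if (Head != True && std::abs(TrueImm - HeadImm) == 2)
    return 0; // e.g. (a < 5 && ...) || (a > 5 && ...): move both onto #5
  if (Head == True && std::abs(TrueImm - HeadImm) == 1) {
    // GT -> GE raises the immediate, so move the smaller one up;
    // for LT -> LE the immediate goes down, so the choice is inverted.
    bool AdjustHead = HeadImm < TrueImm;
    if (Head == Cond::LT)
      AdjustHead = !AdjustHead;
    return AdjustHead ? 1 : 2;
  }
  return -1;
}

int main() {
  std::cout << pickAdjustment(Cond::GT, 4, Cond::LT, 6) << '\n'; // 0
  std::cout << pickAdjustment(Cond::GT, 4, Cond::GT, 5) << '\n'; // 1
  return 0;
}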
explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} - virtual bool runOnMachineFunction(MachineFunction &F) override; + bool runOnMachineFunction(MachineFunction &F) override; const char *getPassName() const override { return "Dead register definitions"; } - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -119,7 +120,7 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( // Scan the function for instructions that have a dead definition of a // register. Replace that register with the zero register when possible. bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { - TRI = MF.getTarget().getRegisterInfo(); + TRI = MF.getSubtarget().getRegisterInfo(); bool Changed = false; DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 8839085..c850680 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -16,6 +16,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/MathExtras.h" @@ -722,7 +723,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { } bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo()); + TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); bool Modified = false; for (auto &MBB : MF) diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 2164d77..419fbc8 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -14,9 +14,11 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64TargetMachine.h" +#include "AArch64CallingConvention.h" #include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -39,8 +41,7 @@ using namespace llvm; namespace { -class AArch64FastISel : public FastISel { - +class AArch64FastISel final : public FastISel { class Address { public: typedef enum { @@ -50,16 +51,23 @@ class AArch64FastISel : public FastISel { private: BaseKind Kind; + AArch64_AM::ShiftExtendType ExtType; union { unsigned Reg; int FI; } Base; + unsigned OffsetReg; + unsigned Shift; int64_t Offset; + const GlobalValue *GV; public: - Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; } + Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend), + OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; } void setKind(BaseKind K) { Kind = K; } BaseKind getKind() const { return Kind; } + void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; } + AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; } bool isRegBase() const { return Kind == RegBase; } bool isFIBase() const { return Kind == FrameIndexBase; } void setReg(unsigned Reg) { @@ -70,6 +78,12 @@ class AArch64FastISel : public 
FastISel { assert(isRegBase() && "Invalid base register access!"); return Base.Reg; } + void setOffsetReg(unsigned Reg) { + OffsetReg = Reg; + } + unsigned getOffsetReg() const { + return OffsetReg; + } void setFI(unsigned FI) { assert(isFIBase() && "Invalid base frame index access!"); Base.FI = FI; @@ -80,8 +94,11 @@ class AArch64FastISel : public FastISel { } void setOffset(int64_t O) { Offset = O; } int64_t getOffset() { return Offset; } + void setShift(unsigned S) { Shift = S; } + unsigned getShift() { return Shift; } - bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); } + void setGlobalValue(const GlobalValue *G) { GV = G; } + const GlobalValue *getGlobalValue() { return GV; } }; /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can @@ -89,74 +106,152 @@ class AArch64FastISel : public FastISel { const AArch64Subtarget *Subtarget; LLVMContext *Context; + bool fastLowerArguments() override; + bool fastLowerCall(CallLoweringInfo &CLI) override; + bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; + private: // Selection routines. - bool SelectLoad(const Instruction *I); - bool SelectStore(const Instruction *I); - bool SelectBranch(const Instruction *I); - bool SelectIndirectBr(const Instruction *I); - bool SelectCmp(const Instruction *I); - bool SelectSelect(const Instruction *I); - bool SelectFPExt(const Instruction *I); - bool SelectFPTrunc(const Instruction *I); - bool SelectFPToInt(const Instruction *I, bool Signed); - bool SelectIntToFP(const Instruction *I, bool Signed); - bool SelectRem(const Instruction *I, unsigned ISDOpcode); - bool SelectCall(const Instruction *I, const char *IntrMemName); - bool SelectIntrinsicCall(const IntrinsicInst &I); - bool SelectRet(const Instruction *I); - bool SelectTrunc(const Instruction *I); - bool SelectIntExt(const Instruction *I); - bool SelectMul(const Instruction *I); + bool selectAddSub(const Instruction *I); + bool selectLogicalOp(const Instruction *I); + bool selectLoad(const Instruction *I); + bool selectStore(const Instruction *I); + bool selectBranch(const Instruction *I); + bool selectIndirectBr(const Instruction *I); + bool selectCmp(const Instruction *I); + bool selectSelect(const Instruction *I); + bool selectFPExt(const Instruction *I); + bool selectFPTrunc(const Instruction *I); + bool selectFPToInt(const Instruction *I, bool Signed); + bool selectIntToFP(const Instruction *I, bool Signed); + bool selectRem(const Instruction *I, unsigned ISDOpcode); + bool selectRet(const Instruction *I); + bool selectTrunc(const Instruction *I); + bool selectIntExt(const Instruction *I); + bool selectMul(const Instruction *I); + bool selectShift(const Instruction *I); + bool selectBitCast(const Instruction *I); + bool selectFRem(const Instruction *I); + bool selectSDiv(const Instruction *I); + bool selectGetElementPtr(const Instruction *I); // Utility helper routines. 
bool isTypeLegal(Type *Ty, MVT &VT); - bool isLoadStoreTypeLegal(Type *Ty, MVT &VT); - bool ComputeAddress(const Value *Obj, Address &Addr); - bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, - bool UseUnscaled); - void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled); - bool IsMemCpySmall(uint64_t Len, unsigned Alignment); - bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, + bool isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed = false); + bool isValueAvailable(const Value *V) const; + bool computeAddress(const Value *Obj, Address &Addr, Type *Ty = nullptr); + bool computeCallAddress(const Value *V, Address &Addr); + bool simplifyAddress(Address &Addr, MVT VT); + void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, + unsigned Flags, unsigned ScaleFactor, + MachineMemOperand *MMO); + bool isMemCpySmall(uint64_t Len, unsigned Alignment); + bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, unsigned Alignment); - // Emit functions. - bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt); - bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled = false); - bool EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled = false); - unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); - unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); + bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I, + const Value *Cond); + bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT); + bool optimizeSelect(const SelectInst *SI); + std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx); + + // Emit helper routines. + unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, + const Value *RHS, bool SetFlags = false, + bool WantResult = true, bool IsZExt = false); + unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, + bool SetFlags = false, bool WantResult = true); + unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, uint64_t Imm, bool SetFlags = false, + bool WantResult = true); + unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, + AArch64_AM::ShiftExtendType ShiftType, + uint64_t ShiftImm, bool SetFlags = false, + bool WantResult = true); + unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, + AArch64_AM::ShiftExtendType ExtType, + uint64_t ShiftImm, bool SetFlags = false, + bool WantResult = true); - unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT); - unsigned AArch64MaterializeGV(const GlobalValue *GV); + // Emit functions. 
+ bool emitCompareAndBranch(const BranchInst *BI); + bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt); + bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt); + bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); + bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS); + unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true, + MachineMemOperand *MMO = nullptr); + bool emitStore(MVT VT, unsigned SrcReg, Address Addr, + MachineMemOperand *MMO = nullptr); + unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, + bool SetFlags = false, bool WantResult = true, + bool IsZExt = false); + unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm); + unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS, + bool SetFlags = false, bool WantResult = true, + bool IsZExt = false); + unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill, + unsigned RHSReg, bool RHSIsKill, bool WantResult = true); + unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill, + unsigned RHSReg, bool RHSIsKill, + AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, + bool WantResult = true); + unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS, + const Value *RHS); + unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, uint64_t Imm); + unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, + uint64_t ShiftImm); + unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); + unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill); + unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, + uint64_t Imm, bool IsZExt = true); + unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill); + unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, + uint64_t Imm, bool IsZExt = true); + unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill); + unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, + uint64_t Imm, bool IsZExt = false); + + unsigned materializeInt(const ConstantInt *CI, MVT VT); + unsigned materializeFP(const ConstantFP *CFP, MVT VT); + unsigned materializeGV(const GlobalValue *GV); // Call handling routines. private: CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; - bool ProcessCallArgs(SmallVectorImpl<Value *> &Args, - SmallVectorImpl<unsigned> &ArgRegs, - SmallVectorImpl<MVT> &ArgVTs, - SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, - SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC, + bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs, unsigned &NumBytes); - bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, - const Instruction *I, CallingConv::ID CC, unsigned &NumBytes); + bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes); public: // Backend specific FastISel code. 
- unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; - unsigned TargetMaterializeConstant(const Constant *C) override; + unsigned fastMaterializeAlloca(const AllocaInst *AI) override; + unsigned fastMaterializeConstant(const Constant *C) override; + unsigned fastMaterializeFloatZero(const ConstantFP* CF) override; - explicit AArch64FastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) - : FastISel(funcInfo, libInfo) { + explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) + : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) { Subtarget = &TM.getSubtarget<AArch64Subtarget>(); - Context = &funcInfo.Fn->getContext(); + Context = &FuncInfo.Fn->getContext(); } - bool TargetSelectInstruction(const Instruction *I) override; + bool fastSelectInstruction(const Instruction *I) override; #include "AArch64GenFastISel.inc" }; @@ -165,13 +260,52 @@ public: #include "AArch64GenCallingConv.inc" +/// \brief Check if the sign-/zero-extend will be a noop. +static bool isIntExtFree(const Instruction *I) { + assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) && + "Unexpected integer extend instruction."); + assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() && + "Unexpected value type."); + bool IsZExt = isa<ZExtInst>(I); + + if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0))) + if (LI->hasOneUse()) + return true; + + if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) + if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) + return true; + + return false; +} + +/// \brief Determine the implicit scale factor that is applied by a memory +/// operation for a given value type. +static unsigned getImplicitScaleFactor(MVT VT) { + switch (VT.SimpleTy) { + default: + return 0; // invalid + case MVT::i1: // fall-through + case MVT::i8: + return 1; + case MVT::i16: + return 2; + case MVT::i32: // fall-through + case MVT::f32: + return 4; + case MVT::i64: // fall-through + case MVT::f64: + return 8; + } +} + CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { if (CC == CallingConv::WebKit_JS) return CC_AArch64_WebKit_JS; return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; } -unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { +unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) { assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && "Alloca should always return a pointer."); @@ -183,7 +317,7 @@ unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { FuncInfo.StaticAllocaMap.find(AI); if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) .addFrameIndex(SI->second) @@ -195,28 +329,59 @@ unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { return 0; } -unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) { +unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) { + if (VT > MVT::i64) + return 0; + + if (!CI->isZero()) + return fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue()); + + // Create a copy from the zero register to materialize a "0" value. + const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass; + unsigned ZeroReg = (VT == MVT::i64) ? 
AArch64::XZR : AArch64::WZR; + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), + ResultReg).addReg(ZeroReg, getKillRegState(true)); + return ResultReg; +} + +unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) { + // Positive zero (+0.0) has to be materialized with a fmov from the zero + // register, because the immediate version of fmov cannot encode zero. + if (CFP->isNullValue()) + return fastMaterializeFloatZero(CFP); + if (VT != MVT::f32 && VT != MVT::f64) return 0; const APFloat Val = CFP->getValueAPF(); - bool is64bit = (VT == MVT::f64); - + bool Is64Bit = (VT == MVT::f64); // This checks to see if we can use FMOV instructions to materialize // a constant, otherwise we have to materialize via the constant pool. if (TLI.isFPImmLegal(Val, VT)) { - int Imm; - unsigned Opc; - if (is64bit) { - Imm = AArch64_AM::getFP64Imm(Val); - Opc = AArch64::FMOVDi; - } else { - Imm = AArch64_AM::getFP32Imm(Val); - Opc = AArch64::FMOVSi; - } + int Imm = + Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val); + assert((Imm != -1) && "Cannot encode floating-point constant."); + unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi; + return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); + } + + // For the MachO large code model materialize the FP constant in code. + if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm; + const TargetRegisterClass *RC = Is64Bit ? + &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + unsigned TmpReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg) + .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addImm(Imm); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(TmpReg, getKillRegState(true)); + return ResultReg; } @@ -226,20 +391,20 @@ unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) { if (Align == 0) Align = DL.getTypeAllocSize(CFP->getType()); - unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), - ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE); + ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE); - unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui; + unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(ADRPReg) - .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); return ResultReg; } -unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) { +unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { // We can't handle thread-local variables quickly yet. 
if (GV->isThreadLocal()) return 0; @@ -262,30 +427,34 @@ unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) { // ADRP + LDRX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg) - .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE); + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE); ResultReg = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } else if (OpFlags & AArch64II::MO_CONSTPOOL) { + // We can't handle addresses loaded from a constant pool quickly yet. + return 0; } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), - ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE); + ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_PAGE); ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC) - .addImm(0); + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC) + .addImm(0); } return ResultReg; } -unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) { +unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) { EVT CEVT = TLI.getValueType(C->getType(), true); // Only handle simple types. @@ -293,17 +462,48 @@ unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) { return 0; MVT VT = CEVT.getSimpleVT(); - // FIXME: Handle ConstantInt. - if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) - return AArch64MaterializeFP(CFP, VT); + if (const auto *CI = dyn_cast<ConstantInt>(C)) + return materializeInt(CI, VT); + else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return materializeFP(CFP, VT); else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) - return AArch64MaterializeGV(GV); + return materializeGV(GV); return 0; } +unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) { + assert(CFP->isNullValue() && + "Floating-point constant is not a positive zero."); + MVT VT; + if (!isTypeLegal(CFP->getType(), VT)) + return 0; + + if (VT != MVT::f32 && VT != MVT::f64) + return 0; + + bool Is64Bit = (VT == MVT::f64); + unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr; + return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true); +} + +/// \brief Check if the multiply is by a power-of-2 constant. +static bool isMulPowOf2(const Value *I) { + if (const auto *MI = dyn_cast<MulOperator>(I)) { + if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(0))) + if (C->getValue().isPowerOf2()) + return true; + if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(1))) + if (C->getValue().isPowerOf2()) + return true; + } + return false; +} + // Computes the address to get to an object. 
-bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { +bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) +{ const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast<Instruction>(Obj)) { @@ -330,18 +530,18 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { break; case Instruction::BitCast: { // Look through bitcasts. - return ComputeAddress(U->getOperand(0), Addr); + return computeAddress(U->getOperand(0), Addr, Ty); } case Instruction::IntToPtr: { // Look past no-op inttoptrs. if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) - return ComputeAddress(U->getOperand(0), Addr); + return computeAddress(U->getOperand(0), Addr, Ty); break; } case Instruction::PtrToInt: { // Look past no-op ptrtoints. if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) - return ComputeAddress(U->getOperand(0), Addr); + return computeAddress(U->getOperand(0), Addr, Ty); break; } case Instruction::GetElementPtr: { @@ -383,7 +583,7 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { // Try to grab the base operand now. Addr.setOffset(TmpOffset); - if (ComputeAddress(U->getOperand(0), Addr)) + if (computeAddress(U->getOperand(0), Addr, Ty)) return true; // We failed, restore everything and try the other options. @@ -403,14 +603,301 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { } break; } + case Instruction::Add: { + // Adds of constants are common and easy enough. + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (isa<ConstantInt>(LHS)) + std::swap(LHS, RHS); + + if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) { + Addr.setOffset(Addr.getOffset() + CI->getSExtValue()); + return computeAddress(LHS, Addr, Ty); + } + + Address Backup = Addr; + if (computeAddress(LHS, Addr, Ty) && computeAddress(RHS, Addr, Ty)) + return true; + Addr = Backup; + + break; + } + case Instruction::Sub: { + // Subs of constants are common and easy enough. + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) { + Addr.setOffset(Addr.getOffset() - CI->getSExtValue()); + return computeAddress(LHS, Addr, Ty); + } + break; + } + case Instruction::Shl: { + if (Addr.getOffsetReg()) + break; + + const auto *CI = dyn_cast<ConstantInt>(U->getOperand(1)); + if (!CI) + break; + + unsigned Val = CI->getZExtValue(); + if (Val < 1 || Val > 3) + break; + + uint64_t NumBytes = 0; + if (Ty && Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (NumBytes != (1ULL << Val)) + break; + + Addr.setShift(Val); + Addr.setExtendType(AArch64_AM::LSL); + + const Value *Src = U->getOperand(0); + if (const auto *I = dyn_cast<Instruction>(Src)) + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) + Src = I; + + // Fold the zext or sext when it won't become a noop. 
+ if (const auto *ZE = dyn_cast<ZExtInst>(Src)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast<SExtInst>(Src)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } + + if (const auto *AI = dyn_cast<BinaryOperator>(Src)) + if (AI->getOpcode() == Instruction::And) { + const Value *LHS = AI->getOperand(0); + const Value *RHS = AI->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(LHS)) + if (C->getValue() == 0xffffffff) + std::swap(LHS, RHS); + + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 0xffffffff) { + Addr.setExtendType(AArch64_AM::UXTW); + unsigned Reg = getRegForValue(LHS); + if (!Reg) + return false; + bool RegIsKill = hasTrivialKill(LHS); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, + AArch64::sub_32); + Addr.setOffsetReg(Reg); + return true; + } + } + + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + case Instruction::Mul: { + if (Addr.getOffsetReg()) + break; + + if (!isMulPowOf2(U)) + break; + + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + // Canonicalize power-of-2 value to the RHS. + if (const auto *C = dyn_cast<ConstantInt>(LHS)) + if (C->getValue().isPowerOf2()) + std::swap(LHS, RHS); + + assert(isa<ConstantInt>(RHS) && "Expected an ConstantInt."); + const auto *C = cast<ConstantInt>(RHS); + unsigned Val = C->getValue().logBase2(); + if (Val < 1 || Val > 3) + break; + + uint64_t NumBytes = 0; + if (Ty && Ty->isSized()) { + uint64_t NumBits = DL.getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (NumBytes != (1ULL << Val)) + break; + + Addr.setShift(Val); + Addr.setExtendType(AArch64_AM::LSL); + + const Value *Src = LHS; + if (const auto *I = dyn_cast<Instruction>(Src)) + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) + Src = I; + + + // Fold the zext or sext when it won't become a noop. 
+ if (const auto *ZE = dyn_cast<ZExtInst>(Src)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast<SExtInst>(Src)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } + + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + case Instruction::And: { + if (Addr.getOffsetReg()) + break; + + if (!Ty || DL.getTypeSizeInBits(Ty) != 8) + break; + + const Value *LHS = U->getOperand(0); + const Value *RHS = U->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(LHS)) + if (C->getValue() == 0xffffffff) + std::swap(LHS, RHS); + + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 0xffffffff) { + Addr.setShift(0); + Addr.setExtendType(AArch64_AM::LSL); + Addr.setExtendType(AArch64_AM::UXTW); + + unsigned Reg = getRegForValue(LHS); + if (!Reg) + return false; + bool RegIsKill = hasTrivialKill(LHS); + Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, + AArch64::sub_32); + Addr.setOffsetReg(Reg); + return true; + } + break; + } + case Instruction::SExt: + case Instruction::ZExt: { + if (!Addr.getReg() || Addr.getOffsetReg()) + break; + + const Value *Src = nullptr; + // Fold the zext or sext when it won't become a noop. + if (const auto *ZE = dyn_cast<ZExtInst>(U)) { + if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::UXTW); + Src = ZE->getOperand(0); + } + } else if (const auto *SE = dyn_cast<SExtInst>(U)) { + if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { + Addr.setExtendType(AArch64_AM::SXTW); + Src = SE->getOperand(0); + } + } + + if (!Src) + break; + + Addr.setShift(0); + unsigned Reg = getRegForValue(Src); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + } // end switch + + if (Addr.isRegBase() && !Addr.getReg()) { + unsigned Reg = getRegForValue(Obj); + if (!Reg) + return false; + Addr.setReg(Reg); + return true; + } + + if (!Addr.getOffsetReg()) { + unsigned Reg = getRegForValue(Obj); + if (!Reg) + return false; + Addr.setOffsetReg(Reg); + return true; + } + + return false; +} + +bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + bool InMBB = true; + + if (const auto *I = dyn_cast<Instruction>(V)) { + Opcode = I->getOpcode(); + U = I; + InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock(); + } else if (const auto *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts if its operand is in the same BB. + if (InMBB) + return computeCallAddress(U->getOperand(0), Addr); + break; + case Instruction::IntToPtr: + // Look past no-op inttoptrs if its operand is in the same BB. + if (InMBB && + TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return computeCallAddress(U->getOperand(0), Addr); + break; + case Instruction::PtrToInt: + // Look past no-op ptrtoints if its operand is in the same BB. 
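// computeAddress() above folds a shl-by-1..3 (or an equivalent multiply by a
// power of two), plus a free 32-bit zext/sext, into the reg+reg<<shift
// addressing mode, but only when 1 << shift equals the access size.  A minimal
// sketch of that legality test, with a hypothetical helper name:

#include <cstdint>
#include <iostream>

// An access of AccessBytes at Base + (Index << Shift) can use the scaled
// register-offset form (e.g. ldr x0, [x1, x2, lsl #3]) only if the shifted
// index steps in whole elements; the pass additionally restricts the shift
// amount to 1..3.
static bool canFoldShiftIntoAddress(uint64_t AccessBytes, unsigned Shift) {
  if (Shift < 1 || Shift > 3)
    return false;
  return AccessBytes == (1ULL << Shift);
}

int main() {
  std::cout << canFoldShiftIntoAddress(8, 3) << '\n'; // 1: i64 load, lsl #3
  std::cout << canFoldShiftIntoAddress(4, 3) << '\n'; // 0: scale mismatch for i32
  return 0;
}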
+ if (InMBB && + TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return computeCallAddress(U->getOperand(0), Addr); + break; + } + + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + Addr.setGlobalValue(GV); + return true; + } + + // If all else fails, try to materialize the value in a register. + if (!Addr.getGlobalValue()) { + Addr.setReg(getRegForValue(V)); + return Addr.getReg() != 0; } - // Try to get this in a register if nothing else has worked. - if (!Addr.isValid()) - Addr.setReg(getRegForValue(Obj)); - return Addr.isValid(); + return false; } + bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { EVT evt = TLI.getValueType(Ty, true); @@ -428,62 +915,122 @@ bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { return TLI.isTypeLegal(VT); } -bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { +/// \brief Determine if the value type is supported by FastISel. +/// +/// FastISel for AArch64 can handle more value types than are legal. This adds +/// simple value type such as i1, i8, and i16. +bool AArch64FastISel::isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed) { + if (Ty->isVectorTy() && !IsVectorAllowed) + return false; + if (isTypeLegal(Ty, VT)) return true; // If this is a type than can be sign or zero-extended to a basic operation - // go ahead and accept it now. For stores, this reflects truncation. + // go ahead and accept it now. if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) return true; return false; } -bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT, - int64_t ScaleFactor, bool UseUnscaled) { - bool needsLowering = false; - int64_t Offset = Addr.getOffset(); - switch (VT.SimpleTy) { - default: +bool AArch64FastISel::isValueAvailable(const Value *V) const { + if (!isa<Instruction>(V)) + return true; + + const auto *I = cast<Instruction>(V); + if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) + return true; + + return false; +} + +bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - case MVT::i32: - case MVT::i64: - case MVT::f32: - case MVT::f64: - if (!UseUnscaled) - // Using scaled, 12-bit, unsigned immediate offsets. - needsLowering = ((Offset & 0xfff) != Offset); - else - // Using unscaled, 9-bit, signed immediate offsets. - needsLowering = (Offset > 256 || Offset < -256); - break; - } - //If this is a stack pointer and the offset needs to be simplified then put + bool ImmediateOffsetNeedsLowering = false; + bool RegisterOffsetNeedsLowering = false; + int64_t Offset = Addr.getOffset(); + if (((Offset < 0) || (Offset & (ScaleFactor - 1))) && !isInt<9>(Offset)) + ImmediateOffsetNeedsLowering = true; + else if (Offset > 0 && !(Offset & (ScaleFactor - 1)) && + !isUInt<12>(Offset / ScaleFactor)) + ImmediateOffsetNeedsLowering = true; + + // Cannot encode an offset register and an immediate offset in the same + // instruction. Fold the immediate offset into the load/store instruction and + // emit an additonal add to take care of the offset register. + if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg()) + RegisterOffsetNeedsLowering = true; + + // Cannot encode zero register as base. 
+ if (Addr.isRegBase() && Addr.getOffsetReg() && !Addr.getReg()) + RegisterOffsetNeedsLowering = true; + + // If this is a stack pointer and the offset needs to be simplified then put // the alloca address into a register, set the base type back to register and // continue. This should almost never happen. - if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { - unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase()) + { + unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) - .addFrameIndex(Addr.getFI()) - .addImm(0) - .addImm(0); + .addFrameIndex(Addr.getFI()) + .addImm(0) + .addImm(0); Addr.setKind(Address::RegBase); Addr.setReg(ResultReg); } + if (RegisterOffsetNeedsLowering) { + unsigned ResultReg = 0; + if (Addr.getReg()) { + if (Addr.getExtendType() == AArch64_AM::SXTW || + Addr.getExtendType() == AArch64_AM::UXTW ) + ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(), + /*TODO:IsKill=*/false, Addr.getOffsetReg(), + /*TODO:IsKill=*/false, Addr.getExtendType(), + Addr.getShift()); + else + ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(), + /*TODO:IsKill=*/false, Addr.getOffsetReg(), + /*TODO:IsKill=*/false, AArch64_AM::LSL, + Addr.getShift()); + } else { + if (Addr.getExtendType() == AArch64_AM::UXTW) + ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(), + /*Op0IsKill=*/false, Addr.getShift(), + /*IsZExt=*/true); + else if (Addr.getExtendType() == AArch64_AM::SXTW) + ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(), + /*Op0IsKill=*/false, Addr.getShift(), + /*IsZExt=*/false); + else + ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(), + /*Op0IsKill=*/false, Addr.getShift()); + } + if (!ResultReg) + return false; + + Addr.setReg(ResultReg); + Addr.setOffsetReg(0); + Addr.setShift(0); + Addr.setExtendType(AArch64_AM::InvalidShiftExtend); + } + // Since the offset is too large for the load/store instruction get the // reg+offset into a register. - if (needsLowering) { - uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor; - unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false, - UnscaledOffset, MVT::i64); - if (ResultReg == 0) + if (ImmediateOffsetNeedsLowering) { + unsigned ResultReg; + if (Addr.getReg()) + // Try to fold the immediate into the add instruction. + ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset); + else + ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset); + + if (!ResultReg) return false; Addr.setReg(ResultReg); Addr.setOffset(0); @@ -491,222 +1038,1021 @@ bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT, return true; } -void AArch64FastISel::AddLoadStoreOperands(Address &Addr, +void AArch64FastISel::addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled) { - int64_t Offset = Addr.getOffset(); + unsigned Flags, + unsigned ScaleFactor, + MachineMemOperand *MMO) { + int64_t Offset = Addr.getOffset() / ScaleFactor; // Frame base works a bit differently. Handle it separately. - if (Addr.getKind() == Address::FrameIndexBase) { + if (Addr.isFIBase()) { int FI = Addr.getFI(); // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size // and alignment should be based on the VT. 
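// simplifyAddress() above classifies immediate offsets into the two forms the
// load/store instructions can encode: an unscaled signed 9-bit offset
// (ldur/stur) or a scaled unsigned 12-bit offset that is a multiple of the
// access size (ldr/str).  A minimal sketch of that test, assuming the scale
// factors returned by getImplicitScaleFactor() and hypothetical helper names:

#include <cstdint>
#include <iostream>

// Unscaled signed 9-bit form (ldur/stur).
static bool fitsUnscaled(int64_t Offset) {
  return Offset >= -256 && Offset <= 255;
}

// Scaled unsigned 12-bit form (ldr/str): a non-negative multiple of the access
// size whose scaled value fits in 12 bits.
static bool fitsScaled(int64_t Offset, int64_t Scale) {
  return Offset >= 0 && Offset % Scale == 0 && (Offset / Scale) < (1 << 12);
}

// Mirrors the ImmediateOffsetNeedsLowering decision: an extra add is needed
// only when neither encoding can hold the offset.
static bool offsetNeedsLowering(int64_t Offset, int64_t Scale) {
  return !fitsUnscaled(Offset) && !fitsScaled(Offset, Scale);
}

int main() {
  std::cout << offsetNeedsLowering(-8, 8) << '\n';    // 0: ldur covers it
  std::cout << offsetNeedsLowering(32760, 8) << '\n'; // 0: 4095 * 8, ldr covers it
  std::cout << offsetNeedsLowering(4097, 8) << '\n';  // 1: misaligned and too large
  return 0;
}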
- MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. - MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); + MIB.addFrameIndex(FI).addImm(Offset); } else { - // Now add the rest of the operands. - MIB.addReg(Addr.getReg()); - MIB.addImm(Offset); + assert(Addr.isRegBase() && "Unexpected address kind."); + const MCInstrDesc &II = MIB->getDesc(); + unsigned Idx = (Flags & MachineMemOperand::MOStore) ? 1 : 0; + Addr.setReg( + constrainOperandRegClass(II, Addr.getReg(), II.getNumDefs()+Idx)); + Addr.setOffsetReg( + constrainOperandRegClass(II, Addr.getOffsetReg(), II.getNumDefs()+Idx+1)); + if (Addr.getOffsetReg()) { + assert(Addr.getOffset() == 0 && "Unexpected offset"); + bool IsSigned = Addr.getExtendType() == AArch64_AM::SXTW || + Addr.getExtendType() == AArch64_AM::SXTX; + MIB.addReg(Addr.getReg()); + MIB.addReg(Addr.getOffsetReg()); + MIB.addImm(IsSigned); + MIB.addImm(Addr.getShift() != 0); + } else + MIB.addReg(Addr.getReg()).addImm(Offset); } + + if (MMO) + MIB.addMemOperand(MMO); } -bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled) { - // Negative offsets require unscaled, 9-bit, signed immediate offsets. - // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. - if (!UseUnscaled && Addr.getOffset() < 0) - UseUnscaled = true; +unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, + const Value *RHS, bool SetFlags, + bool WantResult, bool IsZExt) { + AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::InvalidShiftExtend; + bool NeedExtend = false; + switch (RetVT.SimpleTy) { + default: + return 0; + case MVT::i1: + NeedExtend = true; + break; + case MVT::i8: + NeedExtend = true; + ExtendType = IsZExt ? AArch64_AM::UXTB : AArch64_AM::SXTB; + break; + case MVT::i16: + NeedExtend = true; + ExtendType = IsZExt ? AArch64_AM::UXTH : AArch64_AM::SXTH; + break; + case MVT::i32: // fall-through + case MVT::i64: + break; + } + MVT SrcVT = RetVT; + RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32); + + // Canonicalize immediates to the RHS first. + if (UseAdd && isa<Constant>(LHS) && !isa<Constant>(RHS)) + std::swap(LHS, RHS); + + // Canonicalize mul by power of 2 to the RHS. + if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) + if (isMulPowOf2(LHS)) + std::swap(LHS, RHS); + + // Canonicalize shift immediate to the RHS. + if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) + if (const auto *SI = dyn_cast<BinaryOperator>(LHS)) + if (isa<ConstantInt>(SI->getOperand(1))) + if (SI->getOpcode() == Instruction::Shl || + SI->getOpcode() == Instruction::LShr || + SI->getOpcode() == Instruction::AShr ) + std::swap(LHS, RHS); + + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return 0; + bool LHSIsKill = hasTrivialKill(LHS); - unsigned Opc; + if (NeedExtend) + LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt); + + unsigned ResultReg = 0; + if (const auto *C = dyn_cast<ConstantInt>(RHS)) { + uint64_t Imm = IsZExt ? 
C->getZExtValue() : C->getSExtValue(); + if (C->isNegative()) + ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm, + SetFlags, WantResult); + else + ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags, + WantResult); + } else if (const auto *C = dyn_cast<Constant>(RHS)) + if (C->isNullValue()) + ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags, + WantResult); + + if (ResultReg) + return ResultReg; + + // Only extend the RHS within the instruction if there is a valid extend type. + if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() && + isValueAvailable(RHS)) { + if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) + if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) + if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { + unsigned RHSReg = getRegForValue(SI->getOperand(0)); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); + return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ExtendType, C->getZExtValue(), + SetFlags, WantResult); + } + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(RHS); + return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + ExtendType, 0, SetFlags, WantResult); + } + + // Check if the mul can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (isMulPowOf2(RHS)) { + const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); + const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(MulLHS)) + if (C->getValue().isPowerOf2()) + std::swap(MulLHS, MulRHS); + + assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt."); + uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2(); + unsigned RHSReg = getRegForValue(MulLHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(MulLHS); + return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + AArch64_AM::LSL, ShiftVal, SetFlags, WantResult); + } + + // Check if the shift can be folded into the instruction. 
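The mul-by-power-of-two canonicalization above exists so the add can be emitted in the shifted-register form (add Rd, Rn, Rm, lsl #k) via emitAddSub_rs, with k recovered by logBase2; the shift fold handled next in the diff works the same way starting from an explicit shl. A minimal standalone sketch of the fold, with illustrative names only:

#include <bit>
#include <cstdint>
#include <cstdio>

// Returns true and sets ShiftVal when "a + b * C" can use the shifted-register
// form "add Rd, Rn, Rm, lsl #ShiftVal". Illustrative only.
bool foldMulIntoAdd(uint64_t C, unsigned &ShiftVal) {
  if (!std::has_single_bit(C))
    return false;                  // also rejects C == 0
  ShiftVal = std::countr_zero(C);  // log2 of the power of two
  return true;
}

int main() {
  unsigned Sh;
  if (foldMulIntoAdd(8, Sh))
    std::printf("add x0, x1, x2, lsl #%u\n", Sh); // lsl #3
}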
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) { + if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { + AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; + switch (SI->getOpcode()) { + default: break; + case Instruction::Shl: ShiftType = AArch64_AM::LSL; break; + case Instruction::LShr: ShiftType = AArch64_AM::LSR; break; + case Instruction::AShr: ShiftType = AArch64_AM::ASR; break; + } + uint64_t ShiftVal = C->getZExtValue(); + if (ShiftType != AArch64_AM::InvalidShiftExtend) { + unsigned RHSReg = getRegForValue(SI->getOperand(0)); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); + return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftType, ShiftVal, SetFlags, + WantResult); + } + } + } + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(RHS); + + if (NeedExtend) + RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt); + + return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + SetFlags, WantResult); +} + +unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, bool SetFlags, + bool WantResult) { + assert(LHSReg && RHSReg && "Invalid register number."); + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return 0; + + static const unsigned OpcTable[2][2][2] = { + { { AArch64::SUBWrr, AArch64::SUBXrr }, + { AArch64::ADDWrr, AArch64::ADDXrr } }, + { { AArch64::SUBSWrr, AArch64::SUBSXrr }, + { AArch64::ADDSWrr, AArch64::ADDSXrr } } + }; + bool Is64Bit = RetVT == MVT::i64; + unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + unsigned ResultReg; + if (WantResult) + ResultReg = createResultReg(RC); + else + ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + + const MCInstrDesc &II = TII.get(Opc); + LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); + RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(RHSReg, getKillRegState(RHSIsKill)); + return ResultReg; +} + +unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, uint64_t Imm, + bool SetFlags, bool WantResult) { + assert(LHSReg && "Invalid register number."); + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return 0; + + unsigned ShiftImm; + if (isUInt<12>(Imm)) + ShiftImm = 0; + else if ((Imm & 0xfff000) == Imm) { + ShiftImm = 12; + Imm >>= 12; + } else + return 0; + + static const unsigned OpcTable[2][2][2] = { + { { AArch64::SUBWri, AArch64::SUBXri }, + { AArch64::ADDWri, AArch64::ADDXri } }, + { { AArch64::SUBSWri, AArch64::SUBSXri }, + { AArch64::ADDSWri, AArch64::ADDSXri } } + }; + bool Is64Bit = RetVT == MVT::i64; + unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; const TargetRegisterClass *RC; - bool VTIsi1 = false; - int64_t ScaleFactor = 0; + if (SetFlags) + RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + else + RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass; + unsigned ResultReg; + if (WantResult) + ResultReg = createResultReg(RC); + else + ResultReg = Is64Bit ? 
AArch64::XZR : AArch64::WZR; + + const MCInstrDesc &II = TII.get(Opc); + LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addImm(Imm) + .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm)); + return ResultReg; +} + +unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, + AArch64_AM::ShiftExtendType ShiftType, + uint64_t ShiftImm, bool SetFlags, + bool WantResult) { + assert(LHSReg && RHSReg && "Invalid register number."); + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return 0; + + static const unsigned OpcTable[2][2][2] = { + { { AArch64::SUBWrs, AArch64::SUBXrs }, + { AArch64::ADDWrs, AArch64::ADDXrs } }, + { { AArch64::SUBSWrs, AArch64::SUBSXrs }, + { AArch64::ADDSWrs, AArch64::ADDSXrs } } + }; + bool Is64Bit = RetVT == MVT::i64; + unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + unsigned ResultReg; + if (WantResult) + ResultReg = createResultReg(RC); + else + ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + + const MCInstrDesc &II = TII.get(Opc); + LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); + RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(RHSReg, getKillRegState(RHSIsKill)) + .addImm(getShifterImm(ShiftType, ShiftImm)); + return ResultReg; +} + +unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, + AArch64_AM::ShiftExtendType ExtType, + uint64_t ShiftImm, bool SetFlags, + bool WantResult) { + assert(LHSReg && RHSReg && "Invalid register number."); + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return 0; + + static const unsigned OpcTable[2][2][2] = { + { { AArch64::SUBWrx, AArch64::SUBXrx }, + { AArch64::ADDWrx, AArch64::ADDXrx } }, + { { AArch64::SUBSWrx, AArch64::SUBSXrx }, + { AArch64::ADDSWrx, AArch64::ADDSXrx } } + }; + bool Is64Bit = RetVT == MVT::i64; + unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; + const TargetRegisterClass *RC = nullptr; + if (SetFlags) + RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + else + RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass; + unsigned ResultReg; + if (WantResult) + ResultReg = createResultReg(RC); + else + ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR; + + const MCInstrDesc &II = TII.get(Opc); + LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); + RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(RHSReg, getKillRegState(RHSIsKill)) + .addImm(getArithExtendImm(ExtType, ShiftImm)); + return ResultReg; +} + +bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) { + Type *Ty = LHS->getType(); + EVT EVT = TLI.getValueType(Ty, true); + if (!EVT.isSimple()) + return false; + MVT VT = EVT.getSimpleVT(); + switch (VT.SimpleTy) { default: return false; case MVT::i1: - VTIsi1 = true; - // Intentional fall-through. case MVT::i8: - Opc = UseUnscaled ? 
AArch64::LDURBBi : AArch64::LDRBBui; + case MVT::i16: + case MVT::i32: + case MVT::i64: + return emitICmp(VT, LHS, RHS, IsZExt); + case MVT::f32: + case MVT::f64: + return emitFCmp(VT, LHS, RHS); + } +} + +bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, + bool IsZExt) { + return emitSub(RetVT, LHS, RHS, /*SetFlags=*/true, /*WantResult=*/false, + IsZExt) != 0; +} + +bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, + uint64_t Imm) { + return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm, + /*SetFlags=*/true, /*WantResult=*/false) != 0; +} + +bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) { + if (RetVT != MVT::f32 && RetVT != MVT::f64) + return false; + + // Check to see if the 2nd operand is a constant that we can encode directly + // in the compare. + bool UseImm = false; + if (const auto *CFP = dyn_cast<ConstantFP>(RHS)) + if (CFP->isZero() && !CFP->isNegative()) + UseImm = true; + + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + if (UseImm) { + unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDri : AArch64::FCMPSri; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + return true; + } + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return false; + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDrr : AArch64::FCMPSrr; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(LHSReg, getKillRegState(LHSIsKill)) + .addReg(RHSReg, getKillRegState(RHSIsKill)); + return true; +} + +unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, + bool SetFlags, bool WantResult, bool IsZExt) { + return emitAddSub(/*UseAdd=*/true, RetVT, LHS, RHS, SetFlags, WantResult, + IsZExt); +} + +/// \brief This method is a wrapper to simplify add emission. +/// +/// First try to emit an add with an immediate operand using emitAddSub_ri. If +/// that fails, then try to materialize the immediate into a register and use +/// emitAddSub_rr instead. 
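The wrapper documented above has to fall back to materializing the constant because the _ri form only encodes a 12-bit unsigned immediate, optionally shifted left by 12. A standalone sketch of that encodability check (illustrative names, not the LLVM helper):

#include <cstdint>
#include <cstdio>

// Sketch of the ADD/SUB immediate encoding: a 12-bit unsigned value, optionally
// shifted left by 12. Anything else must be materialized into a register and
// use the register-register form instead.
bool encodeAddSubImm(uint64_t Imm, uint64_t &Enc, unsigned &Shift) {
  if (Imm < (1ULL << 12)) {
    Enc = Imm; Shift = 0; return true;
  }
  if ((Imm & 0xfff000ULL) == Imm) {
    Enc = Imm >> 12; Shift = 12; return true;
  }
  return false;
}

int main() {
  uint64_t Enc; unsigned Shift;
  std::printf("%d\n", encodeAddSubImm(0x123, Enc, Shift));    // 1, shift 0
  std::printf("%d\n", encodeAddSubImm(0x123000, Enc, Shift)); // 1, shift 12
  std::printf("%d\n", encodeAddSubImm(0x123456, Enc, Shift)); // 0
}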
+unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, + int64_t Imm) { + unsigned ResultReg; + if (Imm < 0) + ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm); + else + ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm); + + if (ResultReg) + return ResultReg; + + unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm); + if (!CReg) + return 0; + + ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true); + return ResultReg; +} + +unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS, + bool SetFlags, bool WantResult, bool IsZExt) { + return emitAddSub(/*UseAdd=*/false, RetVT, LHS, RHS, SetFlags, WantResult, + IsZExt); +} + +unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, bool WantResult) { + return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, /*SetFlags=*/true, WantResult); +} + +unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg, + bool LHSIsKill, unsigned RHSReg, + bool RHSIsKill, + AArch64_AM::ShiftExtendType ShiftType, + uint64_t ShiftImm, bool WantResult) { + return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftType, ShiftImm, /*SetFlags=*/true, + WantResult); +} + +unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, + const Value *LHS, const Value *RHS) { + // Canonicalize immediates to the RHS first. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS)) + std::swap(LHS, RHS); + + // Canonicalize mul by power-of-2 to the RHS. + if (LHS->hasOneUse() && isValueAvailable(LHS)) + if (isMulPowOf2(LHS)) + std::swap(LHS, RHS); + + // Canonicalize shift immediate to the RHS. + if (LHS->hasOneUse() && isValueAvailable(LHS)) + if (const auto *SI = dyn_cast<ShlOperator>(LHS)) + if (isa<ConstantInt>(SI->getOperand(1))) + std::swap(LHS, RHS); + + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return 0; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned ResultReg = 0; + if (const auto *C = dyn_cast<ConstantInt>(RHS)) { + uint64_t Imm = C->getZExtValue(); + ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm); + } + if (ResultReg) + return ResultReg; + + // Check if the mul can be folded into the instruction. + if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (isMulPowOf2(RHS)) { + const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); + const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(MulLHS)) + if (C->getValue().isPowerOf2()) + std::swap(MulLHS, MulRHS); + + assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt."); + uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2(); + + unsigned RHSReg = getRegForValue(MulLHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(MulLHS); + return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + } + + // Check if the shift can be folded into the instruction. 
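The immediate variant used below, emitLogicalOp_ri, only succeeds when AArch64_AM::isLogicalImmediate accepts the value: roughly, a rotated contiguous run of ones replicated across the register, excluding all-zeros and all-ones. A standalone approximation of that rule, for intuition only (not the LLVM encoder):

#include <bit>
#include <cstdint>
#include <cstdio>

// True when the low Bits of V are a single contiguous run of ones, allowing
// the run to wrap around, and are neither all zeros nor all ones.
static bool isRotatedRunOfOnes(uint64_t V, unsigned Bits) {
  uint64_t Mask = (Bits == 64) ? ~0ULL : ((1ULL << Bits) - 1);
  V &= Mask;
  if (V == 0 || V == Mask)
    return false;
  auto IsPlainRun = [](uint64_t X) {
    X >>= std::countr_zero(X);   // drop trailing zeros (X is non-zero here)
    return (X & (X + 1)) == 0;   // now a run of ones starting at bit 0
  };
  // Either the ones form one block, or the zeros do (the wrap-around case).
  return IsPlainRun(V) || IsPlainRun(~V & Mask);
}

// Rough model of the bitmask-immediate rule: some element of 2..RegSize bits,
// itself a rotated run of ones, replicated across the whole register.
bool isLogicalImm(uint64_t Imm, unsigned RegSize) {
  for (unsigned Size = 2; Size <= RegSize; Size *= 2) {
    uint64_t ElemMask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
    uint64_t Elem = Imm & ElemMask;
    bool Replicates = true;
    for (unsigned I = Size; I < RegSize; I += Size)
      if (((Imm >> I) & ElemMask) != Elem)
        Replicates = false;
    if (Replicates && isRotatedRunOfOnes(Elem, Size))
      return true;
  }
  return false;
}

int main() {
  std::printf("%d\n", isLogicalImm(0xff, 32));                  // 1: and w0, w1, #0xff
  std::printf("%d\n", isLogicalImm(0x5555555555555555ULL, 64)); // 1: alternating bits
  std::printf("%d\n", isLogicalImm(0x12345678, 32));            // 0: goes via a register
}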
+ if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (const auto *SI = dyn_cast<ShlOperator>(RHS)) + if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { + uint64_t ShiftVal = C->getZExtValue(); + unsigned RHSReg = getRegForValue(SI->getOperand(0)); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); + return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + } + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return 0; + bool RHSIsKill = hasTrivialKill(RHS); + + MVT VT = std::max(MVT::i32, RetVT.SimpleTy); + ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { + uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + } + return ResultReg; +} + +unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, + unsigned LHSReg, bool LHSIsKill, + uint64_t Imm) { + assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && + "ISD nodes are not consecutive!"); + static const unsigned OpcTable[3][2] = { + { AArch64::ANDWri, AArch64::ANDXri }, + { AArch64::ORRWri, AArch64::ORRXri }, + { AArch64::EORWri, AArch64::EORXri } + }; + const TargetRegisterClass *RC; + unsigned Opc; + unsigned RegSize; + switch (RetVT.SimpleTy) { + default: + return 0; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: { + unsigned Idx = ISDOpc - ISD::AND; + Opc = OpcTable[Idx][0]; + RC = &AArch64::GPR32spRegClass; + RegSize = 32; + break; + } + case MVT::i64: + Opc = OpcTable[ISDOpc - ISD::AND][1]; + RC = &AArch64::GPR64spRegClass; + RegSize = 64; + break; + } + + if (!AArch64_AM::isLogicalImmediate(Imm, RegSize)) + return 0; + + unsigned ResultReg = + fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill, + AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); + if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) { + uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + } + return ResultReg; +} + +unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, + unsigned LHSReg, bool LHSIsKill, + unsigned RHSReg, bool RHSIsKill, + uint64_t ShiftImm) { + assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && + "ISD nodes are not consecutive!"); + static const unsigned OpcTable[3][2] = { + { AArch64::ANDWrs, AArch64::ANDXrs }, + { AArch64::ORRWrs, AArch64::ORRXrs }, + { AArch64::EORWrs, AArch64::EORXrs } + }; + const TargetRegisterClass *RC; + unsigned Opc; + switch (RetVT.SimpleTy) { + default: + return 0; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + Opc = OpcTable[ISDOpc - ISD::AND][0]; RC = &AArch64::GPR32RegClass; + break; + case MVT::i64: + Opc = OpcTable[ISDOpc - ISD::AND][1]; + RC = &AArch64::GPR64RegClass; + break; + } + unsigned ResultReg = + fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill, + AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm)); + if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { + uint64_t Mask = (RetVT == MVT::i8) ? 
0xff : 0xffff; + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + } + return ResultReg; +} + +unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, + uint64_t Imm) { + return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm); +} + +unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, + bool WantZExt, MachineMemOperand *MMO) { + // Simplify this down to something we can handle. + if (!simplifyAddress(Addr, VT)) + return 0; + + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + llvm_unreachable("Unexpected value type."); + + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + bool UseScaled = true; + if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) { + UseScaled = false; ScaleFactor = 1; + } + + static const unsigned GPOpcTable[2][8][4] = { + // Sign-extend. + { { AArch64::LDURSBWi, AArch64::LDURSHWi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDURSBXi, AArch64::LDURSHXi, AArch64::LDURSWi, + AArch64::LDURXi }, + { AArch64::LDRSBWui, AArch64::LDRSHWui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRSBXui, AArch64::LDRSHXui, AArch64::LDRSWui, + AArch64::LDRXui }, + { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX, + AArch64::LDRXroX }, + { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW, + AArch64::LDRXroW }, + { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW, + AArch64::LDRXroW } + }, + // Zero-extend. + { { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, + AArch64::LDURXi }, + { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, + AArch64::LDRXui }, + { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, + AArch64::LDRXroX }, + { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, + AArch64::LDRXroW }, + { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, + AArch64::LDRXroW } + } + }; + + static const unsigned FPOpcTable[4][2] = { + { AArch64::LDURSi, AArch64::LDURDi }, + { AArch64::LDRSui, AArch64::LDRDui }, + { AArch64::LDRSroX, AArch64::LDRDroX }, + { AArch64::LDRSroW, AArch64::LDRDroW } + }; + + unsigned Opc; + const TargetRegisterClass *RC; + bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && + Addr.getOffsetReg(); + unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; + if (Addr.getExtendType() == AArch64_AM::UXTW || + Addr.getExtendType() == AArch64_AM::SXTW) + Idx++; + + bool IsRet64Bit = RetVT == MVT::i64; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Unexpected value type."); + case MVT::i1: // Intentional fall-through. + case MVT::i8: + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; break; case MVT::i16: - Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui; - RC = &AArch64::GPR32RegClass; - ScaleFactor = 2; + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; break; case MVT::i32: - Opc = UseUnscaled ? 
AArch64::LDURWi : AArch64::LDRWui; - RC = &AArch64::GPR32RegClass; - ScaleFactor = 4; + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2]; + RC = (IsRet64Bit && !WantZExt) ? + &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; break; case MVT::i64: - Opc = UseUnscaled ? AArch64::LDURXi : AArch64::LDRXui; + Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3]; RC = &AArch64::GPR64RegClass; - ScaleFactor = 8; break; case MVT::f32: - Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui; - RC = TLI.getRegClassFor(VT); - ScaleFactor = 4; + Opc = FPOpcTable[Idx][0]; + RC = &AArch64::FPR32RegClass; break; case MVT::f64: - Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui; - RC = TLI.getRegClassFor(VT); - ScaleFactor = 8; + Opc = FPOpcTable[Idx][1]; + RC = &AArch64::FPR64RegClass; break; } - // Scale the offset. - if (!UseUnscaled) { - int64_t Offset = Addr.getOffset(); - if (Offset & (ScaleFactor - 1)) - // Retry using an unscaled, 9-bit, signed immediate offset. - return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true); - - Addr.setOffset(Offset / ScaleFactor); - } - - // Simplify this down to something we can handle. - if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) - return false; // Create the base instruction, then add the operands. - ResultReg = createResultReg(RC); + unsigned ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); - AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled); + addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO); // Loading an i1 requires special handling. - if (VTIsi1) { - MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass); - unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), - ANDReg) - .addReg(ResultReg) - .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + if (VT == MVT::i1) { + unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1); + assert(ANDReg && "Unexpected AND instruction emission failure."); ResultReg = ANDReg; } + + // For zero-extending loads to 64bit we emit a 32bit load and then convert + // the 32bit reg to a 64bit reg. 
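The reason a 32-bit load suffices here is that any write to a W register zeroes bits 63:32, so the SUBREG_TO_REG emitted next is register-allocator bookkeeping rather than a real instruction. A loose host-side analogue of that guarantee:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Mem = 0xffffffff80000001ULL;       // pretend memory cell
  uint32_t W   = static_cast<uint32_t>(Mem);  // like "ldr w0, [x1]": keep the low 32 bits
  uint64_t X   = W;                           // upper 32 bits are zero by construction
  std::printf("0x%016llx\n", static_cast<unsigned long long>(X)); // 0x0000000080000001
}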
+ if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) { + unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(ResultReg, getKillRegState(true)) + .addImm(AArch64::sub_32); + ResultReg = Reg64; + } + return ResultReg; +} + +bool AArch64FastISel::selectAddSub(const Instruction *I) { + MVT VT; + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) + return false; + + if (VT.isVector()) + return selectOperator(I, I->getOpcode()); + + unsigned ResultReg; + switch (I->getOpcode()) { + default: + llvm_unreachable("Unexpected instruction."); + case Instruction::Add: + ResultReg = emitAdd(VT, I->getOperand(0), I->getOperand(1)); + break; + case Instruction::Sub: + ResultReg = emitSub(VT, I->getOperand(0), I->getOperand(1)); + break; + } + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectLogicalOp(const Instruction *I) { + MVT VT; + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) + return false; + + if (VT.isVector()) + return selectOperator(I, I->getOpcode()); + + unsigned ResultReg; + switch (I->getOpcode()) { + default: + llvm_unreachable("Unexpected instruction."); + case Instruction::And: + ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1)); + break; + case Instruction::Or: + ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1)); + break; + case Instruction::Xor: + ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1)); + break; + } + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); return true; } -bool AArch64FastISel::SelectLoad(const Instruction *I) { +bool AArch64FastISel::selectLoad(const Instruction *I) { MVT VT; // Verify we have a legal type before going any further. Currently, we handle // simple types that will directly fit in a register (i32/f32/i64/f64) or // those that can be sign or zero-extended to a basic operation (i1/i8/i16). - if (!isLoadStoreTypeLegal(I->getType(), VT) || cast<LoadInst>(I)->isAtomic()) + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true) || + cast<LoadInst>(I)->isAtomic()) return false; // See if we can handle this address. Address Addr; - if (!ComputeAddress(I->getOperand(0), Addr)) + if (!computeAddress(I->getOperand(0), Addr, I->getType())) return false; - unsigned ResultReg; - if (!EmitLoad(VT, ResultReg, Addr)) + // Fold the following sign-/zero-extend into the load instruction. + bool WantZExt = true; + MVT RetVT = VT; + const Value *IntExtVal = nullptr; + if (I->hasOneUse()) { + if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) { + if (isTypeSupported(ZE->getType(), RetVT)) + IntExtVal = ZE; + else + RetVT = VT; + } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) { + if (isTypeSupported(SE->getType(), RetVT)) + IntExtVal = SE; + else + RetVT = VT; + WantZExt = false; + } + } + + unsigned ResultReg = + emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I)); + if (!ResultReg) return false; - UpdateValueMap(I, ResultReg); + // There are a few different cases we have to handle, because the load or the + // sign-/zero-extend might not be selected by FastISel if we fall-back to + // SelectionDAG. There is also an ordering issue when both instructions are in + // different basic blocks. + // 1.) The load instruction is selected by FastISel, but the integer extend + // not. 
This usually happens when the integer extend is in a different + // basic block and SelectionDAG took over for that basic block. + // 2.) The load instruction is selected before the integer extend. This only + // happens when the integer extend is in a different basic block. + // 3.) The load instruction is selected by SelectionDAG and the integer extend + // by FastISel. This happens if there are instructions between the load + // and the integer extend that couldn't be selected by FastISel. + if (IntExtVal) { + // The integer extend hasn't been emitted yet. FastISel or SelectionDAG + // could select it. Emit a copy to subreg if necessary. FastISel will remove + // it when it selects the integer extend. + unsigned Reg = lookUpRegForValue(IntExtVal); + if (!Reg) { + if (RetVT == MVT::i64 && VT <= MVT::i32) { + if (WantZExt) { + // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG). + std::prev(FuncInfo.InsertPt)->eraseFromParent(); + ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg(); + } else + ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg, + /*IsKill=*/true, + AArch64::sub_32); + } + updateValueMap(I, ResultReg); + return true; + } + + // The integer extend has already been emitted - delete all the instructions + // that have been emitted by the integer extend lowering code and use the + // result from the load instruction directly. + while (Reg) { + auto *MI = MRI.getUniqueVRegDef(Reg); + if (!MI) + break; + Reg = 0; + for (auto &Opnd : MI->uses()) { + if (Opnd.isReg()) { + Reg = Opnd.getReg(); + break; + } + } + MI->eraseFromParent(); + } + updateValueMap(IntExtVal, ResultReg); + return true; + } + + updateValueMap(I, ResultReg); return true; } -bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled) { +bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, + MachineMemOperand *MMO) { + // Simplify this down to something we can handle. + if (!simplifyAddress(Addr, VT)) + return false; + + unsigned ScaleFactor = getImplicitScaleFactor(VT); + if (!ScaleFactor) + llvm_unreachable("Unexpected value type."); + // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. - if (!UseUnscaled && Addr.getOffset() < 0) - UseUnscaled = true; - - unsigned StrOpc; - bool VTIsi1 = false; - int64_t ScaleFactor = 0; - // Using scaled, 12-bit, unsigned immediate offsets. - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - VTIsi1 = true; - case MVT::i8: - StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui; + bool UseScaled = true; + if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) { + UseScaled = false; ScaleFactor = 1; - break; - case MVT::i16: - StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui; - ScaleFactor = 2; - break; - case MVT::i32: - StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui; - ScaleFactor = 4; - break; - case MVT::i64: - StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui; - ScaleFactor = 8; - break; - case MVT::f32: - StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui; - ScaleFactor = 4; - break; - case MVT::f64: - StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui; - ScaleFactor = 8; - break; } - // Scale the offset. - if (!UseUnscaled) { - int64_t Offset = Addr.getOffset(); - if (Offset & (ScaleFactor - 1)) - // Retry using an unscaled, 9-bit, signed immediate offset. 
- return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true); - Addr.setOffset(Offset / ScaleFactor); - } + static const unsigned OpcTable[4][6] = { + { AArch64::STURBBi, AArch64::STURHHi, AArch64::STURWi, AArch64::STURXi, + AArch64::STURSi, AArch64::STURDi }, + { AArch64::STRBBui, AArch64::STRHHui, AArch64::STRWui, AArch64::STRXui, + AArch64::STRSui, AArch64::STRDui }, + { AArch64::STRBBroX, AArch64::STRHHroX, AArch64::STRWroX, AArch64::STRXroX, + AArch64::STRSroX, AArch64::STRDroX }, + { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW, + AArch64::STRSroW, AArch64::STRDroW } + }; - // Simplify this down to something we can handle. - if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) - return false; + unsigned Opc; + bool VTIsi1 = false; + bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && + Addr.getOffsetReg(); + unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; + if (Addr.getExtendType() == AArch64_AM::UXTW || + Addr.getExtendType() == AArch64_AM::SXTW) + Idx++; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i1: VTIsi1 = true; + case MVT::i8: Opc = OpcTable[Idx][0]; break; + case MVT::i16: Opc = OpcTable[Idx][1]; break; + case MVT::i32: Opc = OpcTable[Idx][2]; break; + case MVT::i64: Opc = OpcTable[Idx][3]; break; + case MVT::f32: Opc = OpcTable[Idx][4]; break; + case MVT::f64: Opc = OpcTable[Idx][5]; break; + } // Storing an i1 requires special handling. - if (VTIsi1) { - MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); - unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), - ANDReg) - .addReg(SrcReg) - .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + if (VTIsi1 && SrcReg != AArch64::WZR) { + unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1); + assert(ANDReg && "Unexpected AND instruction emission failure."); SrcReg = ANDReg; } // Create the base instruction, then add the operands. - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(StrOpc)).addReg(SrcReg); - AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled); + const MCInstrDesc &II = TII.get(Opc); + SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(SrcReg); + addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, ScaleFactor, MMO); + return true; } -bool AArch64FastISel::SelectStore(const Instruction *I) { +bool AArch64FastISel::selectStore(const Instruction *I) { MVT VT; - Value *Op0 = I->getOperand(0); + const Value *Op0 = I->getOperand(0); // Verify we have a legal type before going any further. Currently, we handle // simple types that will directly fit in a register (i32/f32/i64/f64) or // those that can be sign or zero-extended to a basic operation (i1/i8/i16). - if (!isLoadStoreTypeLegal(Op0->getType(), VT) || + if (!isTypeSupported(Op0->getType(), VT, /*IsVectorAllowed=*/true) || cast<StoreInst>(I)->isAtomic()) return false; - // Get the value to be stored into a register. - unsigned SrcReg = getRegForValue(Op0); - if (SrcReg == 0) + // Get the value to be stored into a register. Use the zero register directly + // when possible to avoid an unnecessary copy and a wasted register. + unsigned SrcReg = 0; + if (const auto *CI = dyn_cast<ConstantInt>(Op0)) { + if (CI->isZero()) + SrcReg = (VT == MVT::i64) ? 
AArch64::XZR : AArch64::WZR; + } else if (const auto *CF = dyn_cast<ConstantFP>(Op0)) { + if (CF->isZero() && !CF->isNegative()) { + VT = MVT::getIntegerVT(VT.getSizeInBits()); + SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; + } + } + + if (!SrcReg) + SrcReg = getRegForValue(Op0); + + if (!SrcReg) return false; // See if we can handle this address. Address Addr; - if (!ComputeAddress(I->getOperand(1), Addr)) + if (!computeAddress(I->getOperand(1), Addr, I->getOperand(0)->getType())) return false; - if (!EmitStore(VT, SrcReg, Addr)) + if (!emitStore(VT, SrcReg, Addr, createMachineMemOperandFor(I))) return false; return true; } @@ -757,58 +2103,235 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { } } -bool AArch64FastISel::SelectBranch(const Instruction *I) { +/// \brief Try to emit a combined compare-and-branch instruction. +bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { + assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction"); + const CmpInst *CI = cast<CmpInst>(BI->getCondition()); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + MVT VT; + if (!isTypeSupported(LHS->getType(), VT)) + return false; + + unsigned BW = VT.getSizeInBits(); + if (BW > 64) + return false; + + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + int TestBit = -1; + bool IsCmpNE; + switch (Predicate) { + default: + return false; + case CmpInst::ICMP_EQ: + case CmpInst::ICMP_NE: + if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue()) + std::swap(LHS, RHS); + + if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue()) + return false; + + if (const auto *AI = dyn_cast<BinaryOperator>(LHS)) + if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) { + const Value *AndLHS = AI->getOperand(0); + const Value *AndRHS = AI->getOperand(1); + + if (const auto *C = dyn_cast<ConstantInt>(AndLHS)) + if (C->getValue().isPowerOf2()) + std::swap(AndLHS, AndRHS); + + if (const auto *C = dyn_cast<ConstantInt>(AndRHS)) + if (C->getValue().isPowerOf2()) { + TestBit = C->getValue().logBase2(); + LHS = AndLHS; + } + } + + if (VT == MVT::i1) + TestBit = 0; + + IsCmpNE = Predicate == CmpInst::ICMP_NE; + break; + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SGE: + if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue()) + return false; + + TestBit = BW - 1; + IsCmpNE = Predicate == CmpInst::ICMP_SLT; + break; + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SLE: + if (!isa<ConstantInt>(RHS)) + return false; + + if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true)) + return false; + + TestBit = BW - 1; + IsCmpNE = Predicate == CmpInst::ICMP_SLE; + break; + } // end switch + + static const unsigned OpcTable[2][2][2] = { + { {AArch64::CBZW, AArch64::CBZX }, + {AArch64::CBNZW, AArch64::CBNZX} }, + { {AArch64::TBZW, AArch64::TBZX }, + {AArch64::TBNZW, AArch64::TBNZX} } + }; + + bool IsBitTest = TestBit != -1; + bool Is64Bit = BW == 64; + if (TestBit < 32 && TestBit >= 0) + Is64Bit = false; + + unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit]; + const MCInstrDesc &II = TII.get(Opc); + + unsigned SrcReg = getRegForValue(LHS); + if (!SrcReg) + return false; + bool SrcIsKill = 
hasTrivialKill(LHS); + + if (BW == 64 && !Is64Bit) + SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, + AArch64::sub_32); + + if ((BW < 32) && !IsBitTest) + SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true); + + // Emit the combined compare and branch instruction. + SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) + .addReg(SrcReg, getKillRegState(SrcIsKill)); + if (IsBitTest) + MIB.addImm(TestBit); + MIB.addMBB(TBB); + + // Obtain the branch weight and add the TrueBB to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + fastEmitBranch(FBB, DbgLoc); + + return true; +} + +bool AArch64FastISel::selectBranch(const Instruction *I) { const BranchInst *BI = cast<BranchInst>(I); + if (BI->isUnconditional()) { + MachineBasicBlock *MSucc = FuncInfo.MBBMap[BI->getSuccessor(0)]; + fastEmitBranch(MSucc, BI->getDebugLoc()); + return true; + } + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + AArch64CC::CondCode CC = AArch64CC::NE; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { - if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { - // We may not handle every CC for now. - AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == AArch64CC::AL) - return false; + if (CI->hasOneUse() && isValueAvailable(CI)) { + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: + break; + case CmpInst::FCMP_FALSE: + fastEmitBranch(FBB, DbgLoc); + return true; + case CmpInst::FCMP_TRUE: + fastEmitBranch(TBB, DbgLoc); + return true; + } + + // Try to emit a combined compare-and-branch first. + if (emitCompareAndBranch(BI)) + return true; + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } // Emit the cmp. - if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; + // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch + // instruction. + CC = getCompareCC(Predicate); + AArch64CC::CondCode ExtraCC = AArch64CC::AL; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_UEQ: + ExtraCC = AArch64CC::EQ; + CC = AArch64CC::VS; + break; + case CmpInst::FCMP_ONE: + ExtraCC = AArch64CC::MI; + CC = AArch64CC::GT; + break; + } + assert((CC != AArch64CC::AL) && "Unexpected condition code."); + + // Emit the extra branch for FCMP_UEQ and FCMP_ONE. + if (ExtraCC != AArch64CC::AL) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(ExtraCC) + .addMBB(TBB); + } + // Emit the branch. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - FastEmitBranch(FBB, DbgLoc); + // Obtain the branch weight and add the TrueBB to the successor list. 
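The compare-and-branch path above boils several integer predicates down to a single-bit test so it can use TBZ/TBNZ (or CBZ/CBNZ when no particular bit is singled out). A standalone sketch of just that derivation, mirroring the switch over predicates; the names and the AndMaskLog2 parameter are illustrative:

#include <cstdint>
#include <cstdio>

enum Pred { EQ, NE, SLT, SGE, SGT, SLE };

// Decide whether "icmp P x, RHS; br" can become a single TBZ/TBNZ.
// AndMaskLog2 is k when x is known to be (y & (1 << k)), else -1.
// On success, TestBit is the bit to test and BranchIfSet says whether the
// branch is taken when that bit is 1 (the TBNZ case).
bool bitTestFor(Pred P, int64_t RHS, unsigned BW, int AndMaskLog2,
                int &TestBit, bool &BranchIfSet) {
  switch (P) {
  case EQ: case NE:            // (y & 2^k) ==/!= 0   ->  TBZ/TBNZ bit k
    if (RHS != 0 || AndMaskLog2 < 0)
      return false;
    TestBit = AndMaskLog2; BranchIfSet = (P == NE);
    return true;
  case SLT: case SGE:          // x < 0 / x >= 0      ->  test the sign bit
    if (RHS != 0)
      return false;
    TestBit = BW - 1; BranchIfSet = (P == SLT);
    return true;
  case SGT: case SLE:          // x > -1 / x <= -1    ->  test the sign bit
    if (RHS != -1)
      return false;
    TestBit = BW - 1; BranchIfSet = (P == SLE);
    return true;
  }
  return false;
}

int main() {
  int Bit; bool IfSet;
  if (bitTestFor(SLT, 0, 64, -1, Bit, IfSet))
    std::printf("tb%sz x0, #%d, target\n", IfSet ? "n" : "", Bit); // tbnz x0, #63
}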
+ uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + + fastEmitBranch(FBB, DbgLoc); return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { MVT SrcVT; - if (TI->hasOneUse() && TI->getParent() == I->getParent() && - (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) { + if (TI->hasOneUse() && isValueAvailable(TI) && + isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) { unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (CondReg == 0) + if (!CondReg) return false; + bool CondIsKill = hasTrivialKill(TI->getOperand(0)); // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) - CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true, + if (SrcVT == MVT::i64) { + CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill, AArch64::sub_32); + CondIsKill = true; + } - MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); - unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(AArch64::ANDWri), ANDReg) - .addReg(CondReg) - .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(AArch64::SUBSWri)) - .addReg(ANDReg) - .addReg(ANDReg) - .addImm(0) - .addImm(0); + unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1); + assert(ANDReg && "Unexpected AND instruction emission failure."); + emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - unsigned CC = AArch64CC::NE; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); CC = AArch64CC::EQ; @@ -816,23 +2339,57 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - FastEmitBranch(FBB, DbgLoc); + + // Obtain the branch weight and add the TrueBB to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + + fastEmitBranch(FBB, DbgLoc); return true; } - } else if (const ConstantInt *CI = - dyn_cast<ConstantInt>(BI->getCondition())) { + } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { uint64_t Imm = CI->getZExtValue(); MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) .addMBB(Target); - FuncInfo.MBB->addSuccessor(Target); + + // Obtain the branch weight and add the target to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + Target->getBasicBlock()); + FuncInfo.MBB->addSuccessor(Target, BranchWeight); + return true; + } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned CondReg = getRegForValue(BI->getCondition()); + if (!CondReg) + return false; + + // Emit the branch. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + + // Obtain the branch weight and add the TrueBB to the successor list. 
+ uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + + fastEmitBranch(FBB, DbgLoc); return true; } unsigned CondReg = getRegForValue(BI->getCondition()); if (CondReg == 0) return false; + bool CondRegIsKill = hasTrivialKill(BI->getCondition()); // We've been divorced from our compare! Our block was split, and // now our compare lives in a predecessor block. We musn't @@ -841,13 +2398,8 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) { // Regardless, the compare has been done in the predecessor block, // and it left a value for us in a virtual register. Ergo, we test // the one-bit value left in the virtual register. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri), - AArch64::WZR) - .addReg(CondReg) - .addImm(0) - .addImm(0); + emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0); - unsigned CC = AArch64CC::NE; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); CC = AArch64CC::EQ; @@ -856,20 +2408,28 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - FastEmitBranch(FBB, DbgLoc); + + // Obtain the branch weight and add the TrueBB to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + + fastEmitBranch(FBB, DbgLoc); return true; } -bool AArch64FastISel::SelectIndirectBr(const Instruction *I) { +bool AArch64FastISel::selectIndirectBr(const Instruction *I) { const IndirectBrInst *BI = cast<IndirectBrInst>(I); unsigned AddrReg = getRegForValue(BI->getOperand(0)); if (AddrReg == 0) return false; // Emit the indirect branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR)) - .addReg(AddrReg); + const MCInstrDesc &II = TII.get(AArch64::BR); + AddrReg = constrainOperandRegClass(II, AddrReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg); // Make sure the CFG is up-to-date. for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) @@ -878,211 +2438,271 @@ bool AArch64FastISel::SelectIndirectBr(const Instruction *I) { return true; } -bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { - Type *Ty = Src1Value->getType(); - EVT SrcEVT = TLI.getValueType(Ty, true); - if (!SrcEVT.isSimple()) - return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - - // Check to see if the 2nd operand is a constant that we can encode directly - // in the compare. - uint64_t Imm; - bool UseImm = false; - bool isNegativeImm = false; - if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) { - if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 || - SrcVT == MVT::i8 || SrcVT == MVT::i1) { - const APInt &CIVal = ConstInt->getValue(); - - Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue(); - if (CIVal.isNegative()) { - isNegativeImm = true; - Imm = -Imm; - } - // FIXME: We can handle more immediates using shifts. 
- UseImm = ((Imm & 0xfff) == Imm); - } - } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) { - if (SrcVT == MVT::f32 || SrcVT == MVT::f64) - if (ConstFP->isZero() && !ConstFP->isNegative()) - UseImm = true; - } +bool AArch64FastISel::selectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); - unsigned ZReg; - unsigned CmpOpc; - bool isICmp = true; - bool needsExt = false; - switch (SrcVT.SimpleTy) { + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { default: - return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - needsExt = true; - // Intentional fall-through. - case MVT::i32: - ZReg = AArch64::WZR; - if (UseImm) - CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri; - else - CmpOpc = AArch64::SUBSWrr; break; - case MVT::i64: - ZReg = AArch64::XZR; - if (UseImm) - CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri; - else - CmpOpc = AArch64::SUBSXrr; - break; - case MVT::f32: - isICmp = false; - CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr; + case CmpInst::FCMP_FALSE: + ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(AArch64::WZR, getKillRegState(true)); break; - case MVT::f64: - isICmp = false; - CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr; + case CmpInst::FCMP_TRUE: + ResultReg = fastEmit_i(MVT::i32, MVT::i32, ISD::Constant, 1); break; } - unsigned SrcReg1 = getRegForValue(Src1Value); - if (SrcReg1 == 0) - return false; - - unsigned SrcReg2; - if (!UseImm) { - SrcReg2 = getRegForValue(Src2Value); - if (SrcReg2 == 0) - return false; + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; } - // We have i1, i8, or i16, we need to either zero extend or sign extend. - if (needsExt) { - SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt); - if (SrcReg1 == 0) - return false; - if (!UseImm) { - SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt); - if (SrcReg2 == 0) - return false; - } - } + // Emit the cmp. + if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; - if (isICmp) { - if (UseImm) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(ZReg) - .addReg(SrcReg1) - .addImm(Imm) - .addImm(0); - else - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(ZReg) - .addReg(SrcReg1) - .addReg(SrcReg2); - } else { - if (UseImm) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(SrcReg1); - else - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(SrcReg1) - .addReg(SrcReg2); - } - return true; -} + ResultReg = createResultReg(&AArch64::GPR32RegClass); -bool AArch64FastISel::SelectCmp(const Instruction *I) { - const CmpInst *CI = cast<CmpInst>(I); + // FCMP_UEQ and FCMP_ONE cannot be checked with a single instruction. These + // condition codes are inverted, because they are used by CSINC. + static unsigned CondCodeTable[2][2] = { + { AArch64CC::NE, AArch64CC::VC }, + { AArch64CC::PL, AArch64CC::LE } + }; + unsigned *CondCodes = nullptr; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_UEQ: + CondCodes = &CondCodeTable[0][0]; + break; + case CmpInst::FCMP_ONE: + CondCodes = &CondCodeTable[1][0]; + break; + } - // We may not handle every CC for now. 
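The table just added reflects that FCMP_UEQ and FCMP_ONE have no single AArch64 condition code: after an fcmp, UEQ is "equal or unordered" (EQ or VS) and ONE is "less or greater, ordered" (MI or GT), which is why the branch lowering earlier in this hunk emits an extra B.cc and the cmp lowering below chains two CSINCs using the inverted codes. A small host-side model of that decomposition, assuming IEEE comparison semantics:

#include <cmath>
#include <cstdio>

struct FCmpResult { bool Less, Equal, Greater, Unordered; }; // exactly one is true

FCmpResult fcmp(double A, double B) {
  if (std::isnan(A) || std::isnan(B)) return {false, false, false, true};
  if (A < B)  return {true, false, false, false};
  if (A == B) return {false, true, false, false};
  return {false, false, true, false};
}

int main() {
  double Vals[] = {1.0, 2.0, NAN};
  for (double A : Vals)
    for (double B : Vals) {
      FCmpResult F = fcmp(A, B);
      bool UEQ = F.Equal || F.Unordered;   // EQ condition, then VS condition
      bool ONE = F.Less || F.Greater;      // MI condition, then GT condition
      bool RefUEQ = !(A < B) && !(A > B);  // IEEE: equal or unordered
      bool RefONE = (A < B) || (A > B);    // IEEE: ordered and unequal
      if (UEQ != RefUEQ || ONE != RefONE) { std::puts("mismatch"); return 1; }
    }
  std::puts("FCMP_UEQ = EQ|VS and FCMP_ONE = MI|GT hold on these inputs");
}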
- AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == AArch64CC::AL) - return false; + if (CondCodes) { + unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), + TmpReg1) + .addReg(AArch64::WZR, getKillRegState(true)) + .addReg(AArch64::WZR, getKillRegState(true)) + .addImm(CondCodes[0]); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), + ResultReg) + .addReg(TmpReg1, getKillRegState(true)) + .addReg(AArch64::WZR, getKillRegState(true)) + .addImm(CondCodes[1]); - // Emit the cmp. - if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) - return false; + updateValueMap(I, ResultReg); + return true; + } // Now set a register based on the comparison. + AArch64CC::CondCode CC = getCompareCC(Predicate); + assert((CC != AArch64CC::AL) && "Unexpected condition code."); AArch64CC::CondCode invertedCC = getInvertedCondCode(CC); - unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), ResultReg) - .addReg(AArch64::WZR) - .addReg(AArch64::WZR) + .addReg(AArch64::WZR, getKillRegState(true)) + .addReg(AArch64::WZR, getKillRegState(true)) .addImm(invertedCC); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } -bool AArch64FastISel::SelectSelect(const Instruction *I) { - const SelectInst *SI = cast<SelectInst>(I); - - EVT DestEVT = TLI.getValueType(SI->getType(), true); - if (!DestEVT.isSimple()) +/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false' +/// value. +bool AArch64FastISel::optimizeSelect(const SelectInst *SI) { + if (!SI->getType()->isIntegerTy(1)) return false; - MVT DestVT = DestEVT.getSimpleVT(); - if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 && - DestVT != MVT::f64) - return false; + const Value *Src1Val, *Src2Val; + unsigned Opc = 0; + bool NeedExtraOp = false; + if (auto *CI = dyn_cast<ConstantInt>(SI->getTrueValue())) { + if (CI->isOne()) { + Src1Val = SI->getCondition(); + Src2Val = SI->getFalseValue(); + Opc = AArch64::ORRWrr; + } else { + assert(CI->isZero()); + Src1Val = SI->getFalseValue(); + Src2Val = SI->getCondition(); + Opc = AArch64::BICWrr; + } + } else if (auto *CI = dyn_cast<ConstantInt>(SI->getFalseValue())) { + if (CI->isOne()) { + Src1Val = SI->getCondition(); + Src2Val = SI->getTrueValue(); + Opc = AArch64::ORRWrr; + NeedExtraOp = true; + } else { + assert(CI->isZero()); + Src1Val = SI->getCondition(); + Src2Val = SI->getTrueValue(); + Opc = AArch64::ANDWrr; + } + } - unsigned CondReg = getRegForValue(SI->getCondition()); - if (CondReg == 0) - return false; - unsigned TrueReg = getRegForValue(SI->getTrueValue()); - if (TrueReg == 0) + if (!Opc) return false; - unsigned FalseReg = getRegForValue(SI->getFalseValue()); - if (FalseReg == 0) + + unsigned Src1Reg = getRegForValue(Src1Val); + if (!Src1Reg) return false; + bool Src1IsKill = hasTrivialKill(Src1Val); + unsigned Src2Reg = getRegForValue(Src2Val); + if (!Src2Reg) + return false; + bool Src2IsKill = hasTrivialKill(Src2Val); - MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); - unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), - ANDReg) - .addReg(CondReg) - .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + if (NeedExtraOp) { + Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1); + Src1IsKill 
= true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32spRegClass, Src1Reg, + Src1IsKill, Src2Reg, Src2IsKill); + updateValueMap(SI, ResultReg); + return true; +} - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri)) - .addReg(ANDReg) - .addReg(ANDReg) - .addImm(0) - .addImm(0); +bool AArch64FastISel::selectSelect(const Instruction *I) { + assert(isa<SelectInst>(I) && "Expected a select instruction."); + MVT VT; + if (!isTypeSupported(I->getType(), VT)) + return false; - unsigned SelectOpc; - switch (DestVT.SimpleTy) { + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: case MVT::i32: - SelectOpc = AArch64::CSELWr; + Opc = AArch64::CSELWr; + RC = &AArch64::GPR32RegClass; break; case MVT::i64: - SelectOpc = AArch64::CSELXr; + Opc = AArch64::CSELXr; + RC = &AArch64::GPR64RegClass; break; case MVT::f32: - SelectOpc = AArch64::FCSELSrrr; + Opc = AArch64::FCSELSrrr; + RC = &AArch64::FPR32RegClass; break; case MVT::f64: - SelectOpc = AArch64::FCSELDrrr; + Opc = AArch64::FCSELDrrr; + RC = &AArch64::FPR64RegClass; break; } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc), - ResultReg) - .addReg(TrueReg) - .addReg(FalseReg) - .addImm(AArch64CC::NE); + const SelectInst *SI = cast<SelectInst>(I); + const Value *Cond = SI->getCondition(); + AArch64CC::CondCode CC = AArch64CC::NE; + AArch64CC::CondCode ExtraCC = AArch64CC::AL; - UpdateValueMap(I, ResultReg); + if (optimizeSelect(SI)) + return true; + + // Try to pickup the flags, so we don't have to emit another compare. + if (foldXALUIntrinsic(CC, I, Cond)) { + // Fake request the condition to force emission of the XALU intrinsic. + unsigned CondReg = getRegForValue(Cond); + if (!CondReg) + return false; + } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() && + isValueAvailable(Cond)) { + const auto *Cmp = cast<CmpInst>(Cond); + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp); + const Value *FoldSelect = nullptr; + switch (Predicate) { + default: + break; + case CmpInst::FCMP_FALSE: + FoldSelect = SI->getFalseValue(); + break; + case CmpInst::FCMP_TRUE: + FoldSelect = SI->getTrueValue(); + break; + } + + if (FoldSelect) { + unsigned SrcReg = getRegForValue(FoldSelect); + if (!SrcReg) + return false; + unsigned UseReg = lookUpRegForValue(SI); + if (UseReg) + MRI.clearKillFlags(UseReg); + + updateValueMap(I, SrcReg); + return true; + } + + // Emit the cmp. + if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned())) + return false; + + // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction. + CC = getCompareCC(Predicate); + switch (Predicate) { + default: + break; + case CmpInst::FCMP_UEQ: + ExtraCC = AArch64CC::EQ; + CC = AArch64CC::VS; + break; + case CmpInst::FCMP_ONE: + ExtraCC = AArch64CC::MI; + CC = AArch64CC::GT; + break; + } + assert((CC != AArch64CC::AL) && "Unexpected condition code."); + } else { + unsigned CondReg = getRegForValue(Cond); + if (!CondReg) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + // Emit a TST instruction (ANDS wzr, reg, #imm). 
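The optimizeSelect fast path earlier in this hunk rewrites i1 selects with a constant arm into a single logical instruction (ORR, AND, BIC, or EOR-then-ORR). The boolean identities it relies on can be checked exhaustively; a short standalone verification, purely illustrative:

#include <cstdio>

// Exhaustive check of the i1 select identities used by the fast path above.
int main() {
  for (int c = 0; c <= 1; ++c)
    for (int v = 0; v <= 1; ++v) {
      bool ok = ((c ? 1 : v) == (c | v)) &&          // select c, 1, v  ->  ORR c, v
                ((c ? 0 : v) == (v & ~c & 1)) &&     // select c, 0, v  ->  BIC v, c
                ((c ? v : 1) == ((c ^ 1) | v)) &&    // select c, v, 1  ->  EOR c,1; ORR
                ((c ? v : 0) == (c & v));            // select c, v, 0  ->  AND c, v
      if (!ok) { std::puts("mismatch"); return 1; }
    }
  std::puts("all i1 select identities hold");
}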
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDSWri), + AArch64::WZR) + .addReg(CondReg, getKillRegState(CondIsKill)) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + } + + unsigned Src1Reg = getRegForValue(SI->getTrueValue()); + bool Src1IsKill = hasTrivialKill(SI->getTrueValue()); + + unsigned Src2Reg = getRegForValue(SI->getFalseValue()); + bool Src2IsKill = hasTrivialKill(SI->getFalseValue()); + + if (!Src1Reg || !Src2Reg) + return false; + + if (ExtraCC != AArch64CC::AL) { + Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, + Src2IsKill, ExtraCC); + Src2IsKill = true; + } + unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, + Src2IsKill, CC); + updateValueMap(I, ResultReg); return true; } -bool AArch64FastISel::SelectFPExt(const Instruction *I) { +bool AArch64FastISel::selectFPExt(const Instruction *I) { Value *V = I->getOperand(0); if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) return false; @@ -1094,11 +2714,11 @@ bool AArch64FastISel::SelectFPExt(const Instruction *I) { unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr), ResultReg).addReg(Op); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } -bool AArch64FastISel::SelectFPTrunc(const Instruction *I) { +bool AArch64FastISel::selectFPTrunc(const Instruction *I) { Value *V = I->getOperand(0); if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) return false; @@ -1110,12 +2730,12 @@ bool AArch64FastISel::SelectFPTrunc(const Instruction *I) { unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr), ResultReg).addReg(Op); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } // FPToUI and FPToSI -bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { +bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) { MVT DestVT; if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) return false; @@ -1144,11 +2764,11 @@ bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(SrcReg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } -bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { +bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { MVT DestVT; if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) return false; @@ -1156,22 +2776,21 @@ bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { "Unexpected value type."); unsigned SrcReg = getRegForValue(I->getOperand(0)); - if (SrcReg == 0) + if (!SrcReg) return false; + bool SrcIsKill = hasTrivialKill(I->getOperand(0)); EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); // Handle sign-extension. if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { SrcReg = - EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); - if (SrcReg == 0) + emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); + if (!SrcReg) return false; + SrcIsKill = true; } - MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? 
&AArch64::GPR64RegClass - : &AArch64::GPR32RegClass); - unsigned Opc; if (SrcVT == MVT::i64) { if (Signed) @@ -1185,21 +2804,128 @@ bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg); - UpdateValueMap(I, ResultReg); + unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg, + SrcIsKill); + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::fastLowerArguments() { + if (!FuncInfo.CanLowerReturn) + return false; + + const Function *F = FuncInfo.Fn; + if (F->isVarArg()) + return false; + + CallingConv::ID CC = F->getCallingConv(); + if (CC != CallingConv::C) + return false; + + // Only handle simple cases of up to 8 GPR and FPR each. + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. + ++Idx; + if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || + F->getAttributes().hasAttribute(Idx, Attribute::InReg) || + F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::Nest)) + return false; + + Type *ArgTy = Arg.getType(); + if (ArgTy->isStructTy() || ArgTy->isArrayTy()) + return false; + + EVT ArgVT = TLI.getValueType(ArgTy); + if (!ArgVT.isSimple()) + return false; + + MVT VT = ArgVT.getSimpleVT().SimpleTy; + if (VT.isFloatingPoint() && !Subtarget->hasFPARMv8()) + return false; + + if (VT.isVector() && + (!Subtarget->hasNEON() || !Subtarget->isLittleEndian())) + return false; + + if (VT >= MVT::i1 && VT <= MVT::i64) + ++GPRCnt; + else if ((VT >= MVT::f16 && VT <= MVT::f64) || VT.is64BitVector() || + VT.is128BitVector()) + ++FPRCnt; + else + return false; + + if (GPRCnt > 8 || FPRCnt > 8) + return false; + } + + static const MCPhysReg Registers[6][8] = { + { AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4, + AArch64::W5, AArch64::W6, AArch64::W7 }, + { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, + AArch64::X5, AArch64::X6, AArch64::X7 }, + { AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4, + AArch64::H5, AArch64::H6, AArch64::H7 }, + { AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4, + AArch64::S5, AArch64::S6, AArch64::S7 }, + { AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4, + AArch64::D5, AArch64::D6, AArch64::D7 }, + { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7 } + }; + + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(Arg.getType()); + unsigned SrcReg; + const TargetRegisterClass *RC; + if (VT >= MVT::i1 && VT <= MVT::i32) { + SrcReg = Registers[0][GPRIdx++]; + RC = &AArch64::GPR32RegClass; + VT = MVT::i32; + } else if (VT == MVT::i64) { + SrcReg = Registers[1][GPRIdx++]; + RC = &AArch64::GPR64RegClass; + } else if (VT == MVT::f16) { + SrcReg = Registers[2][FPRIdx++]; + RC = &AArch64::FPR16RegClass; + } else if (VT == MVT::f32) { + SrcReg = Registers[3][FPRIdx++]; + RC = &AArch64::FPR32RegClass; + } else if ((VT == MVT::f64) || VT.is64BitVector()) { + SrcReg = Registers[4][FPRIdx++]; + RC = &AArch64::FPR64RegClass; + } else if (VT.is128BitVector()) { + SrcReg = Registers[5][FPRIdx++]; + RC = &AArch64::FPR128RegClass; + } else + 
llvm_unreachable("Unexpected value type."); + + unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. + // Without this, EmitLiveInCopies may eliminate the livein if its only + // use is a bitcast (which isn't turned into an instruction). + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + updateValueMap(&Arg, ResultReg); + } return true; } -bool AArch64FastISel::ProcessCallArgs( - SmallVectorImpl<Value *> &Args, SmallVectorImpl<unsigned> &ArgRegs, - SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, - SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC, - unsigned &NumBytes) { +bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, + SmallVectorImpl<MVT> &OutVTs, + unsigned &NumBytes) { + CallingConv::ID CC = CLI.CallConv; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); + CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC)); // Get a count of how many bytes are to be pushed on the stack. NumBytes = CCInfo.getNextStackOffset(); @@ -1207,13 +2933,17 @@ bool AArch64FastISel::ProcessCallArgs( // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); + .addImm(NumBytes); // Process the args. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - unsigned Arg = ArgRegs[VA.getValNo()]; - MVT ArgVT = ArgVTs[VA.getValNo()]; + const Value *ArgVal = CLI.OutVals[VA.getValNo()]; + MVT ArgVT = OutVTs[VA.getValNo()]; + + unsigned ArgReg = getRegForValue(ArgVal); + if (!ArgReg) + return false; // Handle arg promotion: SExt, ZExt, AExt. switch (VA.getLocInfo()) { @@ -1222,8 +2952,8 @@ bool AArch64FastISel::ProcessCallArgs( case CCValAssign::SExt: { MVT DestVT = VA.getLocVT(); MVT SrcVT = ArgVT; - Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); - if (Arg == 0) + ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false); + if (!ArgReg) return false; break; } @@ -1232,8 +2962,8 @@ bool AArch64FastISel::ProcessCallArgs( case CCValAssign::ZExt: { MVT DestVT = VA.getLocVT(); MVT SrcVT = ArgVT; - Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); - if (Arg == 0) + ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true); + if (!ArgReg) return false; break; } @@ -1244,14 +2974,18 @@ bool AArch64FastISel::ProcessCallArgs( // Now copy/store arg to correct locations. if (VA.isRegLoc() && !VA.needsCustom()) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg); - RegArgs.push_back(VA.getLocReg()); + TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); + CLI.OutRegs.push_back(VA.getLocReg()); } else if (VA.needsCustom()) { // FIXME: Handle custom args. return false; } else { assert(VA.isMemLoc() && "Assuming store on stack."); + // Don't emit stores for undef values. + if (isa<UndefValue>(ArgVal)) + continue; + // Need to store on the stack. 
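
The store size used just below is simply the value's bit width rounded up to whole bytes; a self-contained sketch of that arithmetic (illustrative only):

    #include <cassert>

    // Bytes occupied by a stack-passed value of the given bit width.
    static unsigned stackStoreBytes(unsigned SizeInBits) {
      return (SizeInBits + 7) / 8; // 1 -> 1, 16 -> 2, 33 -> 5, 64 -> 8
    }

    int main() {
      assert(stackStoreBytes(1) == 1 && stackStoreBytes(16) == 2 &&
             stackStoreBytes(64) == 8);
    }
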
unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8; @@ -1264,26 +2998,31 @@ bool AArch64FastISel::ProcessCallArgs( Addr.setReg(AArch64::SP); Addr.setOffset(VA.getLocMemOffset() + BEAlign); - if (!EmitStore(ArgVT, Arg, Addr)) + unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getStack(Addr.getOffset()), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); + + if (!emitStore(ArgVT, ArgReg, Addr, MMO)) return false; } } return true; } -bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, - const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes) { +bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, + unsigned NumBytes) { + CallingConv::ID CC = CLI.CallConv; + // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) - .addImm(NumBytes) - .addImm(0); + .addImm(NumBytes).addImm(0); // Now the return value. if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); + CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC)); // Only handle a single return value. @@ -1294,147 +3033,147 @@ bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, MVT CopyVT = RVLocs[0].getValVT(); unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(RVLocs[0].getLocReg()); - UsedRegs.push_back(RVLocs[0].getLocReg()); + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(RVLocs[0].getLocReg()); + CLI.InRegs.push_back(RVLocs[0].getLocReg()); - // Finally update the result. - UpdateValueMap(I, ResultReg); + CLI.ResultReg = ResultReg; + CLI.NumResultRegs = 1; } return true; } -bool AArch64FastISel::SelectCall(const Instruction *I, - const char *IntrMemName = nullptr) { - const CallInst *CI = cast<CallInst>(I); - const Value *Callee = CI->getCalledValue(); +bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { + CallingConv::ID CC = CLI.CallConv; + bool IsTailCall = CLI.IsTailCall; + bool IsVarArg = CLI.IsVarArg; + const Value *Callee = CLI.Callee; + const char *SymName = CLI.SymName; - // Don't handle inline asm or intrinsics. - if (isa<InlineAsm>(Callee)) + if (!Callee && !SymName) return false; - // Only handle global variable Callees. - const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); - if (!GV) + // Allow SelectionDAG isel to handle tail calls. + if (IsTailCall) return false; - // Check the calling convention. - ImmutableCallSite CS(CI); - CallingConv::ID CC = CS.getCallingConv(); + CodeModel::Model CM = TM.getCodeModel(); + // Only support the small and large code model. + if (CM != CodeModel::Small && CM != CodeModel::Large) + return false; + + // FIXME: Add large code model support for ELF. + if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) + return false; // Let SDISel handle vararg functions. - PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); - FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - if (FTy->isVarArg()) + if (IsVarArg) return false; - // Handle *simple* calls for now. + // FIXME: Only handle *simple* calls for now. 
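
Condensed into one place, the checks above amount to the following predicate; a rough sketch assuming these are the only gates (names are hypothetical, not the real interface):

    enum class CodeModel { Small, Large, Other };

    // True when this fast call-lowering path is even attempted.
    static bool fastCallPathApplies(bool IsTailCall, bool IsVarArg,
                                    CodeModel CM, bool IsTargetMachO) {
      if (IsTailCall)
        return false;                 // leave tail calls to SelectionDAG
      if (CM != CodeModel::Small && CM != CodeModel::Large)
        return false;                 // other code models unsupported
      if (CM == CodeModel::Large && !IsTargetMachO)
        return false;                 // no large-code-model ELF support yet
      if (IsVarArg)
        return false;                 // varargs also go to SelectionDAG
      return true;
    }
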
MVT RetVT; - Type *RetTy = I->getType(); - if (RetTy->isVoidTy()) + if (CLI.RetTy->isVoidTy()) RetVT = MVT::isVoid; - else if (!isTypeLegal(RetTy, RetVT)) + else if (!isTypeLegal(CLI.RetTy, RetVT)) return false; - // Set up the argument vectors. - SmallVector<Value *, 8> Args; - SmallVector<unsigned, 8> ArgRegs; - SmallVector<MVT, 8> ArgVTs; - SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; - Args.reserve(CS.arg_size()); - ArgRegs.reserve(CS.arg_size()); - ArgVTs.reserve(CS.arg_size()); - ArgFlags.reserve(CS.arg_size()); - - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); - i != e; ++i) { - // If we're lowering a memory intrinsic instead of a regular call, skip the - // last two arguments, which shouldn't be passed to the underlying function. - if (IntrMemName && e - i <= 2) - break; - - unsigned Arg = getRegForValue(*i); - if (Arg == 0) + for (auto Flag : CLI.OutFlags) + if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal()) return false; - ISD::ArgFlagsTy Flags; - unsigned AttrInd = i - CS.arg_begin() + 1; - if (CS.paramHasAttr(AttrInd, Attribute::SExt)) - Flags.setSExt(); - if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) - Flags.setZExt(); - - // FIXME: Only handle *easy* calls for now. - if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::StructRet) || - CS.paramHasAttr(AttrInd, Attribute::Nest) || - CS.paramHasAttr(AttrInd, Attribute::ByVal)) - return false; + // Set up the argument vectors. + SmallVector<MVT, 16> OutVTs; + OutVTs.reserve(CLI.OutVals.size()); - MVT ArgVT; - Type *ArgTy = (*i)->getType(); - if (!isTypeLegal(ArgTy, ArgVT) && - !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16)) + for (auto *Val : CLI.OutVals) { + MVT VT; + if (!isTypeLegal(Val->getType(), VT) && + !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) return false; // We don't handle vector parameters yet. - if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) + if (VT.isVector() || VT.getSizeInBits() > 64) return false; - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); - - Args.push_back(*i); - ArgRegs.push_back(Arg); - ArgVTs.push_back(ArgVT); - ArgFlags.push_back(Flags); + OutVTs.push_back(VT); } + Address Addr; + if (Callee && !computeCallAddress(Callee, Addr)) + return false; + // Handle the arguments now that we've gotten them. - SmallVector<unsigned, 4> RegArgs; unsigned NumBytes; - if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + if (!processCallArgs(CLI, OutVTs, NumBytes)) return false; // Issue the call. MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL)); - if (!IntrMemName) - MIB.addGlobalAddress(GV, 0, 0); - else - MIB.addExternalSymbol(IntrMemName, 0); + if (CM == CodeModel::Small) { + const MCInstrDesc &II = TII.get(Addr.getReg() ? 
AArch64::BLR : AArch64::BL); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II); + if (SymName) + MIB.addExternalSymbol(SymName, 0); + else if (Addr.getGlobalValue()) + MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0); + else if (Addr.getReg()) { + unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0); + MIB.addReg(Reg); + } else + return false; + } else { + unsigned CallReg = 0; + if (SymName) { + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg) + .addExternalSymbol(SymName, AArch64II::MO_GOT | AArch64II::MO_PAGE); + + CallReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + CallReg) + .addReg(ADRPReg) + .addExternalSymbol(SymName, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } else if (Addr.getGlobalValue()) + CallReg = materializeGV(Addr.getGlobalValue()); + else if (Addr.getReg()) + CallReg = Addr.getReg(); + + if (!CallReg) + return false; + + const MCInstrDesc &II = TII.get(AArch64::BLR); + CallReg = constrainOperandRegClass(II, CallReg, 0); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg); + } // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (auto Reg : CLI.OutRegs) + MIB.addReg(Reg, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); - - // Finish off the call including any return values. - SmallVector<unsigned, 4> UsedRegs; - if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) - return false; + MIB.addRegMask(TRI.getCallPreservedMask(CC)); - // Set all unused physreg defs as dead. - static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + CLI.Call = MIB; - return true; + // Finish off the call including any return values. + return finishCall(CLI, RetVT, NumBytes); } -bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) { +bool AArch64FastISel::isMemCpySmall(uint64_t Len, unsigned Alignment) { if (Alignment) return Len / Alignment <= 4; else return Len < 32; } -bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, +bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, unsigned Alignment) { // Make sure we don't bloat code by inlining very large memcpy's. - if (!IsMemCpySmall(Len, Alignment)) + if (!isMemCpySmall(Len, Alignment)) return false; int64_t UnscaledOffset = 0; @@ -1464,14 +3203,11 @@ bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, } } - bool RV; - unsigned ResultReg; - RV = EmitLoad(VT, ResultReg, Src); - if (!RV) + unsigned ResultReg = emitLoad(VT, VT, Src); + if (!ResultReg) return false; - RV = EmitStore(VT, ResultReg, Dest); - if (!RV) + if (!emitStore(VT, ResultReg, Dest)) return false; int64_t Size = VT.getSizeInBits() / 8; @@ -1486,73 +3222,430 @@ bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, return true; } -bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { - // FIXME: Handle more intrinsics. - switch (I.getIntrinsicID()) { +/// \brief Check if it is possible to fold the condition from the XALU intrinsic +/// into the user. The condition code will only be updated on success. 
+bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, + const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + + // Canonicalize immediate to the RHS. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + // Simplify multiplies. + unsigned IID = II->getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::smul_with_overflow: + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 2) + IID = Intrinsic::sadd_with_overflow; + break; + case Intrinsic::umul_with_overflow: + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 2) + IID = Intrinsic::uadd_with_overflow; + break; + } + + AArch64CC::CondCode TmpCC; + switch (IID) { default: return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + TmpCC = AArch64CC::VS; + break; + case Intrinsic::uadd_with_overflow: + TmpCC = AArch64CC::HS; + break; + case Intrinsic::usub_with_overflow: + TmpCC = AArch64CC::LO; + break; + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + TmpCC = AArch64CC::NE; + break; + } + + // Check if both instructions are in the same basic block. + if (!isValueAvailable(II)) + return false; + + // Make sure nothing is in the way + BasicBlock::const_iterator Start = I; + BasicBlock::const_iterator End = II; + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + +bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { + // FIXME: Handle more intrinsics. + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::frameaddress: { + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const AArch64RegisterInfo *RegInfo = + static_cast<const AArch64RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); + unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr); + // Recursively load frame address + // ldr x0, [fp] + // ldr x0, [x0] + // ldr x0, [x0] + // ... 
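
A standalone model of the loop that follows, assuming the usual AArch64 frame record where the caller's frame pointer sits at offset 0 (illustrative only, not something to run against arbitrary frames):

    #include <cstdint>

    // Each iteration corresponds to one emitted "ldr x0, [x0]".
    static uintptr_t frameAddressAtDepth(uintptr_t FP, unsigned Depth) {
      while (Depth--)
        FP = *reinterpret_cast<const uintptr_t *>(FP); // follow saved-FP chain
      return FP; // Depth == 0 returns the current frame pointer itself
    }
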
+ unsigned DestReg; + unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass, + SrcReg, /*IsKill=*/true, 0); + assert(DestReg && "Unexpected LDR instruction emission failure."); + SrcReg = DestReg; + } + + updateValueMap(II, SrcReg); + return true; + } case Intrinsic::memcpy: case Intrinsic::memmove: { - const MemTransferInst &MTI = cast<MemTransferInst>(I); + const auto *MTI = cast<MemTransferInst>(II); // Don't handle volatile. - if (MTI.isVolatile()) + if (MTI->isVolatile()) return false; // Disable inlining for memmove before calls to ComputeAddress. Otherwise, // we would emit dead code because we don't currently handle memmoves. - bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy); - if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) { + bool IsMemCpy = (II->getIntrinsicID() == Intrinsic::memcpy); + if (isa<ConstantInt>(MTI->getLength()) && IsMemCpy) { // Small memcpy's are common enough that we want to do them without a call // if possible. - uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue(); - unsigned Alignment = MTI.getAlignment(); - if (IsMemCpySmall(Len, Alignment)) { + uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue(); + unsigned Alignment = MTI->getAlignment(); + if (isMemCpySmall(Len, Alignment)) { Address Dest, Src; - if (!ComputeAddress(MTI.getRawDest(), Dest) || - !ComputeAddress(MTI.getRawSource(), Src)) + if (!computeAddress(MTI->getRawDest(), Dest) || + !computeAddress(MTI->getRawSource(), Src)) return false; - if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment)) + if (tryEmitSmallMemCpy(Dest, Src, Len, Alignment)) return true; } } - if (!MTI.getLength()->getType()->isIntegerTy(64)) + if (!MTI->getLength()->getType()->isIntegerTy(64)) return false; - if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255) + if (MTI->getSourceAddressSpace() > 255 || MTI->getDestAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. return false; - const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove"; - return SelectCall(&I, IntrMemName); + const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove"; + return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2); } case Intrinsic::memset: { - const MemSetInst &MSI = cast<MemSetInst>(I); + const MemSetInst *MSI = cast<MemSetInst>(II); // Don't handle volatile. - if (MSI.isVolatile()) + if (MSI->isVolatile()) return false; - if (!MSI.getLength()->getType()->isIntegerTy(64)) + if (!MSI->getLength()->getType()->isIntegerTy(64)) return false; - if (MSI.getDestAddressSpace() > 255) + if (MSI->getDestAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. 
return false; - return SelectCall(&I, "memset"); + return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); + } + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::pow: { + MVT RetVT; + if (!isTypeLegal(II->getType(), RetVT)) + return false; + + if (RetVT != MVT::f32 && RetVT != MVT::f64) + return false; + + static const RTLIB::Libcall LibCallTable[3][2] = { + { RTLIB::SIN_F32, RTLIB::SIN_F64 }, + { RTLIB::COS_F32, RTLIB::COS_F64 }, + { RTLIB::POW_F32, RTLIB::POW_F64 } + }; + RTLIB::Libcall LC; + bool Is64Bit = RetVT == MVT::f64; + switch (II->getIntrinsicID()) { + default: + llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::sin: + LC = LibCallTable[0][Is64Bit]; + break; + case Intrinsic::cos: + LC = LibCallTable[1][Is64Bit]; + break; + case Intrinsic::pow: + LC = LibCallTable[2][Is64Bit]; + break; + } + + ArgListTy Args; + Args.reserve(II->getNumArgOperands()); + + // Populate the argument list. + for (auto &Arg : II->arg_operands()) { + ArgListEntry Entry; + Entry.Val = Arg; + Entry.Ty = Arg->getType(); + Args.push_back(Entry); + } + + CallLoweringInfo CLI; + CLI.setCallee(TLI.getLibcallCallingConv(LC), II->getType(), + TLI.getLibcallName(LC), std::move(Args)); + if (!lowerCallTo(CLI)) + return false; + updateValueMap(II, CLI.ResultReg); + return true; + } + case Intrinsic::fabs: { + MVT VT; + if (!isTypeLegal(II->getType(), VT)) + return false; + + unsigned Opc; + switch (VT.SimpleTy) { + default: + return false; + case MVT::f32: + Opc = AArch64::FABSSr; + break; + case MVT::f64: + Opc = AArch64::FABSDr; + break; + } + unsigned SrcReg = getRegForValue(II->getOperand(0)); + if (!SrcReg) + return false; + bool SrcRegIsKill = hasTrivialKill(II->getOperand(0)); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg, getKillRegState(SrcRegIsKill)); + updateValueMap(II, ResultReg); + return true; } case Intrinsic::trap: { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) .addImm(1); return true; } + case Intrinsic::sqrt: { + Type *RetTy = II->getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Op0Reg = getRegForValue(II->getOperand(0)); + if (!Op0Reg) + return false; + bool Op0IsKill = hasTrivialKill(II->getOperand(0)); + + unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill); + if (!ResultReg) + return false; + + updateValueMap(II, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics. + const Function *Callee = II->getCalledFunction(); + auto *Ty = cast<StructType>(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + // Canonicalize immediate to the RHS. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + // Simplify multiplies. 
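
The "simplify multiplies" step below rewrites {s,u}mul.with.overflow by a constant 2 into the corresponding add-with-overflow, which is sound because x * 2 and x + x overflow in exactly the same cases; a quick standalone check (uses GCC/Clang overflow builtins, illustrative only):

    #include <cassert>
    #include <cstdint>

    static bool mulBy2MatchesSelfAdd(int32_t X) {
      int32_t M, A;
      bool MulOvf = __builtin_mul_overflow(X, 2, &M);
      bool AddOvf = __builtin_add_overflow(X, X, &A);
      return MulOvf == AddOvf && (MulOvf || M == A);
    }

    int main() {
      assert(mulBy2MatchesSelfAdd(7));
      assert(mulBy2MatchesSelfAdd(0x40000000)); // 2^30: both forms overflow
    }
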
+ unsigned IID = II->getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::smul_with_overflow: + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 2) { + IID = Intrinsic::sadd_with_overflow; + RHS = LHS; + } + break; + case Intrinsic::umul_with_overflow: + if (const auto *C = dyn_cast<ConstantInt>(RHS)) + if (C->getValue() == 2) { + IID = Intrinsic::uadd_with_overflow; + RHS = LHS; + } + break; + } + + unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0; + AArch64CC::CondCode CC = AArch64CC::Invalid; + switch (IID) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true); + CC = AArch64CC::VS; + break; + case Intrinsic::uadd_with_overflow: + ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true); + CC = AArch64CC::HS; + break; + case Intrinsic::ssub_with_overflow: + ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true); + CC = AArch64CC::VS; + break; + case Intrinsic::usub_with_overflow: + ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true); + CC = AArch64CC::LO; + break; + case Intrinsic::smul_with_overflow: { + CC = AArch64CC::NE; + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return false; + bool RHSIsKill = hasTrivialKill(RHS); + + if (VT == MVT::i32) { + MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg, + /*IsKill=*/false, 32); + MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true, + AArch64::sub_32); + ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true, + AArch64::sub_32); + emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false, + AArch64_AM::ASR, 31, /*WantResult=*/false); + } else { + assert(VT == MVT::i64 && "Unexpected value type."); + MulReg = emitMul_rr(VT, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false, + AArch64_AM::ASR, 63, /*WantResult=*/false); + } + break; + } + case Intrinsic::umul_with_overflow: { + CC = AArch64CC::NE; + unsigned LHSReg = getRegForValue(LHS); + if (!LHSReg) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + if (!RHSReg) + return false; + bool RHSIsKill = hasTrivialKill(RHS); + + if (VT == MVT::i32) { + MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg, + /*IsKill=*/false, AArch64_AM::LSR, 32, + /*WantResult=*/false); + MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true, + AArch64::sub_32); + } else { + assert(VT == MVT::i64 && "Unexpected value type."); + MulReg = emitMul_rr(VT, LHSReg, LHSIsKill, RHSReg, RHSIsKill); + unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg, + /*IsKill=*/false, /*WantResult=*/false); + } + break; + } + } + + if (MulReg) { + ResultReg1 = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg1).addReg(MulReg); + } + + ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass, + AArch64::WZR, /*IsKill=*/true, AArch64::WZR, + /*IsKill=*/true, 
getInvertedCondCode(CC)); + (void)ResultReg2; + assert((ResultReg1 + 1) == ResultReg2 && + "Nonconsecutive result registers."); + updateValueMap(II, ResultReg1, 2); + return true; + } } return false; } -bool AArch64FastISel::SelectRet(const Instruction *I) { +bool AArch64FastISel::selectRet(const Instruction *I) { const ReturnInst *Ret = cast<ReturnInst>(I); const Function &F = *I->getParent()->getParent(); @@ -1572,8 +3665,7 @@ bool AArch64FastISel::SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; - CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, - I->getContext()); + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; CCInfo.AnalyzeReturn(Outs, RetCC); @@ -1586,11 +3678,14 @@ bool AArch64FastISel::SelectRet(const Instruction *I) { const Value *RV = Ret->getOperand(0); // Don't bother handling odd stuff for now. - if (VA.getLocInfo() != CCValAssign::Full) + if ((VA.getLocInfo() != CCValAssign::Full) && + (VA.getLocInfo() != CCValAssign::BCvt)) return false; + // Only handle register returns for now. if (!VA.isRegLoc()) return false; + unsigned Reg = getRegForValue(RV); if (Reg == 0) return false; @@ -1606,12 +3701,14 @@ bool AArch64FastISel::SelectRet(const Instruction *I) { return false; // Vectors (of > 1 lane) in big endian need tricky handling. - if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1) + if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 && + !Subtarget->isLittleEndian()) return false; MVT RVVT = RVEVT.getSimpleVT(); if (RVVT == MVT::f128) return false; + MVT DestVT = VA.getValVT(); // Special handling for extended integers. if (RVVT != DestVT) { @@ -1621,8 +3718,8 @@ bool AArch64FastISel::SelectRet(const Instruction *I) { if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) return false; - bool isZExt = Outs[0].Flags.isZExt(); - SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt); + bool IsZExt = Outs[0].Flags.isZExt(); + SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt); if (SrcReg == 0) return false; } @@ -1642,7 +3739,7 @@ bool AArch64FastISel::SelectRet(const Instruction *I) { return true; } -bool AArch64FastISel::SelectTrunc(const Instruction *I) { +bool AArch64FastISel::selectTrunc(const Instruction *I) { Type *DestTy = I->getType(); Value *Op = I->getOperand(0); Type *SrcTy = Op->getType(); @@ -1667,10 +3764,14 @@ bool AArch64FastISel::SelectTrunc(const Instruction *I) { unsigned SrcReg = getRegForValue(Op); if (!SrcReg) return false; + bool SrcIsKill = hasTrivialKill(Op); // If we're truncating from i64 to a smaller non-legal type then generate an - // AND. Otherwise, we know the high bits are undefined and a truncate doesn't - // generate any code. + // AND. Otherwise, we know the high bits are undefined and a truncate only + // generate a COPY. We cannot mark the source register also as result + // register, because this can incorrectly transfer the kill flag onto the + // source register. + unsigned ResultReg; if (SrcVT == MVT::i64) { uint64_t Mask = 0; switch (DestVT.SimpleTy) { @@ -1688,23 +3789,23 @@ bool AArch64FastISel::SelectTrunc(const Instruction *I) { break; } // Issue an extract_subreg to get the lower 32-bits. 
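
The i64 path below boils down to "take the low 32 bits via sub_32, then AND down to the destination width"; a standalone model of that data flow (illustrative only; the exact mask constants sit in the elided part of the hunk):

    #include <cstdint>

    static uint32_t truncViaSubregAndMask(uint64_t Src, unsigned DestBits) {
      uint32_t Low32 = static_cast<uint32_t>(Src);            // extract_subreg sub_32
      uint32_t Mask = DestBits >= 32 ? 0xffffffffu
                                     : ((1u << DestBits) - 1); // ANDWri immediate
      return Low32 & Mask;                                     // DestBits == 8 keeps 0xff
    }
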
- unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true, + unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, AArch64::sub_32); - MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass); // Create the AND instruction which performs the actual truncation. - unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), - ANDReg) - .addReg(Reg32) - .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32)); - SrcReg = ANDReg; + ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask); + assert(ResultReg && "Unexpected AND instruction emission failure."); + } else { + ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(SrcReg, getKillRegState(SrcIsKill)); } - UpdateValueMap(I, SrcReg); + updateValueMap(I, ResultReg); return true; } -unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { +unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) { assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || DestVT == MVT::i64) && "Unexpected value type."); @@ -1712,14 +3813,9 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { if (DestVT == MVT::i8 || DestVT == MVT::i16) DestVT = MVT::i32; - if (isZExt) { - MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); - unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), - ResultReg) - .addReg(SrcReg) - .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); - + if (IsZExt) { + unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1); + assert(ResultReg && "Unexpected AND instruction emission failure."); if (DestVT == MVT::i64) { // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. @@ -1737,18 +3833,389 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { // FIXME: We're SExt i1 to i64. return 0; } - unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri), - ResultReg) - .addReg(SrcReg) + return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg, + /*TODO:IsKill=*/false, 0, 0); + } +} + +unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + unsigned Opc, ZReg; + switch (RetVT.SimpleTy) { + default: return 0; + case MVT::i8: + case MVT::i16: + case MVT::i32: + RetVT = MVT::i32; + Opc = AArch64::MADDWrrr; ZReg = AArch64::WZR; break; + case MVT::i64: + Opc = AArch64::MADDXrrr; ZReg = AArch64::XZR; break; + } + + const TargetRegisterClass *RC = + (RetVT == MVT::i64) ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill, + /*IsKill=*/ZReg, true); +} + +unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + if (RetVT != MVT::i64) + return 0; + + return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass, + Op0, Op0IsKill, Op1, Op1IsKill, + AArch64::XZR, /*IsKill=*/true); +} + +unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + if (RetVT != MVT::i64) + return 0; + + return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass, + Op0, Op0IsKill, Op1, Op1IsKill, + AArch64::XZR, /*IsKill=*/true); +} + +unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill) { + unsigned Opc = 0; + bool NeedTrunc = false; + uint64_t Mask = 0; + switch (RetVT.SimpleTy) { + default: return 0; + case MVT::i8: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xff; break; + case MVT::i16: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xffff; break; + case MVT::i32: Opc = AArch64::LSLVWr; break; + case MVT::i64: Opc = AArch64::LSLVXr; break; + } + + const TargetRegisterClass *RC = + (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + if (NeedTrunc) { + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); + Op1IsKill = true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, + Op1IsKill); + if (NeedTrunc) + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + return ResultReg; +} + +unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0, + bool Op0IsKill, uint64_t Shift, + bool IsZExt) { + assert(RetVT.SimpleTy >= SrcVT.SimpleTy && + "Unexpected source/return type pair."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); + assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || + RetVT == MVT::i64) && "Unexpected return value type."); + + bool Is64Bit = (RetVT == MVT::i64); + unsigned RegSize = Is64Bit ? 64 : 32; + unsigned DstBits = RetVT.getSizeInBits(); + unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. + if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } + + // Don't deal with undefined shifts. + if (Shift >= DstBits) + return 0; + + // For immediate shifts we can fold the zero-/sign-extension into the shift. 
+ // {S|U}BFM Wd, Wn, #r, #s + // Wd<32+s-r,32-r> = Wn<s:0> when r > s + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = shl i16 %1, 4 + // Wd<32+7-28,32-28> = Wn<7:0> <- clamp s to 7 + // 0b1111_1111_1111_1111__1111_1010_1010_0000 sext + // 0b0000_0000_0000_0000__0000_0101_0101_0000 sext | zext + // 0b0000_0000_0000_0000__0000_1010_1010_0000 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = shl i16 %1, 8 + // Wd<32+7-24,32-24> = Wn<7:0> + // 0b1111_1111_1111_1111__1010_1010_0000_0000 sext + // 0b0000_0000_0000_0000__0101_0101_0000_0000 sext | zext + // 0b0000_0000_0000_0000__1010_1010_0000_0000 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = shl i16 %1, 12 + // Wd<32+3-20,32-20> = Wn<3:0> + // 0b1111_1111_1111_1111__1010_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0101_0000_0000_0000 sext | zext + // 0b0000_0000_0000_0000__1010_0000_0000_0000 zext + + unsigned ImmR = RegSize - Shift; + // Limit the width to the length of the source type. + unsigned ImmS = std::min<unsigned>(SrcBits - 1, DstBits - 1 - Shift); + static const unsigned OpcTable[2][2] = { + {AArch64::SBFMWri, AArch64::SBFMXri}, + {AArch64::UBFMWri, AArch64::UBFMXri} + }; + unsigned Opc = OpcTable[IsZExt][Is64Bit]; + if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { + unsigned TmpReg = MRI.createVirtualRegister(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), TmpReg) .addImm(0) - .addImm(0); - return ResultReg; + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(AArch64::sub_32); + Op0 = TmpReg; + Op0IsKill = true; } + return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); } -unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, - bool isZExt) { +unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill) { + unsigned Opc = 0; + bool NeedTrunc = false; + uint64_t Mask = 0; + switch (RetVT.SimpleTy) { + default: return 0; + case MVT::i8: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xff; break; + case MVT::i16: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xffff; break; + case MVT::i32: Opc = AArch64::LSRVWr; break; + case MVT::i64: Opc = AArch64::LSRVXr; break; + } + + const TargetRegisterClass *RC = + (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + if (NeedTrunc) { + Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask); + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); + Op0IsKill = Op1IsKill = true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, + Op1IsKill); + if (NeedTrunc) + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + return ResultReg; +} + +unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, + bool Op0IsKill, uint64_t Shift, + bool IsZExt) { + assert(RetVT.SimpleTy >= SrcVT.SimpleTy && + "Unexpected source/return type pair."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); + assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || + RetVT == MVT::i64) && "Unexpected return value type."); + + bool Is64Bit = (RetVT == MVT::i64); + unsigned RegSize = Is64Bit ? 64 : 32; + unsigned DstBits = RetVT.getSizeInBits(); + unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. + if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } + + // Don't deal with undefined shifts. + if (Shift >= DstBits) + return 0; + + // For immediate shifts we can fold the zero-/sign-extension into the shift. + // {S|U}BFM Wd, Wn, #r, #s + // Wd<s-r:0> = Wn<s:r> when r <= s + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = lshr i16 %1, 4 + // Wd<7-4:0> = Wn<7:4> + // 0b0000_0000_0000_0000__0000_1111_1111_1010 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext + // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = lshr i16 %1, 8 + // Wd<7-7,0> = Wn<7:7> + // 0b0000_0000_0000_0000__0000_0000_1111_1111 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = lshr i16 %1, 12 + // Wd<7-7,0> = Wn<7:7> <- clamp r to 7 + // 0b0000_0000_0000_0000__0000_0000_0000_1111 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext + + if (Shift >= SrcBits && IsZExt) + return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT); + + // It is not possible to fold a sign-extend into the LShr instruction. In this + // case emit a sign-extend. + if (!IsZExt) { + Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt); + if (!Op0) + return 0; + Op0IsKill = true; + SrcVT = RetVT; + SrcBits = SrcVT.getSizeInBits(); + IsZExt = true; + } + + unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift); + unsigned ImmS = SrcBits - 1; + static const unsigned OpcTable[2][2] = { + {AArch64::SBFMWri, AArch64::SBFMXri}, + {AArch64::UBFMWri, AArch64::UBFMXri} + }; + unsigned Opc = OpcTable[IsZExt][Is64Bit]; + if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { + unsigned TmpReg = MRI.createVirtualRegister(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), TmpReg) + .addImm(0) + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(AArch64::sub_32); + Op0 = TmpReg; + Op0IsKill = true; + } + return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); +} + +unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, + unsigned Op1Reg, bool Op1IsKill) { + unsigned Opc = 0; + bool NeedTrunc = false; + uint64_t Mask = 0; + switch (RetVT.SimpleTy) { + default: return 0; + case MVT::i8: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xff; break; + case MVT::i16: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xffff; break; + case MVT::i32: Opc = AArch64::ASRVWr; break; + case MVT::i64: Opc = AArch64::ASRVXr; break; + } + + const TargetRegisterClass *RC = + (RetVT == MVT::i64) ? 
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + if (NeedTrunc) { + Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false); + Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask); + Op0IsKill = Op1IsKill = true; + } + unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, + Op1IsKill); + if (NeedTrunc) + ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); + return ResultReg; +} + +unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0, + bool Op0IsKill, uint64_t Shift, + bool IsZExt) { + assert(RetVT.SimpleTy >= SrcVT.SimpleTy && + "Unexpected source/return type pair."); + assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 || + SrcVT == MVT::i32 || SrcVT == MVT::i64) && + "Unexpected source value type."); + assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 || + RetVT == MVT::i64) && "Unexpected return value type."); + + bool Is64Bit = (RetVT == MVT::i64); + unsigned RegSize = Is64Bit ? 64 : 32; + unsigned DstBits = RetVT.getSizeInBits(); + unsigned SrcBits = SrcVT.getSizeInBits(); + const TargetRegisterClass *RC = + Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + + // Just emit a copy for "zero" shifts. + if (Shift == 0) { + if (RetVT == SrcVT) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(Op0, getKillRegState(Op0IsKill)); + return ResultReg; + } else + return emitIntExt(SrcVT, Op0, RetVT, IsZExt); + } + + // Don't deal with undefined shifts. + if (Shift >= DstBits) + return 0; + + // For immediate shifts we can fold the zero-/sign-extension into the shift. + // {S|U}BFM Wd, Wn, #r, #s + // Wd<s-r:0> = Wn<s:r> when r <= s + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = ashr i16 %1, 4 + // Wd<7-4:0> = Wn<7:4> + // 0b1111_1111_1111_1111__1111_1111_1111_1010 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext + // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = ashr i16 %1, 8 + // Wd<7-7,0> = Wn<7:7> + // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext + + // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16 + // %2 = ashr i16 %1, 12 + // Wd<7-7,0> = Wn<7:7> <- clamp r to 7 + // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext + // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext + + if (Shift >= SrcBits && IsZExt) + return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT); + + unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift); + unsigned ImmS = SrcBits - 1; + static const unsigned OpcTable[2][2] = { + {AArch64::SBFMWri, AArch64::SBFMXri}, + {AArch64::UBFMWri, AArch64::UBFMXri} + }; + unsigned Opc = OpcTable[IsZExt][Is64Bit]; + if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) { + unsigned TmpReg = MRI.createVirtualRegister(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), TmpReg) + .addImm(0) + .addReg(Op0, getKillRegState(Op0IsKill)) + .addImm(AArch64::sub_32); + Op0 = TmpReg; + Op0IsKill = true; + } + return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS); +} + +unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + bool IsZExt) { assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); // FastISel does not have plumbing to deal 
with extensions where the SrcVT or @@ -1768,24 +4235,24 @@ unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, default: return 0; case MVT::i1: - return Emiti1Ext(SrcReg, DestVT, isZExt); + return emiti1Ext(SrcReg, DestVT, IsZExt); case MVT::i8: if (DestVT == MVT::i64) - Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri; else - Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri; Imm = 7; break; case MVT::i16: if (DestVT == MVT::i64) - Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri; else - Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri; Imm = 15; break; case MVT::i32: assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?"); - Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri; Imm = 31; break; } @@ -1803,45 +4270,167 @@ unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, SrcReg = Src64; } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg) - .addImm(0) - .addImm(Imm); + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm); +} - return ResultReg; +static bool isZExtLoad(const MachineInstr *LI) { + switch (LI->getOpcode()) { + default: + return false; + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRBBroX: + case AArch64::LDRHHroX: + case AArch64::LDRWroX: + case AArch64::LDRBBroW: + case AArch64::LDRHHroW: + case AArch64::LDRWroW: + return true; + } } -bool AArch64FastISel::SelectIntExt(const Instruction *I) { - // On ARM, in general, integer casts don't involve legal types; this code - // handles promotable integers. The high bits for a type smaller than - // the register size are assumed to be undefined. - Type *DestTy = I->getType(); - Value *Src = I->getOperand(0); - Type *SrcTy = Src->getType(); +static bool isSExtLoad(const MachineInstr *LI) { + switch (LI->getOpcode()) { + default: + return false; + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDRSBWui: + case AArch64::LDRSHWui: + case AArch64::LDRSBXui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::LDRSBWroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + return true; + } +} - bool isZExt = isa<ZExtInst>(I); - unsigned SrcReg = getRegForValue(Src); - if (!SrcReg) +bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT, + MVT SrcVT) { + const auto *LI = dyn_cast<LoadInst>(I->getOperand(0)); + if (!LI || !LI->hasOneUse()) return false; - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); - if (!SrcEVT.isSimple()) + // Check if the load instruction has already been selected. 
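
The opcode lists above exist because AArch64's narrow loads already produce the extended value: LDRB/LDRH zero-fill the 32-bit destination and LDRSB/LDRSH/LDRSW sign-extend, so an explicit zext/sext of the freshly loaded value is redundant. A standalone model of that equivalence (plain C++, illustrative only):

    #include <cassert>
    #include <cstdint>

    // "ldrb w0, [p]" already yields the zero-extended value...
    static uint32_t loadU8(const uint8_t *P) { return *P; }
    // ...and "ldrsb w0, [p]" the sign-extended one.
    static int32_t loadS8(const int8_t *P) { return *P; }

    int main() {
      uint8_t U = 0xff;
      int8_t S = -1;
      assert(loadU8(&U) == 0xffu); // matches zext i8 %v to i32
      assert(loadS8(&S) == -1);    // matches sext i8 %v to i32
    }
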
+ unsigned Reg = lookUpRegForValue(LI); + if (!Reg) return false; - if (!DestEVT.isSimple()) + + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (!MI) return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - MVT DestVT = DestEVT.getSimpleVT(); - unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt); - if (ResultReg == 0) + // Check if the correct load instruction has been emitted - SelectionDAG might + // have emitted a zero-extending load, but we need a sign-extending load. + bool IsZExt = isa<ZExtInst>(I); + const auto *LoadMI = MI; + if (LoadMI->getOpcode() == TargetOpcode::COPY && + LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) { + unsigned LoadReg = MI->getOperand(1).getReg(); + LoadMI = MRI.getUniqueVRegDef(LoadReg); + assert(LoadMI && "Expected valid instruction"); + } + if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI))) + return false; + + // Nothing to be done. + if (RetVT != MVT::i64 || SrcVT > MVT::i32) { + updateValueMap(I, Reg); + return true; + } + + if (IsZExt) { + unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(Reg, getKillRegState(true)) + .addImm(AArch64::sub_32); + Reg = Reg64; + } else { + assert((MI->getOpcode() == TargetOpcode::COPY && + MI->getOperand(1).getSubReg() == AArch64::sub_32) && + "Expected copy instruction"); + Reg = MI->getOperand(1).getReg(); + MI->eraseFromParent(); + } + updateValueMap(I, Reg); + return true; +} + +bool AArch64FastISel::selectIntExt(const Instruction *I) { + assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) && + "Unexpected integer extend instruction."); + MVT RetVT; + MVT SrcVT; + if (!isTypeSupported(I->getType(), RetVT)) + return false; + + if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT)) + return false; + + // Try to optimize already sign-/zero-extended values from load instructions. + if (optimizeIntExtLoad(I, RetVT, SrcVT)) + return true; + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (!SrcReg) + return false; + bool SrcIsKill = hasTrivialKill(I->getOperand(0)); + + // Try to optimize already sign-/zero-extended values from function arguments. + bool IsZExt = isa<ZExtInst>(I); + if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) { + if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) { + if (RetVT == MVT::i64 && SrcVT != MVT::i64) { + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), ResultReg) + .addImm(0) + .addReg(SrcReg, getKillRegState(SrcIsKill)) + .addImm(AArch64::sub_32); + SrcReg = ResultReg; + } + // Conservatively clear all kill flags from all uses, because we are + // replacing a sign-/zero-extend instruction at IR level with a nop at MI + // level. The result of the instruction at IR level might have been + // trivially dead, which is now not longer true. 
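
In short, when the incoming argument already carries the matching zeroext/signext ABI attribute, the extend costs at most the SUBREG_TO_REG shown above; a compact sketch of that decision (names hypothetical, illustrative only):

    enum class Ext { ZExt, SExt };

    // The IR-level extend is free if the caller was already required to
    // extend the value; widening to i64 still needs a SUBREG_TO_REG.
    static bool argExtendIsFree(Ext Needed, bool HasZExtAttr, bool HasSExtAttr) {
      return (Needed == Ext::ZExt && HasZExtAttr) ||
             (Needed == Ext::SExt && HasSExtAttr);
    }
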
+ unsigned UseReg = lookUpRegForValue(I); + if (UseReg) + MRI.clearKillFlags(UseReg); + + updateValueMap(I, SrcReg); + return true; + } + } + + unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt); + if (!ResultReg) return false; - UpdateValueMap(I, ResultReg); + + updateValueMap(I, ResultReg); return true; } -bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { +bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) { EVT DestEVT = TLI.getValueType(I->getType(), true); if (!DestEVT.isSimple()) return false; @@ -1851,144 +4440,529 @@ bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { return false; unsigned DivOpc; - bool is64bit = (DestVT == MVT::i64); + bool Is64bit = (DestVT == MVT::i64); switch (ISDOpcode) { default: return false; case ISD::SREM: - DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr; + DivOpc = Is64bit ? AArch64::SDIVXr : AArch64::SDIVWr; break; case ISD::UREM: - DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr; + DivOpc = Is64bit ? AArch64::UDIVXr : AArch64::UDIVWr; break; } - unsigned MSubOpc = is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr; + unsigned MSubOpc = Is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr; unsigned Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; + bool Src0IsKill = hasTrivialKill(I->getOperand(0)); unsigned Src1Reg = getRegForValue(I->getOperand(1)); if (!Src1Reg) return false; + bool Src1IsKill = hasTrivialKill(I->getOperand(1)); - unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg) - .addReg(Src0Reg) - .addReg(Src1Reg); + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false, + Src1Reg, /*IsKill=*/false); + assert(QuotReg && "Unexpected DIV instruction emission failure."); // The remainder is computed as numerator - (quotient * denominator) using the // MSUB instruction. - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg) - .addReg(QuotReg) - .addReg(Src1Reg) - .addReg(Src0Reg); - UpdateValueMap(I, ResultReg); + unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true, + Src1Reg, Src1IsKill, Src0Reg, + Src0IsKill); + updateValueMap(I, ResultReg); return true; } -bool AArch64FastISel::SelectMul(const Instruction *I) { - EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); - if (!SrcEVT.isSimple()) +bool AArch64FastISel::selectMul(const Instruction *I) { + MVT VT; + if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - // Must be simple value type. Don't handle vectors. - if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && - SrcVT != MVT::i8) + if (VT.isVector()) + return selectBinaryOp(I, ISD::MUL); + + const Value *Src0 = I->getOperand(0); + const Value *Src1 = I->getOperand(1); + if (const auto *C = dyn_cast<ConstantInt>(Src0)) + if (C->getValue().isPowerOf2()) + std::swap(Src0, Src1); + + // Try to simplify to a shift instruction. 
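// Aside elaborating the comment above (host-side check, not part of this patch):
// when one multiplicand is a constant power of two 2^k, x * 2^k == x << k modulo
// 2^64, which is why selectMul can hand the whole job to emitLSL_ri below.
#include <cassert>
#include <cstdint>
int main() {
  const uint64_t xs[] = {0ull, 1ull, 0x1234ull, ~0ull};
  for (uint64_t x : xs)
    for (unsigned k = 0; k < 64; ++k)
      assert(x * (1ull << k) == (x << k));   // identical wrap-around behaviour
}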
+ if (const auto *C = dyn_cast<ConstantInt>(Src1)) + if (C->getValue().isPowerOf2()) { + uint64_t ShiftVal = C->getValue().logBase2(); + MVT SrcVT = VT; + bool IsZExt = true; + if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) { + if (!isIntExtFree(ZExt)) { + MVT VT; + if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) { + SrcVT = VT; + IsZExt = true; + Src0 = ZExt->getOperand(0); + } + } + } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) { + if (!isIntExtFree(SExt)) { + MVT VT; + if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) { + SrcVT = VT; + IsZExt = false; + Src0 = SExt->getOperand(0); + } + } + } + + unsigned Src0Reg = getRegForValue(Src0); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(Src0); + + unsigned ResultReg = + emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt); + + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; + } + } + + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + bool Src0IsKill = hasTrivialKill(I->getOperand(0)); + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + bool Src1IsKill = hasTrivialKill(I->getOperand(1)); + + unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill); + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectShift(const Instruction *I) { + MVT RetVT; + if (!isTypeSupported(I->getType(), RetVT, /*IsVectorAllowed=*/true)) + return false; + + if (RetVT.isVector()) + return selectOperator(I, I->getOpcode()); + + if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) { + unsigned ResultReg = 0; + uint64_t ShiftVal = C->getZExtValue(); + MVT SrcVT = RetVT; + bool IsZExt = (I->getOpcode() == Instruction::AShr) ? 
false : true; + const Value *Op0 = I->getOperand(0); + if (const auto *ZExt = dyn_cast<ZExtInst>(Op0)) { + if (!isIntExtFree(ZExt)) { + MVT TmpVT; + if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) { + SrcVT = TmpVT; + IsZExt = true; + Op0 = ZExt->getOperand(0); + } + } + } else if (const auto *SExt = dyn_cast<SExtInst>(Op0)) { + if (!isIntExtFree(SExt)) { + MVT TmpVT; + if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) { + SrcVT = TmpVT; + IsZExt = false; + Op0 = SExt->getOperand(0); + } + } + } + + unsigned Op0Reg = getRegForValue(Op0); + if (!Op0Reg) + return false; + bool Op0IsKill = hasTrivialKill(Op0); + + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected instruction."); + case Instruction::Shl: + ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + break; + case Instruction::AShr: + ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + break; + case Instruction::LShr: + ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt); + break; + } + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; + } + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (!Op0Reg) + return false; + bool Op0IsKill = hasTrivialKill(I->getOperand(0)); + + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (!Op1Reg) + return false; + bool Op1IsKill = hasTrivialKill(I->getOperand(1)); + + unsigned ResultReg = 0; + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected instruction."); + case Instruction::Shl: + ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + break; + case Instruction::AShr: + ResultReg = emitASR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + break; + case Instruction::LShr: + ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill); + break; + } + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectBitCast(const Instruction *I) { + MVT RetVT, SrcVT; + + if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT)) + return false; + if (!isTypeLegal(I->getType(), RetVT)) return false; unsigned Opc; - unsigned ZReg; - switch (SrcVT.SimpleTy) { + if (RetVT == MVT::f32 && SrcVT == MVT::i32) + Opc = AArch64::FMOVWSr; + else if (RetVT == MVT::f64 && SrcVT == MVT::i64) + Opc = AArch64::FMOVXDr; + else if (RetVT == MVT::i32 && SrcVT == MVT::f32) + Opc = AArch64::FMOVSWr; + else if (RetVT == MVT::i64 && SrcVT == MVT::f64) + Opc = AArch64::FMOVDXr; + else + return false; + + const TargetRegisterClass *RC = nullptr; + switch (RetVT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: RC = &AArch64::GPR32RegClass; break; + case MVT::i64: RC = &AArch64::GPR64RegClass; break; + case MVT::f32: RC = &AArch64::FPR32RegClass; break; + case MVT::f64: RC = &AArch64::FPR64RegClass; break; + } + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (!Op0Reg) + return false; + bool Op0IsKill = hasTrivialKill(I->getOperand(0)); + unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill); + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::selectFRem(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + RTLIB::Libcall LC; + switch (RetVT.SimpleTy) { default: return false; - case MVT::i8: - case MVT::i16: - case MVT::i32: - ZReg = AArch64::WZR; - Opc = AArch64::MADDWrrr; - SrcVT = MVT::i32; + case MVT::f32: + 
LC = RTLIB::REM_F32; break; - case MVT::i64: - ZReg = AArch64::XZR; - Opc = AArch64::MADDXrrr; + case MVT::f64: + LC = RTLIB::REM_F64; break; } + ArgListTy Args; + Args.reserve(I->getNumOperands()); + + // Populate the argument list. + for (auto &Arg : I->operands()) { + ArgListEntry Entry; + Entry.Val = Arg; + Entry.Ty = Arg->getType(); + Args.push_back(Entry); + } + + CallLoweringInfo CLI; + CLI.setCallee(TLI.getLibcallCallingConv(LC), I->getType(), + TLI.getLibcallName(LC), std::move(Args)); + if (!lowerCallTo(CLI)) + return false; + updateValueMap(I, CLI.ResultReg); + return true; +} + +bool AArch64FastISel::selectSDiv(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + if (!isa<ConstantInt>(I->getOperand(1))) + return selectBinaryOp(I, ISD::SDIV); + + const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue(); + if ((VT != MVT::i32 && VT != MVT::i64) || !C || + !(C.isPowerOf2() || (-C).isPowerOf2())) + return selectBinaryOp(I, ISD::SDIV); + + unsigned Lg2 = C.countTrailingZeros(); unsigned Src0Reg = getRegForValue(I->getOperand(0)); if (!Src0Reg) return false; + bool Src0IsKill = hasTrivialKill(I->getOperand(0)); - unsigned Src1Reg = getRegForValue(I->getOperand(1)); - if (!Src1Reg) + if (cast<BinaryOperator>(I)->isExact()) { + unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2); + if (!ResultReg) + return false; + updateValueMap(I, ResultReg); + return true; + } + + int64_t Pow2MinusOne = (1ULL << Lg2) - 1; + unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne); + if (!AddReg) return false; - // Create the base instruction, then add the operands. - unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Src0Reg) - .addReg(Src1Reg) - .addReg(ZReg); - UpdateValueMap(I, ResultReg); + // (Src0 < 0) ? Pow2 - 1 : 0; + if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0)) + return false; + + unsigned SelectOpc; + const TargetRegisterClass *RC; + if (VT == MVT::i64) { + SelectOpc = AArch64::CSELXr; + RC = &AArch64::GPR64RegClass; + } else { + SelectOpc = AArch64::CSELWr; + RC = &AArch64::GPR32RegClass; + } + unsigned SelectReg = + fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg, + Src0IsKill, AArch64CC::LT); + if (!SelectReg) + return false; + + // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also + // negate the result. + unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; + unsigned ResultReg; + if (C.isNegative()) + ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true, + SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2); + else + ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2); + + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +/// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We +/// have to duplicate it for AArch64, because otherwise we would fail during the +/// sign-extend emission. +std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) { + unsigned IdxN = getRegForValue(Idx); + if (IdxN == 0) + // Unhandled operand. Halt "fast" selection and bail. + return std::pair<unsigned, bool>(0, false); + + bool IdxNIsKill = hasTrivialKill(Idx); + + // If the index is smaller or larger than intptr_t, truncate or extend it. 
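// Aside (host-side model, not part of this patch) for selectSDiv above: signed
// division by +/-2^k is expanded as "add 2^k - 1 when the dividend is negative
// (the CSEL), arithmetic-shift right by k, then negate if the divisor was
// negative". A quick check that this matches C++'s truncating division:
#include <cassert>
#include <cstdint>
static int64_t sdivPow2(int64_t x, unsigned k, bool negDivisor) {
  int64_t t = (x < 0) ? x + ((int64_t(1) << k) - 1) : x;
  int64_t q = t >> k;                 // arithmetic shift on a signed operand
  return negDivisor ? -q : q;
}
int main() {
  const int64_t xs[] = {-1000001, -17, -8, -1, 0, 1, 7, 8, 42, 1000000};
  for (int64_t x : xs)
    for (unsigned k = 0; k < 8; ++k) {
      int64_t d = int64_t(1) << k;
      assert(sdivPow2(x, k, false) == x / d);
      assert(sdivPow2(x, k, true) == x / -d);
    }
}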
+ MVT PtrVT = TLI.getPointerTy(); + EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false); + if (IdxVT.bitsLT(PtrVT)) { + IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false); + IdxNIsKill = true; + } else if (IdxVT.bitsGT(PtrVT)) + llvm_unreachable("AArch64 FastISel doesn't support types larger than i64"); + return std::pair<unsigned, bool>(IdxN, IdxNIsKill); +} + +/// This is mostly a copy of the existing FastISel GEP code, but we have to +/// duplicate it for AArch64, because otherwise we would bail out even for +/// simple cases. This is because the standard fastEmit functions don't cover +/// MUL at all and ADD is lowered very inefficientily. +bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { + unsigned N = getRegForValue(I->getOperand(0)); + if (!N) + return false; + bool NIsKill = hasTrivialKill(I->getOperand(0)); + + // Keep a running tab of the total offset to coalesce multiple N = N + Offset + // into a single N = N + TotalOffset. + uint64_t TotalOffs = 0; + Type *Ty = I->getOperand(0)->getType(); + MVT VT = TLI.getPointerTy(); + for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { + const Value *Idx = *OI; + if (auto *StTy = dyn_cast<StructType>(Ty)) { + unsigned Field = cast<ConstantInt>(Idx)->getZExtValue(); + // N = N + Offset + if (Field) + TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); + Ty = StTy->getElementType(Field); + } else { + Ty = cast<SequentialType>(Ty)->getElementType(); + // If this is a constant subscript, handle it quickly. + if (const auto *CI = dyn_cast<ConstantInt>(Idx)) { + if (CI->isZero()) + continue; + // N = N + Offset + TotalOffs += + DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue(); + continue; + } + if (TotalOffs) { + N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + if (!N) + return false; + NIsKill = true; + TotalOffs = 0; + } + + // N = N + Idx * ElementSize; + uint64_t ElementSize = DL.getTypeAllocSize(Ty); + std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx); + unsigned IdxN = Pair.first; + bool IdxNIsKill = Pair.second; + if (!IdxN) + return false; + + if (ElementSize != 1) { + unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize); + if (!C) + return false; + IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true); + if (!IdxN) + return false; + IdxNIsKill = true; + } + N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill); + if (!N) + return false; + } + } + if (TotalOffs) { + N = emitAdd_ri_(VT, N, NIsKill, TotalOffs); + if (!N) + return false; + } + updateValueMap(I, N); return true; } -bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) { +bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: break; - case Instruction::Load: - return SelectLoad(I); - case Instruction::Store: - return SelectStore(I); + case Instruction::Add: + case Instruction::Sub: + return selectAddSub(I); + case Instruction::Mul: + return selectMul(I); + case Instruction::SDiv: + return selectSDiv(I); + case Instruction::SRem: + if (!selectBinaryOp(I, ISD::SREM)) + return selectRem(I, ISD::SREM); + return true; + case Instruction::URem: + if (!selectBinaryOp(I, ISD::UREM)) + return selectRem(I, ISD::UREM); + return true; + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return selectShift(I); + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return selectLogicalOp(I); case Instruction::Br: - return SelectBranch(I); + return selectBranch(I); case 
Instruction::IndirectBr: - return SelectIndirectBr(I); - case Instruction::FCmp: - case Instruction::ICmp: - return SelectCmp(I); - case Instruction::Select: - return SelectSelect(I); - case Instruction::FPExt: - return SelectFPExt(I); - case Instruction::FPTrunc: - return SelectFPTrunc(I); + return selectIndirectBr(I); + case Instruction::BitCast: + if (!FastISel::selectBitCast(I)) + return selectBitCast(I); + return true; case Instruction::FPToSI: - return SelectFPToInt(I, /*Signed=*/true); + if (!selectCast(I, ISD::FP_TO_SINT)) + return selectFPToInt(I, /*Signed=*/true); + return true; case Instruction::FPToUI: - return SelectFPToInt(I, /*Signed=*/false); + return selectFPToInt(I, /*Signed=*/false); + case Instruction::ZExt: + case Instruction::SExt: + return selectIntExt(I); + case Instruction::Trunc: + if (!selectCast(I, ISD::TRUNCATE)) + return selectTrunc(I); + return true; + case Instruction::FPExt: + return selectFPExt(I); + case Instruction::FPTrunc: + return selectFPTrunc(I); case Instruction::SIToFP: - return SelectIntToFP(I, /*Signed=*/true); + if (!selectCast(I, ISD::SINT_TO_FP)) + return selectIntToFP(I, /*Signed=*/true); + return true; case Instruction::UIToFP: - return SelectIntToFP(I, /*Signed=*/false); - case Instruction::SRem: - return SelectRem(I, ISD::SREM); - case Instruction::URem: - return SelectRem(I, ISD::UREM); - case Instruction::Call: - if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) - return SelectIntrinsicCall(*II); - return SelectCall(I); + return selectIntToFP(I, /*Signed=*/false); + case Instruction::Load: + return selectLoad(I); + case Instruction::Store: + return selectStore(I); + case Instruction::FCmp: + case Instruction::ICmp: + return selectCmp(I); + case Instruction::Select: + return selectSelect(I); case Instruction::Ret: - return SelectRet(I); - case Instruction::Trunc: - return SelectTrunc(I); - case Instruction::ZExt: - case Instruction::SExt: - return SelectIntExt(I); - case Instruction::Mul: - // FIXME: This really should be handled by the target-independent selector. - return SelectMul(I); + return selectRet(I); + case Instruction::FRem: + return selectFRem(I); + case Instruction::GetElementPtr: + return selectGetElementPtr(I); } - return false; + + // fall-back to target-independent instruction selection. + return selectOperator(I, I->getOpcode()); // Silence warnings. 
(void)&CC_AArch64_DarwinPCS_VarArg; } namespace llvm { -llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) { - return new AArch64FastISel(funcInfo, libInfo); +llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) { + return new AArch64FastISel(FuncInfo, LibInfo); } } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 9c33717..66aa216 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -17,16 +17,16 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "llvm/ADT/Statistic.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/Debug.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -86,13 +86,14 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); #ifndef NDEBUG - const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); assert(!RegInfo->needsStackRealignment(MF) && "No stack realignment on AArch64!"); #endif return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken()); + MFI->isFrameAddressTaken() || MFI->hasStackMap() || + MFI->hasPatchPoint()); } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is @@ -109,13 +110,13 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo()); + static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); DebugLoc DL = I->getDebugLoc(); int Opc = I->getOpcode(); bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); if (!TFI->hasReservedCallFrame(MF)) { unsigned Align = getStackAlignment(); @@ -131,7 +132,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( // FIXME: in-function stack adjustment for calls is limited to 24-bits // because there's no guaranteed temporary register available. // - // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable. + // ADD/SUB (immediate) has only LSL #0 and LSL #12 available. // 1) For offset <= 12-bit, we use LSL #0 // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses // LSL #0, and the other uses LSL #12. 
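// Aside elaborating the comment above (host-side sketch, not part of this patch):
// a stack adjustment below 2^24 that does not fit one 12-bit ADD/SUB immediate
// can be split into a low part (LSL #0) and a high part (LSL #12), each
// individually encodable:
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t offs[] = {0u, 0xfffu, 0x1000u, 0xabcdeu, 0xffffffu};
  for (uint32_t off : offs) {
    uint32_t lo = off & 0xfffu;        // ADD/SUB ..., #lo
    uint32_t hi = off & 0xfff000u;     // ADD/SUB ..., #(hi >> 12), LSL #12
    assert(lo + hi == off);
    assert((hi >> 12) <= 0xfffu);      // fits the 12-bit immediate field
  }
}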
@@ -158,7 +159,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); DebugLoc DL = MBB.findDebugLoc(MBBI); // Add callee saved registers to move list. @@ -166,7 +167,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout *TD = MF.getTarget().getDataLayout(); + const DataLayout *TD = MF.getSubtarget().getDataLayout(); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. @@ -195,7 +196,8 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, DwarfReg, Offset - TotalSkipped)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } } @@ -205,8 +207,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getTarget().getRegisterInfo()); - const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + MF.getSubtarget().getRegisterInfo()); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); @@ -233,7 +235,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } else if (NumBytes) { ++NumRedZoneFunctions; } @@ -300,7 +303,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false); if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); + const DataLayout *TD = MF.getSubtarget().getDataLayout(); const int StackGrowth = -TD->getPointerSize(0); unsigned FramePtr = RegInfo->getFrameRegister(MF); @@ -376,26 +379,30 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); // Record the location of the stored LR unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); // Record the location of the stored FP CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. 
unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } // Now emit the moves for whatever callee saved regs we have. @@ -435,9 +442,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); MachineFrameInfo *MFI = MF.getFrameInfo(); const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo()); + static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getTarget().getRegisterInfo()); + MF.getSubtarget().getRegisterInfo()); DebugLoc DL = MBBI->getDebugLoc(); unsigned RetOpcode = MBBI->getOpcode(); @@ -548,7 +555,7 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, bool PreferFP) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getTarget().getRegisterInfo()); + MF.getSubtarget().getRegisterInfo()); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); int FPOffset = MFI->getObjectOffset(FI) + 16; int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); @@ -617,7 +624,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); unsigned Count = CSI.size(); DebugLoc DL; assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); @@ -693,7 +700,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); unsigned Count = CSI.size(); DebugLoc DL; assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); @@ -761,7 +768,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( MachineFunction &MF, RegScavenger *RS) const { const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getTarget().getRegisterInfo()); + MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); MachineRegisterInfo *MRI = &MF.getRegInfo(); SmallVector<unsigned, 4> UnspilledCSGPRs; diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 7686e6f..df3875f 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64_FRAMELOWERING_H -#define AArch64_FRAMELOWERING_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #include "llvm/Target/TargetFrameLowering.h" diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 3f49fab..bb2e1e2 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ 
b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -303,7 +303,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { /// \brief Determine wether it is worth to fold V into an extended register. bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { - // it hurts if the a value is used at least twice, unless we are optimizing + // it hurts if the value is used at least twice, unless we are optimizing // for code size. if (ForCodeSize || V.hasOneUse()) return true; @@ -569,6 +569,27 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, return isWorthFolding(N); } +/// If there's a use of this ADDlow that's not itself a load/store then we'll +/// need to create a real ADD instruction from it anyway and there's no point in +/// folding it into the mem op. Theoretically, it shouldn't matter, but there's +/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding +/// leads to duplaicated ADRP instructions. +static bool isWorthFoldingADDlow(SDValue N) { + for (auto Use : N->uses()) { + if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && + Use->getOpcode() != ISD::ATOMIC_LOAD && + Use->getOpcode() != ISD::ATOMIC_STORE) + return false; + + // ldar and stlr have much more restrictive addressing modes (just a + // register). + if (cast<MemSDNode>(Use)->getOrdering() > Monotonic) + return false; + } + + return true; +} + /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit /// immediate" address. The "Size" argument is the size in bytes of the memory /// reference, which determines the scale. @@ -582,7 +603,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, return true; } - if (N.getOpcode() == AArch64ISD::ADDlow) { + if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) { GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode()); Base = N.getOperand(0); @@ -594,7 +615,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, unsigned Alignment = GV->getAlignment(); const DataLayout *DL = TLI->getDataLayout(); Type *Ty = GV->getType()->getElementType(); - if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin()) + if (Alignment == 0 && Ty->isSized()) Alignment = DL->getABITypeAlignment(Ty); if (Alignment >= Size) @@ -777,6 +798,21 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, return false; } +// Check if the given immediate is preferred by ADD. If an immediate can be +// encoded in an ADD, or it can be encoded in an "ADD LSL #12" and can not be +// encoded by one MOVZ, return true. +static bool isPreferredADD(int64_t ImmOff) { + // Constant in [0x0, 0xfff] can be encoded in ADD. + if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL) + return true; + // Check if it can be encoded in an "ADD LSL #12". + if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL) + // As a single MOVZ is faster than a "ADD of LSL #12", ignore such constant. + return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL && + (ImmOff & 0xffffffffffff0fffLL) != 0x0LL; + return false; +} + bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, SDValue &Offset, SDValue &SignExtend, @@ -786,11 +822,6 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, SDValue LHS = N.getOperand(0); SDValue RHS = N.getOperand(1); - // We don't want to match immediate adds here, because they are better lowered - // to the register-immediate addressing modes. 
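// Aside (host-side re-derivation, not part of this patch): isPreferredADD above
// keeps an add-of-constant when the constant fits a plain 12-bit ADD immediate,
// or fits "ADD ..., LSL #12" and cannot be produced by a single MOVZ (shift 0 or
// 16), in which case the ADD is the cheaper way to materialise the offset. A few
// sample values, checked with the same masks (non-negative offsets only;
// the caller also tries the negated offset for SUB):
#include <cassert>
#include <cstdint>
static bool preferredAdd(uint64_t imm) {
  if ((imm & 0xfffffffffffff000ull) == 0)      // ADD #imm
    return true;
  if ((imm & 0xffffffffff000fffull) == 0)      // ADD #(imm >> 12), LSL #12 ...
    return (imm & 0xffffffffff00ffffull) != 0  // ... unless MOVZ #imm16 works
        && (imm & 0xffffffffffff0fffull) != 0; // ... or MOVZ #imm16, LSL #16 works
  return false;
}
int main() {
  assert(preferredAdd(0xfff));       // single ADD immediate
  assert(preferredAdd(0x123000));    // bits in [15:12] and [23:16]: ADD, LSL #12 wins
  assert(!preferredAdd(0x3000));     // one MOVZ #0x3000 is enough
  assert(!preferredAdd(0x450000));   // one MOVZ #0x45, LSL #16 is enough
  assert(!preferredAdd(0x1234567));  // too wide for either ADD form
}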
- if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS)) - return false; - // Check if this particular node is reused in any non-memory related // operation. If yes, do not try to fold this node into the address // computation, since the computation will be kept. @@ -800,6 +831,36 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, return false; } + // Watch out if RHS is a wide immediate, it can not be selected into + // [BaseReg+Imm] addressing mode. Also it may not be able to be encoded into + // ADD/SUB. Instead it will use [BaseReg + 0] address mode and generate + // instructions like: + // MOV X0, WideImmediate + // ADD X1, BaseReg, X0 + // LDR X2, [X1, 0] + // For such situation, using [BaseReg, XReg] addressing mode can save one + // ADD/SUB: + // MOV X0, WideImmediate + // LDR X2, [BaseReg, X0] + if (isa<ConstantSDNode>(RHS)) { + int64_t ImmOff = (int64_t)dyn_cast<ConstantSDNode>(RHS)->getZExtValue(); + unsigned Scale = Log2_32(Size); + // Skip the immediate can be seleced by load/store addressing mode. + // Also skip the immediate can be encoded by a single ADD (SUB is also + // checked by using -ImmOff). + if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) || + isPreferredADD(ImmOff) || isPreferredADD(-ImmOff)) + return false; + + SDLoc DL(N.getNode()); + SDValue Ops[] = { RHS }; + SDNode *MOVI = + CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops); + SDValue MOVIV = SDValue(MOVI, 0); + // This ADD of two X register will be selected into [Reg+Reg] mode. + N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV); + } + // Remember if it is worth folding N when it produces extended register. bool IsExtendedRegisterWorthFolding = isWorthFolding(N); @@ -1381,20 +1442,21 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, return true; } -static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB) { - // We are looking for the following pattern which basically extracts a single - // bit from the source value and places it in the LSB of the destination - // value, all other bits of the destination value or set to zero: +static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &LSB, + unsigned &MSB) { + // We are looking for the following pattern which basically extracts several + // continuous bits from the source value and places it from the LSB of the + // destination value, all other bits of the destination value or set to zero: // // Value2 = AND Value, MaskImm // SRL Value2, ShiftImm // - // with MaskImm >> ShiftImm == 1. + // with MaskImm >> ShiftImm to search for the bit width. // // This gets selected into a single UBFM: // - // UBFM Value, ShiftImm, ShiftImm + // UBFM Value, ShiftImm, BitWide + Srl_imm -1 // if (N->getOpcode() != ISD::SRL) @@ -1410,15 +1472,16 @@ static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, if (!isIntImmediate(N->getOperand(1), Srl_imm)) return false; - // Check whether we really have a one bit extract here. - if (And_mask >> Srl_imm == 0x1) { + // Check whether we really have several bits extract here. 
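// Aside elaborating the comment above (host-side identity check, not part of this
// patch): when MaskImm >> ShiftImm is a contiguous low mask of W bits, the whole
// (Value & MaskImm) >> ShiftImm pair collapses into one UBFM that extracts bits
// [ShiftImm + W - 1 : ShiftImm], which is exactly the LSB/MSB chosen below.
#include <cassert>
#include <cstdint>
static uint64_t ubfx(uint64_t v, unsigned lsb, unsigned w) { // UBFM v, #lsb, #(lsb+w-1)
  return (v >> lsb) & ((w == 64) ? ~0ull : ((1ull << w) - 1));
}
int main() {
  uint64_t v = 0x123456789abcdef0ull;
  unsigned shift = 8, w = 12;                    // MaskImm >> shift == 0xfff
  uint64_t mask = 0xfffull << shift;
  assert(((v & mask) >> shift) == ubfx(v, shift, w));
}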
+ unsigned BitWide = 64 - CountLeadingOnes_64(~(And_mask >> Srl_imm)); + if (BitWide && isMask_64(And_mask >> Srl_imm)) { if (N->getValueType(0) == MVT::i32) Opc = AArch64::UBFMWri; else Opc = AArch64::UBFMXri; - LSB = MSB = Srl_imm; - + LSB = Srl_imm; + MSB = BitWide + Srl_imm - 1; return true; } @@ -1439,8 +1502,8 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, assert((VT == MVT::i32 || VT == MVT::i64) && "Type checking must have been done before calling this function"); - // Check for AND + SRL doing a one bit extract. - if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) + // Check for AND + SRL doing several bits extract. + if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) return true; // we're looking for a shift of a shift @@ -2116,7 +2179,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case 32: SubReg = AArch64::ssub; break; - case 16: // FALLTHROUGH + case 16: + SubReg = AArch64::hsub; + break; case 8: llvm_unreachable("unexpected zext-requiring extract element!"); } @@ -2204,9 +2269,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); @@ -2222,9 +2287,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); @@ -2240,9 +2305,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); @@ -2258,9 +2323,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 2, AArch64::LD2Twov2s, 
AArch64::dsub0); @@ -2276,9 +2341,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); @@ -2294,9 +2359,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); @@ -2312,9 +2377,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); @@ -2330,9 +2395,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); @@ -2348,9 +2413,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); @@ -2364,7 +2429,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::aarch64_neon_ld2lane: if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectLoadLane(Node, 2, AArch64::LD2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectLoadLane(Node, 2, AArch64::LD2i16); else if (VT == MVT::v4i32 || VT == 
MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2376,7 +2442,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::aarch64_neon_ld3lane: if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectLoadLane(Node, 3, AArch64::LD3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectLoadLane(Node, 3, AArch64::LD3i16); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2388,7 +2455,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::aarch64_neon_ld4lane: if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectLoadLane(Node, 4, AArch64::LD4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectLoadLane(Node, 4, AArch64::LD4i16); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2448,9 +2516,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectStore(Node, 2, AArch64::ST1Twov8b); else if (VT == MVT::v16i8) return SelectStore(Node, 2, AArch64::ST1Twov16b); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectStore(Node, 2, AArch64::ST1Twov4h); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectStore(Node, 2, AArch64::ST1Twov8h); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectStore(Node, 2, AArch64::ST1Twov2s); @@ -2467,9 +2535,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectStore(Node, 3, AArch64::ST1Threev8b); else if (VT == MVT::v16i8) return SelectStore(Node, 3, AArch64::ST1Threev16b); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectStore(Node, 3, AArch64::ST1Threev4h); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectStore(Node, 3, AArch64::ST1Threev8h); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectStore(Node, 3, AArch64::ST1Threev2s); @@ -2486,9 +2554,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectStore(Node, 4, AArch64::ST1Fourv8b); else if (VT == MVT::v16i8) return SelectStore(Node, 4, AArch64::ST1Fourv16b); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectStore(Node, 4, AArch64::ST1Fourv4h); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectStore(Node, 4, AArch64::ST1Fourv8h); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectStore(Node, 4, AArch64::ST1Fourv2s); @@ -2505,9 +2573,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectStore(Node, 2, AArch64::ST2Twov8b); else if (VT == MVT::v16i8) return SelectStore(Node, 2, AArch64::ST2Twov16b); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectStore(Node, 2, AArch64::ST2Twov4h); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectStore(Node, 2, AArch64::ST2Twov8h); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectStore(Node, 2, AArch64::ST2Twov2s); @@ -2524,9 +2592,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectStore(Node, 3, AArch64::ST3Threev8b); else if (VT == MVT::v16i8) return SelectStore(Node, 3, AArch64::ST3Threev16b); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectStore(Node, 3, AArch64::ST3Threev4h); 
- else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectStore(Node, 3, AArch64::ST3Threev8h); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectStore(Node, 3, AArch64::ST3Threev2s); @@ -2543,9 +2611,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectStore(Node, 4, AArch64::ST4Fourv8b); else if (VT == MVT::v16i8) return SelectStore(Node, 4, AArch64::ST4Fourv16b); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectStore(Node, 4, AArch64::ST4Fourv4h); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectStore(Node, 4, AArch64::ST4Fourv8h); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectStore(Node, 4, AArch64::ST4Fourv2s); @@ -2560,7 +2628,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::aarch64_neon_st2lane: { if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectStoreLane(Node, 2, AArch64::ST2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectStoreLane(Node, 2, AArch64::ST2i16); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2573,7 +2642,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::aarch64_neon_st3lane: { if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectStoreLane(Node, 3, AArch64::ST3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectStoreLane(Node, 3, AArch64::ST3i16); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2586,7 +2656,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::aarch64_neon_st4lane: { if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectStoreLane(Node, 4, AArch64::ST4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectStoreLane(Node, 4, AArch64::ST4i16); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2603,9 +2674,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); @@ -2622,9 +2693,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 3, 
AArch64::LD3Threev2s_POST, AArch64::dsub0); @@ -2641,9 +2712,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); @@ -2660,9 +2731,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); @@ -2679,9 +2750,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); @@ -2698,9 +2769,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); @@ -2717,9 +2788,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); @@ -2736,9 +2807,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); else 
if (VT == MVT::v16i8) return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); @@ -2755,9 +2826,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); @@ -2774,9 +2845,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); else if (VT == MVT::v16i8) return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); @@ -2791,7 +2862,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case AArch64ISD::LD1LANEpost: { if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2804,7 +2876,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case AArch64ISD::LD2LANEpost: { if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2817,7 +2890,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case AArch64ISD::LD3LANEpost: { if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2830,7 +2904,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case AArch64ISD::LD4LANEpost: { if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + 
else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2846,9 +2921,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); else if (VT == MVT::v16i8) return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); @@ -2866,9 +2941,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); else if (VT == MVT::v16i8) return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); @@ -2886,9 +2961,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); else if (VT == MVT::v16i8) return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); @@ -2906,9 +2981,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); else if (VT == MVT::v16i8) return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); @@ -2926,9 +3001,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); else if (VT == MVT::v16i8) return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); @@ -2946,9 +3021,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); else if (VT == MVT::v16i8) return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); - else if (VT == MVT::v4i16) + else if (VT == MVT::v4i16 || VT == MVT::v4f16) return 
SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); - else if (VT == MVT::v8i16) + else if (VT == MVT::v8i16 || VT == MVT::v8f16) return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); else if (VT == MVT::v2i32 || VT == MVT::v2f32) return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); @@ -2964,7 +3039,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { VT = Node->getOperand(1).getValueType(); if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2978,7 +3054,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { VT = Node->getOperand(1).getValueType(); if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) @@ -2992,7 +3069,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { VT = Node->getOperand(1).getValueType(); if (VT == MVT::v16i8 || VT == MVT::v8i8) return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) + else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || VT == MVT::v2f32) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index f2004ea..0d44f99 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12,9 +12,10 @@ //===----------------------------------------------------------------------===// #include "AArch64ISelLowering.h" +#include "AArch64CallingConvention.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64PerfectShuffle.h" #include "AArch64Subtarget.h" -#include "AArch64MachineFunctionInfo.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -38,10 +39,12 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); +namespace { enum AlignMode { StrictAlign, NoStrictAlign }; +} static cl::opt<AlignMode> Align(cl::desc("Load/store alignment support"), @@ -64,18 +67,9 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), cl::init(false)); -//===----------------------------------------------------------------------===// -// AArch64 Lowering public interface. 
-//===----------------------------------------------------------------------===// -static TargetLoweringObjectFile *createTLOF(const Triple &TT) { - if (TT.isOSBinFormatMachO()) - return new AArch64_MachoTargetObjectFile(); - - return new AArch64_ELFTargetObjectFile(); -} -AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) - : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { +AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM) + : TargetLowering(TM) { Subtarget = &TM.getSubtarget<AArch64Subtarget>(); // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so @@ -106,6 +100,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) addDRTypeForNEON(MVT::v2i32); addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); + addDRTypeForNEON(MVT::v4f16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); @@ -113,6 +108,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) addQRTypeForNEON(MVT::v8i16); addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); + addQRTypeForNEON(MVT::v8f16); } // Compute derived properties from the register classes @@ -278,6 +274,94 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + // f16 is storage-only, so we promote operations to f32 if we know this is + // valid, and ignore them otherwise. The operations not mentioned here will + // fail to select, but this is not a major problem as no source language + // should be emitting native f16 operations yet. + setOperationAction(ISD::FADD, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FMUL, MVT::f16, Promote); + setOperationAction(ISD::FSUB, MVT::f16, Promote); + + // v4f16 is also a storage-only type, so promote it to v4f32 when that is + // known to be safe. + setOperationAction(ISD::FADD, MVT::v4f16, Promote); + setOperationAction(ISD::FSUB, MVT::v4f16, Promote); + setOperationAction(ISD::FMUL, MVT::v4f16, Promote); + setOperationAction(ISD::FDIV, MVT::v4f16, Promote); + setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); + setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); + AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); + AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); + + // Expand all other v4f16 operations. 
+ // FIXME: We could generate better code by promoting some operations to + // a pair of v4f32s + setOperationAction(ISD::FABS, MVT::v4f16, Expand); + setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); + setOperationAction(ISD::FCOS, MVT::v4f16, Expand); + setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); + setOperationAction(ISD::FMA, MVT::v4f16, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); + setOperationAction(ISD::FNEG, MVT::v4f16, Expand); + setOperationAction(ISD::FPOW, MVT::v4f16, Expand); + setOperationAction(ISD::FPOWI, MVT::v4f16, Expand); + setOperationAction(ISD::FREM, MVT::v4f16, Expand); + setOperationAction(ISD::FROUND, MVT::v4f16, Expand); + setOperationAction(ISD::FRINT, MVT::v4f16, Expand); + setOperationAction(ISD::FSIN, MVT::v4f16, Expand); + setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand); + setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); + setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); + setOperationAction(ISD::SETCC, MVT::v4f16, Expand); + setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); + setOperationAction(ISD::SELECT, MVT::v4f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); + setOperationAction(ISD::FEXP, MVT::v4f16, Expand); + setOperationAction(ISD::FEXP2, MVT::v4f16, Expand); + setOperationAction(ISD::FLOG, MVT::v4f16, Expand); + setOperationAction(ISD::FLOG2, MVT::v4f16, Expand); + setOperationAction(ISD::FLOG10, MVT::v4f16, Expand); + + + // v8f16 is also a storage-only type, so expand it. + setOperationAction(ISD::FABS, MVT::v8f16, Expand); + setOperationAction(ISD::FADD, MVT::v8f16, Expand); + setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); + setOperationAction(ISD::FCOS, MVT::v8f16, Expand); + setOperationAction(ISD::FDIV, MVT::v8f16, Expand); + setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); + setOperationAction(ISD::FMA, MVT::v8f16, Expand); + setOperationAction(ISD::FMUL, MVT::v8f16, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); + setOperationAction(ISD::FNEG, MVT::v8f16, Expand); + setOperationAction(ISD::FPOW, MVT::v8f16, Expand); + setOperationAction(ISD::FPOWI, MVT::v8f16, Expand); + setOperationAction(ISD::FREM, MVT::v8f16, Expand); + setOperationAction(ISD::FROUND, MVT::v8f16, Expand); + setOperationAction(ISD::FRINT, MVT::v8f16, Expand); + setOperationAction(ISD::FSIN, MVT::v8f16, Expand); + setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand); + setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); + setOperationAction(ISD::FSUB, MVT::v8f16, Expand); + setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); + setOperationAction(ISD::SETCC, MVT::v8f16, Expand); + setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); + setOperationAction(ISD::SELECT, MVT::v8f16, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); + setOperationAction(ISD::FEXP, MVT::v8f16, Expand); + setOperationAction(ISD::FEXP2, MVT::v8f16, Expand); + setOperationAction(ISD::FLOG, MVT::v8f16, Expand); + setOperationAction(ISD::FLOG2, MVT::v8f16, Expand); + setOperationAction(ISD::FLOG10, MVT::v8f16, Expand); + // AArch64 has implementations of a lot of rounding-like FP operations. 
static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { @@ -303,13 +387,24 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } + // Make floating-point constants legal for the large code model, so they don't + // become loads from the constant pool. + if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + } + // AArch64 does not have floating-point extending loads, i1 sign-extending // load, floating-point truncating stores, or v2i32->v2i16 truncating store. - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + for (MVT VT : MVT::fp_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + } + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); @@ -439,30 +534,31 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); + // Custom handling for some quad-vector types to detect MULL. + setOperationAction(ISD::MUL, MVT::v8i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. 
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { - - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Expand); - - setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); - - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + setOperationAction(ISD::BSWAP, VT, Expand); + + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } } // AArch64 has implementations of a lot of rounding-like FP operations. @@ -477,16 +573,20 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM) setOperationAction(ISD::FROUND, Ty, Legal); } } + + // Prefer likely predicted branches to selects on out-of-order cores. + if (Subtarget->isCortexA57()) + PredictableSelectIsExpensive = true; } void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { - if (VT == MVT::v2f32) { + if (VT == MVT::v2f32 || VT == MVT::v4f16) { setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); - } else if (VT == MVT::v2f64 || VT == MVT::v4f32) { + } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); @@ -523,7 +623,8 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); - setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); + for (MVT InnerVT : MVT::all_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); // CNT supports only B element sizes. 
if (VT != MVT::v8i8 && VT != MVT::v16i8) @@ -727,6 +828,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; + case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; @@ -756,6 +858,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; + case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; + case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; } } @@ -774,7 +878,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const TargetInstrInfo *TII = + getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); @@ -1020,6 +1125,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + SDValue Cmp; + AArch64CC::CondCode AArch64CC; if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1051,9 +1158,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, break; case ISD::SETLE: case ISD::SETGT: - if ((VT == MVT::i32 && C != 0x7fffffff && + if ((VT == MVT::i32 && C != INT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || - (VT == MVT::i64 && C != 0x7ffffffffffffffULL && + (VT == MVT::i64 && C != INT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; @@ -1062,9 +1169,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, break; case ISD::SETULE: case ISD::SETUGT: - if ((VT == MVT::i32 && C != 0xffffffff && + if ((VT == MVT::i32 && C != UINT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || - (VT == MVT::i64 && C != 0xfffffffffffffffULL && + (VT == MVT::i64 && C != UINT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; @@ -1074,9 +1181,45 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } } - - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the -1 + // constant. 
For example,
+ //   movz w1, #65535
+ //   ldrh w0, [x0, #0]
+ //   cmp w0, w1
+ //   >
+ //   ldrsh w0, [x0, #0]
+ //   cmn w0, #1
+ // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
+ // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
+ // both the LHS and RHS are truly zero extended and to make sure the
+ // transformation is profitable.
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+   if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
+       isa<LoadSDNode>(LHS)) {
+     if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+         cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+         LHS.getNode()->hasNUsesOfValue(1, 0)) {
+       int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+       if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+         SDValue SExt =
+             DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+                         DAG.getValueType(MVT::i16));
+         Cmp = emitComparison(SExt,
+                              DAG.getConstant(ValueofRHS, RHS.getValueType()),
+                              CC, dl, DAG);
+         AArch64CC = changeIntCCToAArch64CC(CC);
+         AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+         return Cmp;
+       }
+     }
+   }
+ }
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
  AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
  return Cmp;
}
@@ -1333,8 +1476,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- // The data thing is not used.
- // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
  bool IsStream = !Locality;
  // When the locality number is set
@@ -1349,6 +1491,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  // build the mask value encoding the expected behavior.
  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
+                  (!IsData << 3) |     // IsDataCache bit
                   (Locality << 1) |    // Cache level bits
                   (unsigned)IsStream;  // Stream bit
  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
@@ -1400,7 +1543,10 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  if (VT.getSizeInBits() > InVT.getSizeInBits()) {
    SDLoc dl(Op);
-   SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+   MVT ExtVT =
+       MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                        VT.getVectorNumElements());
+   SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
  }
@@ -1505,7 +1651,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
  (ArgVT == MVT::f64) ?
"__sincos_stret" : "__sincosf_stret"; SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); - StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); @@ -1529,6 +1675,197 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 0); } +static EVT getExtensionTo64Bits(const EVT &OrigVT) { + if (OrigVT.getSizeInBits() >= 64) + return OrigVT; + + assert(OrigVT.isSimple() && "Expecting a simple value type"); + + MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; + switch (OrigSimpleTy) { + default: llvm_unreachable("Unexpected Vector Type"); + case MVT::v2i8: + case MVT::v2i16: + return MVT::v2i32; + case MVT::v4i8: + return MVT::v4i16; + } +} + +static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, + const EVT &OrigTy, + const EVT &ExtTy, + unsigned ExtOpcode) { + // The vector originally had a size of OrigTy. It was then extended to ExtTy. + // We expect the ExtTy to be 128-bits total. If the OrigTy is less than + // 64-bits we need to insert a new extension so that it will be 64-bits. + assert(ExtTy.is128BitVector() && "Unexpected extension size"); + if (OrigTy.getSizeInBits() >= 64) + return N; + + // Must extend size to at least 64 bits to be used as an operand for VMULL. + EVT NewVT = getExtensionTo64Bits(OrigTy); + + return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); +} + +static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, + bool isSigned) { + EVT VT = N->getValueType(0); + + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + SDNode *Elt = N->getOperand(i).getNode(); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + unsigned HalfSize = EltSize / 2; + if (isSigned) { + if (!isIntN(HalfSize, C->getSExtValue())) + return false; + } else { + if (!isUIntN(HalfSize, C->getZExtValue())) + return false; + } + continue; + } + return false; + } + + return true; +} + +static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) + return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, + N->getOperand(0)->getValueType(0), + N->getValueType(0), + N->getOpcode()); + + assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); + EVT VT = N->getValueType(0); + unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; + unsigned NumElts = VT.getVectorNumElements(); + MVT TruncVT = MVT::getIntegerVT(EltSize); + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i != NumElts; ++i) { + ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); + const APInt &CInt = C->getAPIntValue(); + // Element types smaller than 32 bits are not legal, so use i32 elements. + // The values are implicitly truncated so sext vs. zext doesn't matter. 
+ Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); + } + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), + MVT::getVectorVT(TruncVT, NumElts), Ops); +} + +static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::SIGN_EXTEND) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, true)) + return true; + return false; +} + +static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() == ISD::ZERO_EXTEND) + return true; + if (isExtendedBUILD_VECTOR(N, DAG, false)) + return true; + return false; +} + +static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isSignExtended(N0, DAG) && isSignExtended(N1, DAG); + } + return false; +} + +static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::ADD || Opcode == ISD::SUB) { + SDNode *N0 = N->getOperand(0).getNode(); + SDNode *N1 = N->getOperand(1).getNode(); + return N0->hasOneUse() && N1->hasOneUse() && + isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); + } + return false; +} + +static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { + // Multiplications are only custom-lowered for 128-bit vectors so that + // VMULL can be detected. Otherwise v2i64 multiplications are not legal. + EVT VT = Op.getValueType(); + assert(VT.is128BitVector() && VT.isInteger() && + "unexpected type for custom-lowering ISD::MUL"); + SDNode *N0 = Op.getOperand(0).getNode(); + SDNode *N1 = Op.getOperand(1).getNode(); + unsigned NewOpc = 0; + bool isMLA = false; + bool isN0SExt = isSignExtended(N0, DAG); + bool isN1SExt = isSignExtended(N1, DAG); + if (isN0SExt && isN1SExt) + NewOpc = AArch64ISD::SMULL; + else { + bool isN0ZExt = isZeroExtended(N0, DAG); + bool isN1ZExt = isZeroExtended(N1, DAG); + if (isN0ZExt && isN1ZExt) + NewOpc = AArch64ISD::UMULL; + else if (isN1SExt || isN1ZExt) { + // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these + // into (s/zext A * s/zext C) + (s/zext B * s/zext C) + if (isN1SExt && isAddSubSExt(N0, DAG)) { + NewOpc = AArch64ISD::SMULL; + isMLA = true; + } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { + NewOpc = AArch64ISD::UMULL; + isMLA = true; + } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { + std::swap(N0, N1); + NewOpc = AArch64ISD::UMULL; + isMLA = true; + } + } + + if (!NewOpc) { + if (VT == MVT::v2i64) + // Fall through to expand this. It is not legal. + return SDValue(); + else + // Other vector multiplications are legal. + return Op; + } + } + + // Legalize to a S/UMULL instruction + SDLoc DL(Op); + SDValue Op0; + SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); + if (!isMLA) { + Op0 = skipExtensionForVectorMULL(N0, DAG); + assert(Op0.getValueType().is64BitVector() && + Op1.getValueType().is64BitVector() && + "unexpected types for extended operands to VMULL"); + return DAG.getNode(NewOpc, DL, VT, Op0, Op1); + } + // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during + // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 
+ // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 + SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); + SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); + EVT Op1VT = Op1.getValueType(); + return DAG.getNode(N0->getOpcode(), DL, VT, + DAG.getNode(NewOpc, DL, VT, + DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), + DAG.getNode(NewOpc, DL, VT, + DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); +} SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -1629,6 +1966,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFP_TO_INT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::MUL: + return LowerMUL(Op, DAG); } } @@ -1643,8 +1982,7 @@ unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { #include "AArch64GenCallingConv.inc" -/// Selects the correct CCAssignFn for a the given CallingConvention -/// value. +/// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { @@ -1669,8 +2007,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // At this point, Ins[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and @@ -1774,10 +2112,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; + unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; uint32_t BEAlign = 0; - if (ArgSize < 8 && !Subtarget->isLittleEndian()) + if (!Subtarget->isLittleEndian() && ArgSize < 8 && + !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); @@ -1809,7 +2148,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(FI), - MemVT, false, false, false, nullptr); + MemVT, false, false, false, 0); InVals.push_back(ArgValue); } @@ -1941,8 +2280,8 @@ SDValue AArch64TargetLowering::LowerCallResult( : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); // Copy all of the result registers out of their specified physreg. @@ -2011,6 +2350,21 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; } + // Externally-defined functions with weak linkage should not be + // tail-called on AArch64 when the OS does not support dynamic + // pre-emption of symbols, as the AAELF spec requires normal calls + // to undefined weak functions to be replaced with a NOP or jump to the + // next instruction. 
The behaviour of branch instructions in this + // situation (as used for tail calls) is implementation-defined, so we + // cannot rely on the linker replacing the tail call with a return. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + const Triple TT(getTargetMachine().getTargetTriple()); + if (GV->hasExternalWeakLinkage() && + (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) + return false; + } + // Now we search for cases where we can use a tail call without changing the // ABI. Sibcall is used in some places (particularly gcc) to refer to this // concept. @@ -2028,8 +2382,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -2041,13 +2395,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, + *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, + *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); if (RVLocs1.size() != RVLocs2.size()) @@ -2072,8 +2426,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return true; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); @@ -2170,8 +2524,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); if (IsVarArg) { // Handle fixed and variable vector arguments differently. @@ -2316,9 +2670,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; unsigned OpSize = Flags.isByVal() ? 
Flags.getByValSize() * 8 - : VA.getLocVT().getSizeInBits(); + : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; - if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { + if (!Subtarget->isLittleEndian() && !Flags.isByVal() && + !Flags.isInConsecutiveRegs()) { if (OpSize < 8) BEAlign = 8 - OpSize; } @@ -2350,8 +2705,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); SDValue Cpy = DAG.getMemcpy( Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), - /*isVolatile = */ false, - /*alwaysInline = */ false, DstInfo, MachinePointerInfo()); + /*isVol = */ false, + /*AlwaysInline = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { @@ -2440,7 +2795,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(TRI); if (IsThisReturn) { @@ -2494,7 +2850,7 @@ bool AArch64TargetLowering::CanLowerReturn( ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); } @@ -2508,8 +2864,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC); // Copy the result values into the output registers. 
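As a rough illustration of the extern-weak restriction added to isEligibleForTailCallOptimization above, consider the hypothetical C++ snippet below (weak_hook and maybe_notify are invented names, not part of the imported sources). For a normal call, the linker may resolve an undefined weak symbol by relaxing the bl to a NOP and execution simply continues in the caller; a tail call would instead be lowered to a bare b weak_hook, whose behaviour after such relaxation is implementation-defined, so the backend must fall back to an ordinary call.

    // Hypothetical example only; not part of this patch.
    extern "C" void weak_hook() __attribute__((weak));  // may stay undefined at link time

    extern "C" void maybe_notify() {
      if (&weak_hook)   // the weak symbol can resolve to null at run time
        weak_hook();    // in tail position: a sibcall would emit "b weak_hook",
                        // which cannot be safely relaxed if weak_hook is undefined
    }
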
@@ -2560,7 +2916,8 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); SDLoc DL(Op); - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); + const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); @@ -2575,6 +2932,25 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } + if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { + assert(getTargetMachine().getCodeModel() == CodeModel::Small && + "use of MO_CONSTPOOL only supported on small model"); + SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; + SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); + SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(), + /*isVolatile=*/ false, + /*isNonTemporal=*/ true, + /*isInvariant=*/ true, 8); + if (GN->getOffset() != 0) + return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, + DAG.getConstant(GN->getOffset(), PtrVT)); + return GlobalAddr; + } + if (getTargetMachine().getCodeModel() == CodeModel::Large) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( @@ -2651,7 +3027,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(TRI); const uint32_t *Mask = ARI->getTLSCallPreservedMask(); @@ -2701,7 +3078,8 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterInfo *TRI = + getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(TRI); const uint32_t *Mask = ARI->getTLSCallPreservedMask(); @@ -2916,11 +3294,6 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); } @@ -2936,18 +3309,29 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBNZ only operates on i64's, but the ext should be free. 
- if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); + } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { + // Don't combine AND since emitComparison converts the AND to an ANDS + // (a.k.a. TST) and the test in the test bit and branch instruction + // becomes redundant. This would also increase register pressure. + uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; + return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, + DAG.getConstant(Mask, MVT::i64), Dest); } } + if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && + LHS.getOpcode() != ISD::AND) { + // Don't combine AND since emitComparison converts the AND to an ANDS + // (a.k.a. TST) and the test in the test bit and branch instruction + // becomes redundant. This would also increase register pressure. + uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; + return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, + DAG.getConstant(Mask, MVT::i64), Dest); + } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); @@ -3062,6 +3446,9 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) return SDValue(); + if (!Subtarget->hasNEON()) + return SDValue(); + // While there is no integer popcount instruction, it can // be more efficiently lowered to the following sequence that uses // AdvSIMD registers/instructions as long as the copies to/from @@ -4013,8 +4400,10 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint( return; case 'J': { uint64_t NVal = -C->getSExtValue(); - if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) + if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { + CVal = C->getSExtValue(); break; + } return; } // The K and L constraints apply *only* to logical immediates, including @@ -4138,10 +4527,30 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); - SmallVector<SDValue, 2> SourceVecs; - SmallVector<unsigned, 2> MinElts; - SmallVector<unsigned, 2> MaxElts; + struct ShuffleSourceInfo { + SDValue Vec; + unsigned MinElt; + unsigned MaxElt; + + // We may insert some combination of BITCASTs and VEXT nodes to force Vec to + // be compatible with the shuffle we intend to construct. As a result + // ShuffleVec will be some sliding window into the original Vec. + SDValue ShuffleVec; + + // Code should guarantee that element i in Vec starts at element "WindowBase + // + i * WindowScale in ShuffleVec". + int WindowBase; + int WindowScale; + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } + ShuffleSourceInfo(SDValue Vec) + : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), + WindowScale(1) {} + }; + // First gather all vectors used as an immediate source for this BUILD_VECTOR + // node. + SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) @@ -4152,133 +4561,155 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, return SDValue(); } - // Record this extraction against the appropriate vector if possible... + // Add this element source to the list if it's not already there. 
SDValue SourceVec = V.getOperand(0); - unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } + auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + if (Source == Sources.end()) + Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } + // Update the minimum and maximum lane number seen. + unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); + Source->MinElt = std::min(Source->MinElt, EltNo); + Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) + // are involved. + if (Sources.size() > 2) return SDValue(); - SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = { 0, 0 }; - int OffsetMultipliers[2] = { 1, 1 }; - - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. - for (unsigned i = 0; i < SourceVecs.size(); ++i) { - unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements(); - SDValue CurSource = SourceVecs[i]; - if (SourceVecs[i].getValueType().getVectorElementType() != - VT.getVectorElementType()) { - // It may hit this case if SourceVecs[i] is AssertSext/AssertZext. - // Then bitcast it to the vector which holds asserted element type, - // and record the multiplier of element width between SourceVecs and - // Build_vector which is needed to extract the correct lanes later. - EVT CastVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - SourceVecs[i].getValueSizeInBits() / - VT.getVectorElementType().getSizeInBits()); - - CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]); - OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts; - NumSrcElts *= OffsetMultipliers[i]; - MaxElts[i] *= OffsetMultipliers[i]; - MinElts[i] *= OffsetMultipliers[i]; + // Find out the smallest element size among result and two sources, and use + // it as element size to build the shuffle_vector. + EVT SmallestEltTy = VT.getVectorElementType(); + for (auto &Source : Sources) { + EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); + if (SrcEltTy.bitsLT(SmallestEltTy)) { + SmallestEltTy = SrcEltTy; } + } + unsigned ResMultiplier = + VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); + EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); - if (CurSource.getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = CurSource; - VEXTOffsets[i] = 0; + // If the source vector is too wide or too narrow, we may nevertheless be able + // to construct a compatible shuffle either by concatenating it with UNDEF or + // extracting a suitable range of elements. 
+ for (auto &Src : Sources) { + EVT SrcVT = Src.ShuffleVec.getValueType(); + + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; - } else if (NumSrcElts < NumElts) { + + // This stage of the search produces a source with the same element type as + // the original, but with a total width matching the BUILD_VECTOR output. + EVT EltVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); + + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); // We can pad out the smaller vector for free, so if it's part of a // shuffle... - ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource, - DAG.getUNDEF(CurSource.getValueType())); + Src.ShuffleVec = + DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, + DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... - assert(NumSrcElts == 2 * NumElts && - "unexpected vector sizes in ReconstructShuffle"); + assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); - if (MaxElts[i] - MinElts[i] >= NumElts) { + if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } - if (MinElts[i] >= NumElts) { + if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, - DAG.getIntPtrConstant(NumElts)); - } else if (MaxElts[i] < NumElts) { + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, MVT::i64)); + Src.WindowBase = -NumSrcElts; + } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, - DAG.getIntPtrConstant(0)); + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, MVT::i64)); } else { // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, - DAG.getIntPtrConstant(0)); - SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource, - DAG.getIntPtrConstant(NumElts)); - unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); - ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(Imm, MVT::i32)); + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, MVT::i64)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, MVT::i64)); + unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); + + Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, + VEXTSrc2, DAG.getConstant(Imm, MVT::i32)); + Src.WindowBase = -Src.MinElt; } } - SmallVector<int, 8> Mask; - - for (unsigned i = 0; i < NumElts; ++i) { + // Another possible incompatibility occurs from the vector element types. We + // can fix this by bitcasting the source vectors to the same type we intend + // for the shuffle. 
+ for (auto &Src : Sources) { + EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); + if (SrcEltTy == SmallestEltTy) + continue; + assert(ShuffleVT.getVectorElementType() == SmallestEltTy); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); + Src.WindowBase *= Src.WindowScale; + } + + // Final sanity check before we try to actually produce a shuffle. + DEBUG( + for (auto Src : Sources) + assert(Src.ShuffleVec.getValueType() == ShuffleVT); + ); + + // The stars all align, our next step is to produce the mask for the shuffle. + SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); + int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); + if (Entry.getOpcode() == ISD::UNDEF) continue; - } - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = - cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts - - VEXTOffsets[1]); - } + auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); + + // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit + // trunc. So only std::min(SrcBits, DestBits) actually get defined in this + // segment. + EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), + VT.getVectorElementType().getSizeInBits()); + int LanesDefined = BitsDefined / BitsPerShuffleLane; + + // This source is expected to fill ResMultiplier lanes of the final shuffle, + // starting at the appropriate offset. + int *LaneMask = &Mask[i * ResMultiplier]; + + int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; + ExtractBase += NumElts * (Src - Sources.begin()); + for (int j = 0; j < LanesDefined; ++j) + LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... 
- if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); + if (!isShuffleMaskLegal(Mask, ShuffleVT)) + return SDValue(); - return SDValue(); + SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; + for (unsigned i = 0; i < Sources.size(); ++i) + ShuffleOps[i] = Sources[i].ShuffleVec; + + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], &Mask[0]); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } // check if an EXT instruction can handle the shuffle mask when the @@ -4607,7 +5038,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, VT.getVectorElementType() == MVT::f32) return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 - if (VT.getVectorElementType() == MVT::i16) + if (VT.getVectorElementType() == MVT::i16 || + VT.getVectorElementType() == MVT::f16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); @@ -4727,7 +5159,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; - if (EltType == MVT::i16) + if (EltType == MVT::i16 || EltType == MVT::f16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; @@ -4857,7 +5289,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); EVT ScalarVT = VT.getVectorElementType(); - if (ScalarVT.getSizeInBits() < 32) + + if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) ScalarVT = MVT::i32; return DAG.getNode( @@ -4945,7 +5378,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { @@ -4954,7 +5387,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { @@ -4963,7 +5396,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { @@ -4972,7 +5405,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { @@ -4981,7 +5414,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if 
(AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { @@ -4990,7 +5423,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } @@ -5145,7 +5578,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { @@ -5154,7 +5587,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { @@ -5163,7 +5596,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { @@ -5172,7 +5605,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { @@ -5181,7 +5614,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { @@ -5190,7 +5623,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } @@ -5263,13 +5696,13 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, if (VT.getSizeInBits() == 128) { SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // Support the V64 version via subregister insertion. 
SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { @@ -5278,7 +5711,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { @@ -5287,7 +5720,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { @@ -5296,7 +5729,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { @@ -5305,7 +5738,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { @@ -5314,7 +5747,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { @@ -5323,7 +5756,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { @@ -5332,7 +5765,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(264, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { @@ -5341,7 +5774,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(272, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { @@ -5349,7 +5782,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // The few faces of FMOV... 
@@ -5358,7 +5791,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && @@ -5366,7 +5799,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // The many faces of MVNI... @@ -5377,7 +5810,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { @@ -5386,7 +5819,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { @@ -5395,7 +5828,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { @@ -5404,7 +5837,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { @@ -5413,7 +5846,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { @@ -5422,7 +5855,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { @@ -5431,7 +5864,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(264, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { @@ -5440,7 +5873,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(272, MVT::i32)); - return 
DAG.getNode(ISD::BITCAST, dl, VT, Mov); + return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } @@ -5616,11 +6049,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || + VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform insertion by expanding the value @@ -5649,11 +6083,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || + VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32) + VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform extraction by expanding the value @@ -6187,7 +6622,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat) && (memOpAlign(SrcAlign, DstAlign, 16) || - (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast))) + (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) return MVT::f128; return Size >= 8 ? MVT::i64 : MVT::i32; @@ -6382,6 +6817,48 @@ static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, return performIntegerAbsCombine(N, DAG); } +SDValue +AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + std::vector<SDNode *> *Created) const { + // fold (sdiv X, pow2) + EVT VT = N->getValueType(0); + if ((VT != MVT::i32 && VT != MVT::i64) || + !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + unsigned Lg2 = Divisor.countTrailingZeros(); + SDValue Zero = DAG.getConstant(0, VT); + SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT); + + // Add (N0 < 0) ? Pow2 - 1 : 0; + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); + SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); + + if (Created) { + Created->push_back(Cmp.getNode()); + Created->push_back(Add.getNode()); + Created->push_back(CSel.getNode()); + } + + // Divide by pow2. + SDValue SRA = + DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64)); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (Divisor.isNonNegative()) + return SRA; + + if (Created) + Created->push_back(SRA.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA); +} + static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -6459,14 +6936,14 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); - // Now check that the other operand of the AND is a constant splat. 
We could + // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { - // Bail out if the vector isn't a constant splat. - if (!BV->getConstantSplatNode()) + // Bail out if the vector isn't a constant. + if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. @@ -6486,7 +6963,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, return SDValue(); } -static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); @@ -6505,7 +6983,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. // This eliminates an "integer-to-vector-move UOP and improve throughput. SDValue N0 = N->getOperand(0); - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); @@ -7266,11 +7744,11 @@ static SDValue performExtendCombine(SDNode *N, // If the vector type isn't a simple VT, it's beyond the scope of what // we're worried about here. Let legalization do its thing and hope for // the best. - if (!ResVT.isSimple()) + SDValue Src = N->getOperand(0); + EVT SrcVT = Src->getValueType(0); + if (!ResVT.isSimple() || !SrcVT.isSimple()) return SDValue(); - SDValue Src = N->getOperand(0); - MVT SrcVT = Src->getValueType(0).getSimpleVT(); // If the source VT is a 64-bit vector, we can play games and get the // better results we want. 
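Stepping back to the BuildSDIVPow2 hook introduced above: the CMP/CSEL/ASR sequence it emits is the usual round-toward-zero trick for signed division by a power of two, with a trailing negate when the divisor is negative. A host-side sketch of the same arithmetic, assuming arithmetic right shift on signed types (which mainstream compilers provide):

#include <cassert>
#include <cstdint>

int64_t sdiv_pow2(int64_t x, unsigned lg2, bool negativeDivisor) {
  int64_t bias = (x < 0) ? ((int64_t(1) << lg2) - 1) : 0; // the CSEL operand
  int64_t q = (x + bias) >> lg2;                          // the SRA
  return negativeDivisor ? -q : q;                        // the final SUB
}

int main() {
  assert(sdiv_pow2(-7, 2, false) == -7 / 4);  // -1, not -2
  assert(sdiv_pow2(7, 2, true) == 7 / -4);    // -1
  assert(sdiv_pow2(8, 3, false) == 1);
}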
if (SrcVT.getSizeInBits() != 64) @@ -7294,9 +7772,9 @@ static SDValue performExtendCombine(SDNode *N, EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(0)); + DAG.getConstant(0, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(InNVT.getVectorNumElements())); + DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); @@ -7418,9 +7896,9 @@ static SDValue performSTORECombine(SDNode *N, EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(0)); + DAG.getConstant(0, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(NumElts)); + DAG.getConstant(NumElts, MVT::i64)); SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), @@ -7504,7 +7982,7 @@ static SDValue performPostLD1Combine(SDNode *N, Ops.push_back(Inc); EVT Tys[3] = { VT, MVT::i64, MVT::Other }; - SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3)); + SDVTList SDTys = DAG.getVTList(Tys); unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT, @@ -7634,7 +8112,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N, Tys[n] = VecTy; Tys[n++] = MVT::i64; // Type of write back register Tys[n] = MVT::Other; // Type of the chain - SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2)); + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, @@ -7655,10 +8133,272 @@ static SDValue performNEONPostLDSTCombine(SDNode *N, return SDValue(); } +// Checks to see if the value is the prescribed width and returns information +// about its extension mode. +static +bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { + ExtType = ISD::NON_EXTLOAD; + switch(V.getNode()->getOpcode()) { + default: + return false; + case ISD::LOAD: { + LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); + if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) + || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { + ExtType = LoadNode->getExtensionType(); + return true; + } + return false; + } + case ISD::AssertSext: { + VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); + if ((TypeNode->getVT() == MVT::i8 && width == 8) + || (TypeNode->getVT() == MVT::i16 && width == 16)) { + ExtType = ISD::SEXTLOAD; + return true; + } + return false; + } + case ISD::AssertZext: { + VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); + if ((TypeNode->getVT() == MVT::i8 && width == 8) + || (TypeNode->getVT() == MVT::i16 && width == 16)) { + ExtType = ISD::ZEXTLOAD; + return true; + } + return false; + } + case ISD::Constant: + case ISD::TargetConstant: { + if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < + 1LL << (width - 1)) + return true; + return false; + } + } + + return true; +} + +// This function does a whole lot of voodoo to determine if the tests are +// equivalent without and with a mask. 
Essentially what happens is that given a +// DAG resembling: +// +// +-------------+ +-------------+ +-------------+ +-------------+ +// | Input | | AddConstant | | CompConstant| | CC | +// +-------------+ +-------------+ +-------------+ +-------------+ +// | | | | +// V V | +----------+ +// +-------------+ +----+ | | +// | ADD | |0xff| | | +// +-------------+ +----+ | | +// | | | | +// V V | | +// +-------------+ | | +// | AND | | | +// +-------------+ | | +// | | | +// +-----+ | | +// | | | +// V V V +// +-------------+ +// | CMP | +// +-------------+ +// +// The AND node may be safely removed for some combinations of inputs. In +// particular we need to take into account the extension type of the Input, +// the exact values of AddConstant, CompConstant, and CC, along with the nominal +// width of the input (this can work for any width inputs, the above graph is +// specific to 8 bits. +// +// The specific equations were worked out by generating output tables for each +// AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The +// problem was simplified by working with 4 bit inputs, which means we only +// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero +// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 +// patterns present in both extensions (0,7). For every distinct set of +// AddConstant and CompConstants bit patterns we can consider the masked and +// unmasked versions to be equivalent if the result of this function is true for +// all 16 distinct bit patterns of for the current extension type of Input (w0). +// +// sub w8, w0, w1 +// and w10, w8, #0x0f +// cmp w8, w2 +// cset w9, AArch64CC +// cmp w10, w2 +// cset w11, AArch64CC +// cmp w9, w11 +// cset w0, eq +// ret +// +// Since the above function shows when the outputs are equivalent it defines +// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and +// would be expensive to run during compiles. The equations below were written +// in a test harness that confirmed they gave equivalent outputs to the above +// for all inputs function, so they can be used determine if the removal is +// legal instead. +// +// isEquivalentMaskless() is the code for testing if the AND can be removed +// factored out of the DAG recognition as the DAG can take several forms. + +static +bool isEquivalentMaskless(unsigned CC, unsigned width, + ISD::LoadExtType ExtType, signed AddConstant, + signed CompConstant) { + // By being careful about our equations and only writing the in term + // symbolic values and well known constants (0, 1, -1, MaxUInt) we can + // make them generally applicable to all bit widths. + signed MaxUInt = (1 << width); + + // For the purposes of these comparisons sign extending the type is + // equivalent to zero extending the add and displacing it by half the integer + // width. Provided we are careful and make sure our equations are valid over + // the whole range we can just adjust the input and avoid writing equations + // for sign extended inputs. 
+ if (ExtType == ISD::SEXTLOAD) + AddConstant -= (1 << (width-1)); + + switch(CC) { + case AArch64CC::LE: + case AArch64CC::GT: { + if ((AddConstant == 0) || + (CompConstant == MaxUInt - 1 && AddConstant < 0) || + (AddConstant >= 0 && CompConstant < 0) || + (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) + return true; + } break; + case AArch64CC::LT: + case AArch64CC::GE: { + if ((AddConstant == 0) || + (AddConstant >= 0 && CompConstant <= 0) || + (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) + return true; + } break; + case AArch64CC::HI: + case AArch64CC::LS: { + if ((AddConstant >= 0 && CompConstant < 0) || + (AddConstant <= 0 && CompConstant >= -1 && + CompConstant < AddConstant + MaxUInt)) + return true; + } break; + case AArch64CC::PL: + case AArch64CC::MI: { + if ((AddConstant == 0) || + (AddConstant > 0 && CompConstant <= 0) || + (AddConstant < 0 && CompConstant <= AddConstant)) + return true; + } break; + case AArch64CC::LO: + case AArch64CC::HS: { + if ((AddConstant >= 0 && CompConstant <= 0) || + (AddConstant <= 0 && CompConstant >= 0 && + CompConstant <= AddConstant + MaxUInt)) + return true; + } break; + case AArch64CC::EQ: + case AArch64CC::NE: { + if ((AddConstant > 0 && CompConstant < 0) || + (AddConstant < 0 && CompConstant >= 0 && + CompConstant < AddConstant + MaxUInt) || + (AddConstant >= 0 && CompConstant >= 0 && + CompConstant >= AddConstant) || + (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) + + return true; + } break; + case AArch64CC::VS: + case AArch64CC::VC: + case AArch64CC::AL: + case AArch64CC::NV: + return true; + case AArch64CC::Invalid: + break; + } + + return false; +} + +static +SDValue performCONDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, unsigned CCIndex, + unsigned CmpIndex) { + unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); + SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); + unsigned CondOpcode = SubsNode->getOpcode(); + + if (CondOpcode != AArch64ISD::SUBS) + return SDValue(); + + // There is a SUBS feeding this condition. Is it fed by a mask we can + // use? + + SDNode *AndNode = SubsNode->getOperand(0).getNode(); + unsigned MaskBits = 0; + + if (AndNode->getOpcode() != ISD::AND) + return SDValue(); + + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { + uint32_t CNV = CN->getZExtValue(); + if (CNV == 255) + MaskBits = 8; + else if (CNV == 65535) + MaskBits = 16; + } + + if (!MaskBits) + return SDValue(); + + SDValue AddValue = AndNode->getOperand(0); + + if (AddValue.getOpcode() != ISD::ADD) + return SDValue(); + + // The basic dag structure is correct, grab the inputs and validate them. + + SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); + SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); + SDValue SubsInputValue = SubsNode->getOperand(1); + + // The mask is present and the provenance of all the values is a smaller type, + // lets see if the mask is superfluous. 
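The equations in isEquivalentMaskless above were, per the comment, validated with a brute-force harness over 4-bit inputs. The following is not that harness, only a sketch of the methodology restricted to a single condition code (signed LT); the constants in main() are arbitrary examples:

#include <cstdint>
#include <cstdio>

static bool maskIsRedundantForLT(int32_t addC, int32_t compC, bool sExt) {
  for (int i = 0; i < 16; ++i) {                    // every 4-bit pattern of w0
    int32_t in = sExt ? (i >= 8 ? i - 16 : i) : i;  // sign- or zero-extended
    int32_t sum = in + addC;                        // add w8, w0, w1
    int32_t masked = sum & 0x0f;                    // and w10, w8, #0x0f
    if ((sum < compC) != (masked < compC))          // cmp ...; cset ..., lt
      return false;
  }
  return true;
}

int main() {
  printf("%d\n", maskIsRedundantForLT(0, 3, false));   // 1: the mask never changes the result
  printf("%d\n", maskIsRedundantForLT(-2, 5, false));  // 0: the wrapped value at i==0 disagrees
}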
+ + if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || + !isa<ConstantSDNode>(SubsInputValue.getNode())) + return SDValue(); + + ISD::LoadExtType ExtType; + + if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || + !checkValueWidth(AddInputValue2, MaskBits, ExtType) || + !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) + return SDValue(); + + if(!isEquivalentMaskless(CC, MaskBits, ExtType, + cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), + cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) + return SDValue(); + + // The AND is not necessary, remove it. + + SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), + SubsNode->getValueType(1)); + SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; + + SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); + DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); + + return SDValue(N, 0); +} + // Optimize compare with zero and branch. static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); + if (NV.getNode()) + N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue CCVal = N->getOperand(2); @@ -7747,21 +8487,29 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT ResVT = N->getValueType(0); - if (!N->getOperand(1).getValueType().isVector()) + if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) return SDValue(); - if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) + // If NumMaskElts == 0, the comparison is larger than select result. The + // largest real NEON comparison is 64-bits per lane, which means the result is + // at most 32-bits and an illegal vector. Just bail out for now. + EVT SrcVT = N0.getOperand(0).getValueType(); + + // Don't try to do this optimization when the setcc itself has i1 operands. + // There are no legal vectors of i1, so this would be pointless. + if (SrcVT == MVT::i1) return SDValue(); - SDLoc DL(N0); + int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); + if (!ResVT.isVector() || NumMaskElts == 0) + return SDValue(); - EVT SrcVT = N0.getOperand(0).getValueType(); - SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, - ResVT.getSizeInBits() / SrcVT.getSizeInBits()); + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); // First perform a vector comparison, where lane 0 is the one we're interested // in. + SDLoc DL(N0); SDValue LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); SDValue RHS = @@ -7771,8 +8519,8 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { // Now duplicate the comparison mask we want across all other lanes. 
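The performSelectCombine path above ends by broadcasting the lane-0 comparison across the whole mask ("Now duplicate the comparison mask we want across all other lanes"). A plain-C++ sketch of the shape this produces, far removed from SelectionDAG but showing why the select can then run entirely in the vector unit:

#include <array>
#include <cstdio>

int main() {
  float a = 1.0f, b = 2.0f;                   // the scalar setcc operands
  std::array<int, 4> t = {1, 2, 3, 4}, f = {5, 6, 7, 8};
  std::array<int, 4> mask, out;
  mask.fill((a < b) ? -1 : 0);                // lane-0 compare, DUPed to all lanes
  for (int i = 0; i < 4; ++i)                 // BSL-style vector select
    out[i] = mask[i] ? t[i] : f[i];
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
}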
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); - Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), - Mask); + Mask = DAG.getNode(ISD::BITCAST, DL, + ResVT.changeVectorElementTypeToInteger(), Mask); return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } @@ -7792,7 +8540,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - return performIntToFpCombine(N, DAG); + return performIntToFpCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -7813,6 +8561,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::CSEL: + return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: return performPostLD1Combine(N, DCI, false); case ISD::INSERT_VECTOR_ELT: @@ -7968,13 +8718,12 @@ bool AArch64TargetLowering::getPostIndexedAddressParts( static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) { - if (N->getValueType(0) != MVT::i16) - return; - SDLoc DL(N); SDValue Op = N->getOperand(0); - assert(Op.getValueType() == MVT::f16 && - "Inconsistent bitcast? Only 16-bit types should be i16 or f16"); + + if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) + return; + Op = SDValue( DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op, @@ -8000,17 +8749,14 @@ void AArch64TargetLowering::ReplaceNodeResults( } } -bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { - // Loads and stores less than 128-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 128; +bool AArch64TargetLowering::useLoadStackGuardNode() const { + return true; +} - // For the real atomic operations, we have ldxr/stxr up to 128 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 128; +bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { + // Combine multiple FDIVs with the same divisor into multiple FMULs by the + // reciprocal if there are three or more FDIVs. + return NumUsers > 2; } TargetLoweringBase::LegalizeTypeAction @@ -8025,12 +8771,37 @@ AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { return TargetLoweringBase::getPreferredVectorAction(VT); } +// Loads and stores less than 128-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. +bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); + return Size == 128; +} + +// Loads and stores less than 128-bits are already atomic; ones above that +// are doomed anyway, so defer to the default libcall and blame the OS when +// things go wrong. 
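An aside on combineRepeatedFPDivisors a few hunks up: it asks for more than two users because the rewrite spends one division on the reciprocal and then replaces every original division with a multiply. A minimal sketch of the trade in plain C++; the actual combine typically fires only when relaxed FP semantics make the possible last-bit difference acceptable:

#include <cstdio>

int main() {
  double x = 1.0, y = 2.0, z = 3.0, d = 7.0;
  double a1 = x / d, b1 = y / d, c1 = z / d;   // three fdiv
  double r = 1.0 / d;                          // one fdiv...
  double a2 = x * r, b2 = y * r, c2 = z * r;   // ...then three cheaper fmul
  printf("%g %g %g vs %g %g %g\n", a1, b1, c1, a2, b2, c2);
}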
+bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + unsigned Size = LI->getType()->getPrimitiveSizeInBits(); + return Size == 128; +} + +// For the real atomic operations, we have ldxr/stxr up to 128 bits, +bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); + return Size <= 128; +} + +bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { + return true; +} + Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); - bool IsAcquire = - Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; + bool IsAcquire = isAtLeastAcquire(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a @@ -8065,8 +8836,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = - Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; + bool IsRelease = isAtLeastRelease(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form @@ -8093,3 +8863,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Val, Stxr->getFunctionType()->getParamType(0)), Addr); } + +bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { + return Ty->isArrayTy(); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index cb0b9ef..cc25bed 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H -#define LLVM_TARGET_AArch64_ISELLOWERING_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -162,6 +162,16 @@ enum { SITOF, UITOF, + /// Natural vector cast. ISD::BITCAST is not natural in the big-endian + /// world w.r.t vectors; which causes additional REV instructions to be + /// generated to compensate for the byte-swapping. But sometimes we do + /// need to re-interpret the data in SIMD vector registers in big-endian + /// mode without emitting such REV instructions. + NVCAST, + + SMULL, + UMULL, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -197,10 +207,9 @@ class AArch64TargetLowering : public TargetLowering { bool RequireStrictAlign; public: - explicit AArch64TargetLowering(TargetMachine &TM); + explicit AArch64TargetLowering(const TargetMachine &TM); - /// Selects the correct CCAssignFn for a the given CallingConvention - /// value. + /// Selects the correct CCAssignFn for a given CallingConvention value. 
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; /// computeKnownBitsForTargetNode - Determine which of the bits specified in @@ -212,10 +221,11 @@ public: MVT getScalarShiftAmountTy(EVT LHSTy) const override; - /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// allowsMisalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses. of the specified type. - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, - bool *Fast = nullptr) const override { + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, + unsigned Align = 1, + bool *Fast = nullptr) const override { if (RequireStrictAlign) return false; // FIXME: True for Cyclone, but not necessary others. @@ -317,13 +327,17 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicInIR(Instruction *Inst) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -424,6 +438,10 @@ private: SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + std::vector<SDNode *> *Created) const override; + bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + ConstraintType getConstraintType(const std::string &Constraint) const override; unsigned getRegisterByName(const char* RegName, EVT VT) const override; @@ -455,6 +473,10 @@ private: void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; + + bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, + CallingConv::ID CallConv, + bool isVarArg) const override; }; namespace AArch64 { @@ -464,4 +486,4 @@ FastISel *createFastISel(FunctionLoweringInfo &funcInfo, } // end namespace llvm -#endif // LLVM_TARGET_AArch64_ISELLOWERING_H +#endif diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 3b9e3c6..4923a11 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -29,8 +29,7 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; class acquiring_load<PatFrag base> : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected load ordering"); - return Ordering == Acquire || Ordering == SequentiallyConsistent; + return isAtLeastAcquire(Ordering); }]>; // An atomic load operation that does not need either acquire or release @@ -38,7 +37,7 @@ class acquiring_load<PatFrag base> class relaxed_load<PatFrag base> : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; + return !isAtLeastAcquire(Ordering); }]>; // 8-bit loads @@ -114,14 +113,14 @@ class releasing_store<PatFrag base> : 
PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); assert(Ordering != AcquireRelease && "unexpected store ordering"); - return Ordering == Release || Ordering == SequentiallyConsistent; + return isAtLeastRelease(Ordering); }]>; // An atomic store operation that doesn't actually need to be atomic on AArch64. class relaxed_store<PatFrag base> : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; + return !isAtLeastRelease(Ordering); }]>; // 8-bit stores diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index e88c0c0..d295c02 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -843,7 +843,7 @@ def MRSSystemRegisterOperand : AsmOperandClass { let ParserMethod = "tryParseSysReg"; let DiagnosticType = "MRS"; } -// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate. +// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate. def mrs_sysreg_op : Operand<i32> { let ParserMatchClass = MRSSystemRegisterOperand; let DecoderMethod = "DecodeMRSSystemRegister"; @@ -863,9 +863,8 @@ def msr_sysreg_op : Operand<i32> { class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; + bits<16> systemreg; + let Inst{20-5} = systemreg; } // FIXME: Some of these def NZCV, others don't. Best way to model that? @@ -873,9 +872,8 @@ class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), // would do it, but feels like overkill at this point. 
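The acquiring_load/releasing_store/relaxed_store predicates above, like the emitLoadLinked/emitStoreConditional changes earlier, now key off isAtLeastAcquire/isAtLeastRelease instead of spelling out the orderings. A minimal sketch of what those helpers classify, using a stand-in enum rather than LLVM's AtomicOrdering and mirroring the explicit checks the old code used:

enum class Ordering { Monotonic, Acquire, Release, AcquireRelease, SequentiallyConsistent };

static bool isAtLeastAcquire(Ordering o) {
  return o == Ordering::Acquire || o == Ordering::AcquireRelease ||
         o == Ordering::SequentiallyConsistent;
}

static bool isAtLeastRelease(Ordering o) {
  return o == Ordering::Release || o == Ordering::AcquireRelease ||
         o == Ordering::SequentiallyConsistent;
}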
class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), "msr", "\t$systemreg, $Rt"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; + bits<16> systemreg; + let Inst{20-5} = systemreg; } def SystemPStateFieldOperand : AsmOperandClass { @@ -1351,14 +1349,15 @@ class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype, } multiclass MulAccum<bit isSub, string asm, SDNode AccNode> { + // MADD/MSUB generation is decided by MachineCombiner.cpp def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm, - [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>, + [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>, Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 0; } def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm, - [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>, + [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>, Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 1; } @@ -1636,7 +1635,7 @@ class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype, multiclass AddSub<bit isSub, string mnemonic, SDPatternOperator OpNode = null_frag> { - let hasSideEffects = 0 in { + let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in { // Add/Subtract immediate def Wri : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32, mnemonic, OpNode> { @@ -1961,14 +1960,14 @@ class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype> multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, string Alias> { - let AddedComplexity = 6 in + let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic, [(set GPR32sp:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> { let Inst{31} = 0; let Inst{22} = 0; // 64-bit version has an additional bit of immediate. } - let AddedComplexity = 6 in + let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic, [(set GPR64sp:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> { @@ -2013,8 +2012,10 @@ class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode> // Split from LogicalImm as not all instructions have both. 
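The MRSI/MSRI changes above widen the system-register operand from a 15-bit field with bit 20 hard-wired to 1 into a full 16-bit field in Inst{20-5}. A quick host-side illustration of the new packing; the operand value is arbitrary:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t inst = 0;
  uint16_t systemreg = 0xdead;        // 16-bit immediate: op0, op1, CRn, CRm, op2
  inst |= uint32_t(systemreg) << 5;   // let Inst{20-5} = systemreg
  printf("%08x\n", (unsigned)inst);
}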
multiclass LogicalReg<bits<2> opc, bit N, string mnemonic, SDPatternOperator OpNode> { + let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>; def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>; + } def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic, [(set GPR32:$Rd, (OpNode GPR32:$Rn, @@ -2995,7 +2996,7 @@ class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, : BaseLoadStorePreIdx<sz, V, opc, (outs GPR64sp:$wback, regtype:$Rt), (ins GPR64sp:$Rn, simm9:$offset), asm, - "$Rn = $wback", []>, + "$Rn = $wback,@earlyclobber $wback", []>, Sched<[WriteLD, WriteAdr]>; let mayStore = 1, mayLoad = 0 in @@ -3004,7 +3005,7 @@ class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, : BaseLoadStorePreIdx<sz, V, opc, (outs GPR64sp:$wback), (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset), - asm, "$Rn = $wback", + asm, "$Rn = $wback,@earlyclobber $wback", [(set GPR64sp:$wback, (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>, Sched<[WriteAdr, WriteST]>; @@ -3014,7 +3015,6 @@ class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, // Load/store post-indexed //--- -// (pre-index) load/stores. class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops, string asm, string cstr, list<dag> pat> : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> { @@ -3042,7 +3042,7 @@ class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, : BaseLoadStorePostIdx<sz, V, opc, (outs GPR64sp:$wback, regtype:$Rt), (ins GPR64sp:$Rn, simm9:$offset), - asm, "$Rn = $wback", []>, + asm, "$Rn = $wback,@earlyclobber $wback", []>, Sched<[WriteLD, WriteI]>; let mayStore = 1, mayLoad = 0 in @@ -3051,7 +3051,7 @@ class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, : BaseLoadStorePostIdx<sz, V, opc, (outs GPR64sp:$wback), (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset), - asm, "$Rn = $wback", + asm, "$Rn = $wback,@earlyclobber $wback", [(set GPR64sp:$wback, (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>, Sched<[WriteAdr, WriteST, ReadAdrBase]>; @@ -3115,7 +3115,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype, // (pre-indexed) class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops, string asm> - : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback", []> { + : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> { bits<5> Rt; bits<5> Rt2; bits<5> Rn; @@ -3156,7 +3156,7 @@ class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype, class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops, string asm> - : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback", []> { + : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> { bits<5> Rt; bits<5> Rt2; bits<5> Rn; @@ -4383,7 +4383,7 @@ class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size, } multiclass SIMDVectorLShiftLongBySizeBHS { - let neverHasSideEffects = 1 in { + let hasSideEffects = 0 in { def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, "shll", ".8h", ".8b", "8">; def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, @@ -5260,6 +5260,10 @@ multiclass SIMDZipVector<bits<3>opc, string asm, def v2i64 : BaseSIMDZipVector<0b111, opc, V128, asm, ".2d", OpNode, v2i64>; + def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)), + (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>; + def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)), + 
(!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>; def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>; def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index ce85b2c..e582ed4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64MachineCombinerPattern.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -260,8 +261,9 @@ void AArch64InstrInfo::instantiateCondBranch( BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); } else { // Folded compare-and-branch + // Note that we use addOperand instead of addReg to keep the flags. const MachineInstrBuilder MIB = - BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); + BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]); if (Cond.size() > 3) MIB.addImm(Cond[3].getImm()); MIB.addMBB(TBB); @@ -541,6 +543,51 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, CC); } +// FIXME: this implementation should be micro-architecture dependent, so a +// micro-architecture target hook should be introduced here in future. +bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { + if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53()) + return MI->isAsCheapAsAMove(); + + switch (MI->getOpcode()) { + default: + return false; + + // add/sub on register without shift + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return (MI->getOperand(3).getImm() == 0); + + // logical ops on immediate + case AArch64::ANDWri: + case AArch64::ANDXri: + case AArch64::EORWri: + case AArch64::EORXri: + case AArch64::ORRWri: + case AArch64::ORRXri: + return true; + + // logical ops on register without shift + case AArch64::ANDWrr: + case AArch64::ANDXrr: + case AArch64::BICWrr: + case AArch64::BICXrr: + case AArch64::EONWrr: + case AArch64::EONXrr: + case AArch64::EORWrr: + case AArch64::EORXrr: + case AArch64::ORNWrr: + case AArch64::ORNXrr: + case AArch64::ORRWrr: + case AArch64::ORRXrr: + return true; + } + + llvm_unreachable("Unknown opcode to check as cheap as a move!"); +} + bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const { @@ -561,6 +608,42 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } } +bool +AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, + MachineInstr *MIb, + AliasAnalysis *AA) const { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + unsigned BaseRegA = 0, BaseRegB = 0; + int OffsetA = 0, OffsetB = 0; + int WidthA = 0, WidthB = 0; + + assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && + "MIa must be a store or a load"); + assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && + "MIb must be a store or a load"); + + if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() || + MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + return false; + + // Retrieve the base register, offset from the base register and width. Width + // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). 
If + // base registers are identical, and the offset of a lower memory access + + // the width doesn't overlap the offset of a higher memory access, + // then the memory accesses are different. + if (getLdStBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) && + getLdStBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) { + if (BaseRegA == BaseRegB) { + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; + if (LowOffset + LowWidth <= HighOffset) + return true; + } + } + return false; +} + /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. @@ -595,7 +678,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, SrcReg = MI->getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; - CmpValue = MI->getOperand(2).getImm(); + // FIXME: In order to convert CmpValue to 0 or 1 + CmpValue = (MI->getOperand(2).getImm() != 0); return true; case AArch64::ANDSWri: case AArch64::ANDSXri: @@ -604,9 +688,14 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, SrcReg = MI->getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; - CmpValue = AArch64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == AArch64::ANDSWri ? 32 : 64); + // FIXME:The return val type of decodeLogicalImmediate is uint64_t, + // while the type of CmpValue is int. When converting uint64_t to int, + // the high 32 bits of uint64_t will be lost. + // In fact it causes a bug in spec2006-483.xalancbmk + // CmpValue is only used to compare with zero in OptimizeCompareInstr + CmpValue = (AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), + MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0); return true; } @@ -619,8 +708,8 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { MachineFunction *MF = MBB->getParent(); assert(MF && "Can't get MachineFunction here"); const TargetMachine *TM = &MF->getTarget(); - const TargetInstrInfo *TII = TM->getInstrInfo(); - const TargetRegisterInfo *TRI = TM->getRegisterInfo(); + const TargetInstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); + const TargetRegisterInfo *TRI = TM->getSubtargetImpl()->getRegisterInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; @@ -652,6 +741,87 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { return true; } +/// \brief Return the opcode that does not set flags when possible - otherwise +/// return the original opcode. The caller is responsible to do the actual +/// substitution and legality checking. +static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { + // Don't convert all compare instructions, because for some the zero register + // encoding becomes the sp register. + bool MIDefinesZeroReg = false; + if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR)) + MIDefinesZeroReg = true; + + switch (MI->getOpcode()) { + default: + return MI->getOpcode(); + case AArch64::ADDSWrr: + return AArch64::ADDWrr; + case AArch64::ADDSWri: + return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; + case AArch64::ADDSWrs: + return MIDefinesZeroReg ? 
AArch64::ADDSWrs : AArch64::ADDWrs; + case AArch64::ADDSWrx: + return AArch64::ADDWrx; + case AArch64::ADDSXrr: + return AArch64::ADDXrr; + case AArch64::ADDSXri: + return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; + case AArch64::ADDSXrs: + return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; + case AArch64::ADDSXrx: + return AArch64::ADDXrx; + case AArch64::SUBSWrr: + return AArch64::SUBWrr; + case AArch64::SUBSWri: + return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; + case AArch64::SUBSWrs: + return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; + case AArch64::SUBSWrx: + return AArch64::SUBWrx; + case AArch64::SUBSXrr: + return AArch64::SUBXrr; + case AArch64::SUBSXri: + return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; + case AArch64::SUBSXrs: + return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; + case AArch64::SUBSXrx: + return AArch64::SUBXrx; + } +} + +/// True when condition code could be modified on the instruction +/// trace starting at from and ending at to. +static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To, + const bool CheckOnlyCCWrites, + const TargetRegisterInfo *TRI) { + // We iterate backward starting \p To until we hit \p From + MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin(); + + // Early exit if To is at the beginning of the BB. + if (I == B) + return true; + + // Check whether the definition of SrcReg is in the same basic block as + // Compare. If not, assume the condition code gets modified on some path. + if (To->getParent() != From->getParent()) + return true; + + // Check that NZCV isn't set on the trace. + for (--I; I != E; --I) { + const MachineInstr &Instr = *I; + + if (Instr.modifiesRegister(AArch64::NZCV, TRI) || + (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI))) + // This instruction modifies or uses NZCV after the one we want to + // change. + return true; + if (I == B) + // We currently don't allow the instruction trace to cross basic + // block boundaries + return true; + } + return false; +} /// optimizeCompareInstr - Convert the instruction supplying the argument to the /// comparison into one that sets the zero bit in the flags register. bool AArch64InstrInfo::optimizeCompareInstr( @@ -661,28 +831,15 @@ bool AArch64InstrInfo::optimizeCompareInstr( // Replace SUBSWrr with SUBWrr if NZCV is not used. 
int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); if (Cmp_NZCV != -1) { - unsigned NewOpc; - switch (CmpInstr->getOpcode()) { - default: - return false; - case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break; - case AArch64::ADDSWri: NewOpc = AArch64::ADDWri; break; - case AArch64::ADDSWrs: NewOpc = AArch64::ADDWrs; break; - case AArch64::ADDSWrx: NewOpc = AArch64::ADDWrx; break; - case AArch64::ADDSXrr: NewOpc = AArch64::ADDXrr; break; - case AArch64::ADDSXri: NewOpc = AArch64::ADDXri; break; - case AArch64::ADDSXrs: NewOpc = AArch64::ADDXrs; break; - case AArch64::ADDSXrx: NewOpc = AArch64::ADDXrx; break; - case AArch64::SUBSWrr: NewOpc = AArch64::SUBWrr; break; - case AArch64::SUBSWri: NewOpc = AArch64::SUBWri; break; - case AArch64::SUBSWrs: NewOpc = AArch64::SUBWrs; break; - case AArch64::SUBSWrx: NewOpc = AArch64::SUBWrx; break; - case AArch64::SUBSXrr: NewOpc = AArch64::SUBXrr; break; - case AArch64::SUBSXri: NewOpc = AArch64::SUBXri; break; - case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break; - case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break; + if (CmpInstr->definesRegister(AArch64::WZR) || + CmpInstr->definesRegister(AArch64::XZR)) { + CmpInstr->eraseFromParent(); + return true; } - + unsigned Opc = CmpInstr->getOpcode(); + unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); + if (NewOpc == Opc) + return false; const MCInstrDesc &MCID = get(NewOpc); CmpInstr->setDesc(MCID); CmpInstr->RemoveOperand(Cmp_NZCV); @@ -693,6 +850,9 @@ bool AArch64InstrInfo::optimizeCompareInstr( } // Continue only if we have a "ri" where immediate is zero. + // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare + // function. + assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!"); if (CmpValue != 0 || SrcReg2 != 0) return false; @@ -705,36 +865,10 @@ bool AArch64InstrInfo::optimizeCompareInstr( if (!MI) return false; - // We iterate backward, starting from the instruction before CmpInstr and - // stop when reaching the definition of the source register or done with the - // basic block, to check whether NZCV is used or modified in between. - MachineBasicBlock::iterator I = CmpInstr, E = MI, - B = CmpInstr->getParent()->begin(); - - // Early exit if CmpInstr is at the beginning of the BB. - if (I == B) - return false; - - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, we can't optimize away the Compare. - if (MI->getParent() != CmpInstr->getParent()) - return false; - - // Check that NZCV isn't set between the comparison instruction and the one we - // want to change. + bool CheckOnlyCCWrites = false; const TargetRegisterInfo *TRI = &getRegisterInfo(); - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; - - if (Instr.modifiesRegister(AArch64::NZCV, TRI) || - Instr.readsRegister(AArch64::NZCV, TRI)) - // This instruction modifies or uses NZCV after the one we want to - // change. We can't do this transformation. - return false; - if (I == B) - // The 'and' is below the comparison instruction. 
- return false; - } + if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI)) + return false; unsigned NewOpc = MI->getOpcode(); switch (MI->getOpcode()) { @@ -848,6 +982,56 @@ bool AArch64InstrInfo::optimizeCompareInstr( return true; } +bool +AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD) + return false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned Reg = MI->getOperand(0).getReg(); + const GlobalValue *GV = + cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + const TargetMachine &TM = MBB.getParent()->getTarget(); + unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); + const unsigned char MO_NC = AArch64II::MO_NC; + + if ((OpFlags & AArch64II::MO_GOT) != 0) { + BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT); + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill).addImm(0) + .addMemOperand(*MI->memoperands_begin()); + } else if (TM.getCodeModel() == CodeModel::Large) { + BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) + .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); + BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32); + BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16); + BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill).addImm(0) + .addMemOperand(*MI->memoperands_begin()); + } else { + BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) + .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); + unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, LoFlags) + .addMemOperand(*MI->memoperands_begin()); + } + + MBB.erase(MI); + + return true; +} + /// Return true if this is this instruction has a non-zero immediate bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { switch (MI->getOpcode()) { @@ -963,12 +1147,14 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); return true; } + break; case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) if (MI->getOperand(2).getImm() == 0) { assert(MI->getDesc().getNumOperands() == 4 && MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); return true; } + break; } return false; } @@ -991,6 +1177,7 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { "invalid ORRv16i8 operands"); return true; } + break; } return false; } @@ -1152,6 +1339,102 @@ AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, }; } +bool AArch64InstrInfo::getLdStBaseRegImmOfsWidth( + MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, + const TargetRegisterInfo *TRI) const { + // Handle only loads/stores with base register followed by immediate offset. + if (LdSt->getNumOperands() != 3) + return false; + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + + // Offset is calculated as the immediate operand multiplied by the scaling factor. + // Unscaled instructions have scaling factor set to 1. 
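Backing up to areMemAccessesTriviallyDisjoint: with the scaled offsets and widths that getLdStBaseRegImmOfsWidth (whose scale/width table follows below) computes, the disjointness test reduces to checking that the lower access ends at or before the higher one starts, given that both use the same base register. A compact restatement of that overlap check:

#include <algorithm>
#include <cassert>

bool triviallyDisjoint(int offA, int widthA, int offB, int widthB) {
  int lowOff = std::min(offA, offB);
  int lowWidth = (lowOff == offA) ? widthA : widthB;
  int highOff = std::max(offA, offB);
  return lowOff + lowWidth <= highOff;      // no byte of the two ranges overlaps
}

int main() {
  assert(triviallyDisjoint(0, 8, 8, 8));    // e.g. ldr x0,[base]; ldr x1,[base,#8]
  assert(!triviallyDisjoint(0, 8, 4, 4));   // the word lies inside the xword
}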
+ int Scale = 0; + switch (LdSt->getOpcode()) { + default: + return false; + case AArch64::LDURQi: + case AArch64::STURQi: + Width = 16; + Scale = 1; + break; + case AArch64::LDURXi: + case AArch64::LDURDi: + case AArch64::STURXi: + case AArch64::STURDi: + Width = 8; + Scale = 1; + break; + case AArch64::LDURWi: + case AArch64::LDURSi: + case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::STURSi: + Width = 4; + Scale = 1; + break; + case AArch64::LDURHi: + case AArch64::LDURHHi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + case AArch64::STURHi: + case AArch64::STURHHi: + Width = 2; + Scale = 1; + break; + case AArch64::LDURBi: + case AArch64::LDURBBi: + case AArch64::LDURSBXi: + case AArch64::LDURSBWi: + case AArch64::STURBi: + case AArch64::STURBBi: + Width = 1; + Scale = 1; + break; + case AArch64::LDRXui: + case AArch64::STRXui: + Scale = Width = 8; + break; + case AArch64::LDRWui: + case AArch64::STRWui: + Scale = Width = 4; + break; + case AArch64::LDRBui: + case AArch64::STRBui: + Scale = Width = 1; + break; + case AArch64::LDRHui: + case AArch64::STRHui: + Scale = Width = 2; + break; + case AArch64::LDRSui: + case AArch64::STRSui: + Scale = Width = 4; + break; + case AArch64::LDRDui: + case AArch64::STRDui: + Scale = Width = 8; + break; + case AArch64::LDRQui: + case AArch64::STRQui: + Scale = Width = 16; + break; + case AArch64::LDRBBui: + case AArch64::STRBBui: + Scale = Width = 1; + break; + case AArch64::LDRHHui: + case AArch64::STRHHui: + Scale = Width = 2; + break; + }; + + BaseReg = LdSt->getOperand(1).getReg(); + Offset = LdSt->getOperand(2).getImm() * Scale; + return true; +} + /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getLdStBaseRegImmOfs returns true. @@ -1194,16 +1477,15 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, } } -MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, - uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { +MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( + MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, + const MDNode *Expr, DebugLoc DL) const { MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) .addFrameIndex(FrameIx) .addImm(0) .addImm(Offset) - .addMetadata(MDPtr); + .addMetadata(Var) + .addMetadata(Expr); return &*MIB; } @@ -2087,3 +2369,592 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::CreateImm(0)); } +/// useMachineCombiner - return true when a target supports MachineCombiner +bool AArch64InstrInfo::useMachineCombiner() const { + // AArch64 supports the combiner + return true; +} +// +// True when Opc sets flag +static bool isCombineInstrSettingFlag(unsigned Opc) { + switch (Opc) { + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. + case AArch64::SUBSWri: + case AArch64::SUBSXri: + return true; + default: + break; + } + return false; +} +// +// 32b Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate32(unsigned Opc) { + switch (Opc) { + case AArch64::ADDWrr: + case AArch64::ADDWri: + case AArch64::SUBWrr: + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::SUBSWrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
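The operand-order note repeated in these switches matters for the SUB cases: the accumulator operand of MSUB is the minuend, which is why only SUB R,C,(A*B) maps directly to MSUB, while SUB R,(A*B),C needs the negate-and-MADD form generated later in this hunk. In plain C++ the AArch64 multiply-accumulate semantics are (sketch of the instruction behaviour, not LLVM code):

#include <cstdint>
uint32_t madd(uint32_t n, uint32_t m, uint32_t a) { return a + n * m; } // MADD Wd,Wn,Wm,Wa
uint32_t msub(uint32_t n, uint32_t m, uint32_t a) { return a - n * m; } // MSUB Wd,Wn,Wm,Wa = Wa - Wn*Wm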
+ case AArch64::SUBWri: + case AArch64::SUBSWri: + return true; + default: + break; + } + return false; +} +// +// 64b Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate64(unsigned Opc) { + switch (Opc) { + case AArch64::ADDXrr: + case AArch64::ADDXri: + case AArch64::SUBXrr: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSXrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. + case AArch64::SUBXri: + case AArch64::SUBSXri: + return true; + default: + break; + } + return false; +} +// +// Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate(unsigned Opc) { + return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); +} + +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineInstr *MI = nullptr; + // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + MI = MRI.getUniqueVRegDef(MO.getReg()); + // And it needs to be in the trace (otherwise, it won't have a depth). + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) + return false; + + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); + + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + + // Must only used by the user we combine with. + if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) + return false; + + return true; +} + +/// hasPattern - return true when there is potentially a faster code sequence +/// for an instruction chain ending in \p Root. All potential patterns are +/// listed +/// in the \p Pattern vector. Pattern should be sorted in priority order since +/// the pattern evaluator stops checking as soon as it finds a faster sequence. + +bool AArch64InstrInfo::hasPattern( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const { + unsigned Opc = Root.getOpcode(); + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + + if (!isCombineInstrCandidate(Opc)) + return 0; + if (isCombineInstrSettingFlag(Opc)) { + int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); + // When NZCV is live bail out. + if (Cmp_NZCV == -1) + return 0; + unsigned NewOpc = convertFlagSettingOpcode(&Root); + // When opcode can't change bail out. + // CHECKME: do we miss any cases for opcode conversion? 
+ if (NewOpc == Opc) + return 0; + Opc = NewOpc; + } + + switch (Opc) { + default: + break; + case AArch64::ADDWrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "ADDWrr does not have register operands"); + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP2); + Found = true; + } + break; + case AArch64::ADDXrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP2); + Found = true; + } + break; + case AArch64::SUBWrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP2); + Found = true; + } + break; + case AArch64::SUBXrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP2); + Found = true; + } + break; + case AArch64::ADDWri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDWI_OP1); + Found = true; + } + break; + case AArch64::ADDXri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDXI_OP1); + Found = true; + } + break; + case AArch64::SUBWri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1); + Found = true; + } + break; + case AArch64::SUBXri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1); + Found = true; + } + break; + } + return Found; +} + +/// genMadd - Generate madd instruction and combine mul and add. +/// Example: +/// MUL I=A,B,0 +/// ADD R,I,C +/// ==> MADD R,A,B,C +/// \param Root is the ADD instruction +/// \param [out] InsInstrs is a vector of machine instructions and will +/// contain the generated madd instruction +/// \param IdxMulOpd is index of operand in Root that is the result of +/// the MUL. In the example above IdxMulOpd is 1. +/// \param MaddOpc the opcode fo the madd instruction +static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, + const TargetRegisterClass *RC) { + assert(IdxMulOpd == 1 || IdxMulOpd == 2); + + unsigned IdxOtherOpd = IdxMulOpd == 1 ? 
2 : 1; + MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); + unsigned ResultReg = Root.getOperand(0).getReg(); + unsigned SrcReg0 = MUL->getOperand(1).getReg(); + bool Src0IsKill = MUL->getOperand(1).isKill(); + unsigned SrcReg1 = MUL->getOperand(2).getReg(); + bool Src1IsKill = MUL->getOperand(2).isKill(); + unsigned SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); + bool Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); + + if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + MRI.constrainRegClass(ResultReg, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + MRI.constrainRegClass(SrcReg0, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + MRI.constrainRegClass(SrcReg1, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) + MRI.constrainRegClass(SrcReg2, RC); + + MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), + ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + // Insert the MADD + InsInstrs.push_back(MIB); + return MUL; +} + +/// genMaddR - Generate madd instruction and combine mul and add using +/// an extra virtual register +/// Example - an ADD intermediate needs to be stored in a register: +/// MUL I=A,B,0 +/// ADD R,I,Imm +/// ==> ORR V, ZR, Imm +/// ==> MADD R,A,B,V +/// \param Root is the ADD instruction +/// \param [out] InsInstrs is a vector of machine instructions and will +/// contain the generated madd instruction +/// \param IdxMulOpd is index of operand in Root that is the result of +/// the MUL. In the example above IdxMulOpd is 1. +/// \param MaddOpc the opcode fo the madd instruction +/// \param VR is a virtual register that holds the value of an ADD operand +/// (V in the example above). 
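genMaddR, defined next, is the second half of the rewrite sketched in its comment: when the add or sub operand is an immediate, the constant is first materialized into a fresh virtual register (an ORR against the zero register) and then folded in as the accumulator of a single MADD. The arithmetic equivalence, in plain C++ (illustrative sketch, not LLVM code):

#include <cstdint>
// Before: multiply, then add an immediate; two dependent instructions.
uint64_t mul_then_add(uint64_t a, uint64_t b, uint64_t imm) {
  uint64_t i = a * b;   // MUL  I, A, B      (really MADD I, A, B, XZR)
  return i + imm;       // ADD  R, I, #imm
}
// After: materialize the constant once, then one MADD computes V + A*B.
uint64_t orr_then_madd(uint64_t a, uint64_t b, uint64_t imm) {
  uint64_t v = imm;     // ORR  V, XZR, #imm
  return a * b + v;     // MADD R, A, B, V
}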
+static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, + unsigned VR, const TargetRegisterClass *RC) { + assert(IdxMulOpd == 1 || IdxMulOpd == 2); + + MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); + unsigned ResultReg = Root.getOperand(0).getReg(); + unsigned SrcReg0 = MUL->getOperand(1).getReg(); + bool Src0IsKill = MUL->getOperand(1).isKill(); + unsigned SrcReg1 = MUL->getOperand(2).getReg(); + bool Src1IsKill = MUL->getOperand(2).isKill(); + + if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + MRI.constrainRegClass(ResultReg, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg0)) + MRI.constrainRegClass(SrcReg0, RC); + if (TargetRegisterInfo::isVirtualRegister(SrcReg1)) + MRI.constrainRegClass(SrcReg1, RC); + if (TargetRegisterInfo::isVirtualRegister(VR)) + MRI.constrainRegClass(VR, RC); + + MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), + ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(VR); + // Insert the MADD + InsInstrs.push_back(MIB); + return MUL; +} + +/// genAlternativeCodeSequence - when hasPattern() finds a pattern +/// this function generates the instructions that could replace the +/// original code sequence +void AArch64InstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { + MachineBasicBlock &MBB = *Root.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + MachineInstr *MUL; + const TargetRegisterClass *RC; + unsigned Opc; + switch (Pattern) { + default: + // signal error. 
+ break; + case MachineCombinerPattern::MC_MULADDW_OP1: + case MachineCombinerPattern::MC_MULADDX_OP1: + // MUL I=A,B,0 + // ADD R,I,C + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) { + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MC_MULADDW_OP2: + case MachineCombinerPattern::MC_MULADDX_OP2: + // MUL I=A,B,0 + // ADD R,C,I + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) { + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MC_MULADDWI_OP1: + case MachineCombinerPattern::MC_MULADDXI_OP1: { + // MUL I=A,B,0 + // ADD R,I,Imm + // ==> ORR V, ZR, Imm + // ==> MADD R,A,B,V + // --- Create(MADD); + const TargetRegisterClass *OrrRC; + unsigned BitSize, OrrOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) { + OrrOpc = AArch64::ORRWri; + OrrRC = &AArch64::GPR32spRegClass; + BitSize = 32; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + OrrOpc = AArch64::ORRXri; + OrrRC = &AArch64::GPR64spRegClass; + BitSize = 64; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + unsigned NewVR = MRI.createVirtualRegister(OrrRC); + uint64_t Imm = Root.getOperand(2).getImm(); + + if (Root.getOperand(3).isImm()) { + unsigned Val = Root.getOperand(3).getImm(); + Imm = Imm << Val; + } + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) + .addReg(ZeroReg) + .addImm(Encoding); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); + } + break; + } + case MachineCombinerPattern::MC_MULSUBW_OP1: + case MachineCombinerPattern::MC_MULSUBX_OP1: { + // MUL I=A,B,0 + // SUB R,I, C + // ==> SUB V, 0, C + // ==> MADD R,A,B,V // = -C + A*B + // --- Create(MADD); + const TargetRegisterClass *SubRC; + unsigned SubOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) { + SubOpc = AArch64::SUBWrr; + SubRC = &AArch64::GPR32spRegClass; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + SubOpc = AArch64::SUBXrr; + SubRC = &AArch64::GPR64spRegClass; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + unsigned NewVR = MRI.createVirtualRegister(SubRC); + // SUB NewVR, 0, C + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) + .addReg(ZeroReg) + .addOperand(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); + break; + } + case MachineCombinerPattern::MC_MULSUBW_OP2: + case MachineCombinerPattern::MC_MULSUBX_OP2: + // MUL I=A,B,0 + // SUB R,C,I + // ==> MSUB R,A,B,C (computes C - A*B) + // --- Create(MSUB); + if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) { + Opc = AArch64::MSUBWrrr; + RC = &AArch64::GPR32RegClass; + } else { 
+ Opc = AArch64::MSUBXrrr; + RC = &AArch64::GPR64RegClass; + } + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MC_MULSUBWI_OP1: + case MachineCombinerPattern::MC_MULSUBXI_OP1: { + // MUL I=A,B,0 + // SUB R,I, Imm + // ==> ORR V, ZR, -Imm + // ==> MADD R,A,B,V // = -Imm + A*B + // --- Create(MADD); + const TargetRegisterClass *OrrRC; + unsigned BitSize, OrrOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) { + OrrOpc = AArch64::ORRWri; + OrrRC = &AArch64::GPR32spRegClass; + BitSize = 32; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + RC = &AArch64::GPR32RegClass; + } else { + OrrOpc = AArch64::ORRXri; + OrrRC = &AArch64::GPR64spRegClass; + BitSize = 64; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + RC = &AArch64::GPR64RegClass; + } + unsigned NewVR = MRI.createVirtualRegister(OrrRC); + int Imm = Root.getOperand(2).getImm(); + if (Root.getOperand(3).isImm()) { + unsigned Val = Root.getOperand(3).getImm(); + Imm = Imm << Val; + } + uint64_t UImm = -Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) + .addReg(ZeroReg) + .addImm(Encoding); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); + } + break; + } + } // end switch (Pattern) + // Record MUL and ADD/SUB for deletion + DelInstrs.push_back(MUL); + DelInstrs.push_back(&Root); + + return; +} + +/// \brief Replace csincr-branch sequence by simple conditional branch +/// +/// Examples: +/// 1. +/// csinc w9, wzr, wzr, <condition code> +/// tbnz w9, #0, 0x44 +/// to +/// b.<inverted condition code> +/// +/// 2. +/// csinc w9, wzr, wzr, <condition code> +/// tbz w9, #0, 0x44 +/// to +/// b.<condition code> +/// +/// \param MI Conditional Branch +/// \return True when the simple conditional branch is generated +/// +bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { + bool IsNegativeBranch = false; + bool IsTestAndBranch = false; + unsigned TargetBBInMI = 0; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown branch instruction?"); + case AArch64::Bcc: + return false; + case AArch64::CBZW: + case AArch64::CBZX: + TargetBBInMI = 1; + break; + case AArch64::CBNZW: + case AArch64::CBNZX: + TargetBBInMI = 1; + IsNegativeBranch = true; + break; + case AArch64::TBZW: + case AArch64::TBZX: + TargetBBInMI = 2; + IsTestAndBranch = true; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + TargetBBInMI = 2; + IsNegativeBranch = true; + IsTestAndBranch = true; + break; + } + // So we increment a zero register and test for bits other + // than bit 0? Conservatively bail out in case the verifier + // missed this case. + if (IsTestAndBranch && MI->getOperand(1).getImm()) + return false; + + // Find Definition. 
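The inversion described in case 1 of the doc comment above follows from the CSINC semantics: csinc w9, wzr, wzr, cc yields 0 when cc holds and 1 otherwise, so a tbnz on bit 0 really branches when cc is false. A sketch of the equivalence (not LLVM code):

// CSINC Wd, WZR, WZR, cc   =>   Wd = cc ? 0 : (0 + 1)
unsigned csinc_wzr_wzr(bool cc) { return cc ? 0u : 1u; }

// tbnz w9, #0, L branches when bit 0 is set, i.e. when cc is false,
// so the pair collapses to a single b.<inverted cc> L.
bool tbnz_taken(bool cc) { return (csinc_wzr_wzr(cc) & 1u) != 0; } // == !cc
// tbz w9, #0, L branches when the bit is clear, i.e. plain b.<cc> L.
bool tbz_taken(bool cc)  { return (csinc_wzr_wzr(cc) & 1u) == 0; } // ==  cc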
+ assert(MI->getParent() && "Incomplete machine instruciton\n"); + MachineBasicBlock *MBB = MI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + unsigned VReg = MI->getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + return false; + + MachineInstr *DefMI = MRI->getVRegDef(VReg); + + // Look for CSINC + if (!(DefMI->getOpcode() == AArch64::CSINCWr && + DefMI->getOperand(1).getReg() == AArch64::WZR && + DefMI->getOperand(2).getReg() == AArch64::WZR) && + !(DefMI->getOpcode() == AArch64::CSINCXr && + DefMI->getOperand(1).getReg() == AArch64::XZR && + DefMI->getOperand(2).getReg() == AArch64::XZR)) + return false; + + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + return false; + + AArch64CC::CondCode CC = + (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); + bool CheckOnlyCCWrites = true; + // Convert only when the condition code is not modified between + // the CSINC and the branch. The CC may be used by other + // instructions in between. + if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo())) + return false; + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB(); + DebugLoc DL = MI->getDebugLoc(); + if (IsNegativeBranch) + CC = AArch64CC::getInvertedCondCode(CC); + BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); + MI->eraseFromParent(); + return true; +} diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index f70b82b..d8f1274 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -11,11 +11,12 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AArch64INSTRINFO_H -#define LLVM_TARGET_AArch64INSTRINFO_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H #include "AArch64.h" #include "AArch64RegisterInfo.h" +#include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER @@ -46,9 +47,15 @@ public: unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + bool isAsCheapAsAMove(const MachineInstr *MI) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const override; + bool + areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; + unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const override; unsigned isStoreToStackSlot(const MachineInstr *MI, @@ -87,6 +94,10 @@ public: unsigned &Offset, const TargetRegisterInfo *TRI) const override; + bool getLdStBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg, + int &Offset, int &Width, + const TargetRegisterInfo *TRI) const; + bool enableClusterLoads() const override { return true; } bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, @@ -96,8 +107,8 @@ public: MachineInstr *Second) const override; MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const; + uint64_t Offset, const MDNode *Var, + const MDNode *Expr, DebugLoc DL) const; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, @@ -117,6 +128,7 @@ public: int FrameIndex, const TargetRegisterClass 
*RC, const TargetRegisterInfo *TRI) const override; + using TargetInstrInfo::foldMemoryOperandImpl; MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, @@ -153,7 +165,27 @@ public: bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; - + bool optimizeCondBranch(MachineInstr *MI) const override; + /// hasPattern - return true when there is potentially a faster code sequence + /// for an instruction chain ending in <Root>. All potential patterns are + /// listed + /// in the <Pattern> array. + bool hasPattern(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) + const override; + + /// genAlternativeCodeSequence - when hasPattern() finds a pattern + /// this function generates the instructions that could replace the + /// original code sequence + void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; + /// useMachineCombiner - AArch64 supports MachineCombiner + bool useMachineCombiner() const override; + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; private: void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0ba069e..e0fb90a 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -24,6 +24,7 @@ def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; +def IsCyclone : Predicate<"Subtarget->isCyclone()">; //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -236,6 +237,12 @@ def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL", def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", SDT_AArch64WrapperLarge>; +def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>; + +def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; +def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; //===----------------------------------------------------------------------===// @@ -474,6 +481,24 @@ def trunc_imm : SDNodeXForm<imm, [{ def : Pat<(i64 i64imm_32bit:$src), (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; +// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model). +def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32); +}]>; + +def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{ +return CurDAG->getTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64); +}]>; + + +def : Pat<(f32 fpimm:$in), + (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>; +def : Pat<(f64 fpimm:$in), + (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>; + + // Deal with the various forms of (ELF) large addressing with MOVZ/MOVK // sequences. 
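The bitcast_fpimm_to_i32/i64 transforms added above simply reinterpret the bits of an FP immediate as an integer, so the constant can be built with the ordinary MOVi32imm/MOVi64imm expansions and then copied into an FP register. The reinterpretation is the usual bit cast (plain C++ sketch, not LLVM code):

#include <cstdint>
#include <cstring>
// What bitcast_fpimm_to_i32 does with the APFloat payload of an fpimm node.
uint32_t bitsOfFloat(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof u);  // well-defined bit reinterpretation
  return u;
}
// e.g. bitsOfFloat(1.0f) == 0x3f800000; the pattern then selects roughly
//   mov  w8, #0x3f800000
//   fmov s0, w8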
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, @@ -632,6 +657,10 @@ def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; +def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), + (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)), + (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; } // AddedComplexity = 7 let AddedComplexity = 5 in { @@ -782,7 +811,7 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; //===----------------------------------------------------------------------===// // Bitfield immediate extraction instruction. //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in defm EXTR : ExtractImm<"extr">; def : InstAlias<"ror $dst, $src, $shift", (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; @@ -797,7 +826,7 @@ def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), //===----------------------------------------------------------------------===// // Other bitfield immediate instructions. //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; defm SBFM : BitfieldImm<0b00, "sbfm">; defm UBFM : BitfieldImm<0b10, "ubfm">; @@ -970,9 +999,9 @@ def : InstAlias<"cneg $dst, $src, $cc", // PC-relative instructions. //===----------------------------------------------------------------------===// let isReMaterializable = 1 in { -let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in { +let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in { def ADR : ADRI<0, "adr", adrlabel, []>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 def ADRP : ADRI<1, "adrp", adrplabel, [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>; @@ -1173,6 +1202,9 @@ defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>; defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>; defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>; +defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>; +defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>; + defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>; defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>; @@ -1213,6 +1245,7 @@ let Predicates = [IsLE] in { defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>; defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>; defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>; + defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>; } defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>; @@ -1226,6 +1259,7 @@ let Predicates = [IsLE] in { defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>; defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>; defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>; + defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>; defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>; } } // AddedComplexity = 10 @@ -1355,6 +1389,8 @@ let Predicates = [IsLE] in { (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; } def : Pat<(v1f64 (load (am_indexed64 
GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; @@ -1376,6 +1412,8 @@ let Predicates = [IsLE] in { (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; } def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; @@ -1512,6 +1550,8 @@ let Predicates = [IsLE] in { (LDURDi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; } def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; @@ -1532,6 +1572,8 @@ let Predicates = [IsLE] in { (LDURQi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; } // anyext -> zext @@ -1828,6 +1870,7 @@ let Predicates = [IsLE] in { defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>; defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>; defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>; + defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>; } defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>; @@ -1842,9 +1885,37 @@ let Predicates = [IsLE] in { defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>; defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>; defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>; + defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>; } } // AddedComplexity = 10 +// Match stores from lane 0 to the appropriate subreg's store. 
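The multiclass that follows matches the IR idiom "extract lane 0, then store it" and lets it use the plain register-offset scalar store of the low subregister instead of a lane move followed by a store. The kind of source that produces this idiom (sketch using the Clang/GCC vector extension; names are illustrative):

typedef float v4f32 __attribute__((vector_size(16)));
// Store only lane 0 of a 128-bit vector to p[i]; with the patterns below this
// can select to a single register-offset store of the s-subregister, e.g.
//   str s0, [x0, x1, lsl #2]
void storeLane0(v4f32 v, float *p, unsigned long i) {
  p[i] = v[0];
}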
+multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop, + ValueType VecTy, ValueType STy, + SubRegIndex SubRegIdx, + Instruction STRW, Instruction STRX> { + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 19 in { + defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>; + defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>; + defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>; + defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>; + defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>; +} + //--- // (unsigned immediate) defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", @@ -1892,6 +1963,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v2i32 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4f16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; } def : Pat<(store (v1f64 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), @@ -1921,6 +1995,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v2i64 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8f16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; } def : Pat<(store (f128 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), @@ -1983,6 +2060,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v2i32 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4f16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; } def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; @@ -2013,6 +2093,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v2f64 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8f16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; } // unscaled i64 truncating stores @@ -2089,6 +2172,8 @@ def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; @@ -2102,6 +2187,8 @@ def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, 
simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; //--- // (immediate post-indexed) @@ -2139,6 +2226,8 @@ def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; @@ -2152,6 +2241,8 @@ def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; //===----------------------------------------------------------------------===// // Load/store exclusive instructions. @@ -2384,6 +2475,28 @@ defm FMOV : FPMoveImmediate<"fmov">; //===----------------------------------------------------------------------===// defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; +def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), + (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), + (ABSv8i8 V64:$src)>; +def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))), + (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))), + (ABSv4i16 V64:$src)>; +def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))), + (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))), + (ABSv2i32 V64:$src)>; +def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))), + (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))), + (ABSv16i8 V128:$src)>; +def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))), + (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))), + (ABSv8i16 V128:$src)>; +def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))), + (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))), + (ABSv4i32 V128:$src)>; +def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))), + (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))), + (ABSv2i64 V128:$src)>; + defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; @@ -2412,6 +2525,11 @@ def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), (i64 2))))), (FCVTLv4i32 V128:$Rn)>; +def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn), + (i64 4))))), + (FCVTLv8i16 V128:$Rn)>; + defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; @@ -2423,6 +2541,7 @@ def : Pat<(concat_vectors V64:$Rd, (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; def : Pat<(v2f32 (fround (v2f64 
V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; @@ -2505,6 +2624,10 @@ defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte> defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; +def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; @@ -3101,6 +3224,46 @@ defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; +// Additional patterns for SMULL and UMULL +multiclass Neon_mul_widen_patterns<SDPatternOperator opnode, + Instruction INST8B, Instruction INST4H, Instruction INST2S> { + def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))), + (INST8B V64:$Rn, V64:$Rm)>; + def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))), + (INST4H V64:$Rn, V64:$Rm)>; + def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))), + (INST2S V64:$Rn, V64:$Rm)>; +} + +defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16, + SMULLv4i16_v4i32, SMULLv2i32_v2i64>; +defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16, + UMULLv4i16_v4i32, UMULLv2i32_v2i64>; + +// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL +multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode, + Instruction INST8B, Instruction INST4H, Instruction INST2S> { + def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))), + (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>; + def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))), + (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>; + def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))), + (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>; +} + +defm : Neon_mulacc_widen_patterns< + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, + SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; +defm : Neon_mulacc_widen_patterns< + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, + UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; +defm : Neon_mulacc_widen_patterns< + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, + SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; +defm : Neon_mulacc_widen_patterns< + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, + UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; + // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 V64:$Rn, V64:$Rm)>; @@ -3183,6 +3346,10 @@ def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 
V128:$Rn, V128:$Rm, imm:$imm)>; // We use EXT to handle extract_subvector to copy the upper 64-bits of a // 128-bit vector. @@ -3194,6 +3361,8 @@ def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), @@ -3306,6 +3475,19 @@ def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))), (v2f64 (DUPv2i64lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), (i64 0)))>; +def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))), + (v4f16 (DUPv4i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; +def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))), + (v8f16 (DUPv8i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; + +def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), + (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), + (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; @@ -3427,6 +3609,23 @@ def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; +def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), + (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi16lane + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0)), + dsub)>; + +def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), + (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), + (INSvi16lane + V128:$Rn, VectorIndexH:$imm, + (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0))>; + def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), (EXTRACT_SUBREG @@ -3507,6 +3706,7 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, dsub)>; } +defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>; defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>; defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>; defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>; @@ -3522,6 +3722,8 @@ def : Pat<(vector_extract (v2f64 V128:$Rn), 0), (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; def : Pat<(vector_extract (v4f32 V128:$Rn), 0), (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; +def : Pat<(vector_extract (v8f16 V128:$Rn), 0), + (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>; def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), (f64 (EXTRACT_SUBREG (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0, @@ -3532,6 +3734,11 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0, V128:$Rn, VectorIndexS:$idx), ssub))>; +def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx), + (f16 (EXTRACT_SUBREG + (INSvi16lane (v8f16 (IMPLICIT_DEF)), 0, + V128:$Rn, VectorIndexH:$idx), + hsub))>; // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. 
In the general case we need an instruction, which had just as well be @@ -3546,6 +3753,7 @@ def : ConcatPat<v2f64, v1f64>; def : ConcatPat<v4i32, v2i32>; def : ConcatPat<v4f32, v2f32>; def : ConcatPat<v8i16, v4i16>; +def : ConcatPat<v8f16, v4f16>; def : ConcatPat<v16i8, v8i8>; // If the high lanes are undef, though, we can just ignore them: @@ -3965,7 +4173,7 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", // AdvSIMD indexed element //---------------------------------------------------------------------------- -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; } @@ -4386,7 +4594,7 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST> 0), dsub)), 0), - ssub)))>, Requires<[NotForCodeSize]>; + ssub)))>, Requires<[NotForCodeSize, IsCyclone]>; def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>; @@ -4439,8 +4647,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST> 0), dsub)), 0), - dsub)))>, Requires<[NotForCodeSize]>; - + dsub)))>, Requires<[NotForCodeSize, IsCyclone]>; + def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext), @@ -4519,7 +4727,7 @@ defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; -let mayLoad = 1, neverHasSideEffects = 1 in { +let mayLoad = 1, hasSideEffects = 0 in { defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; @@ -4563,6 +4771,10 @@ def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), (LD1Rv2d GPR64sp:$Rn)>; def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), (LD1Rv1d GPR64sp:$Rn)>; +def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex, ValueType VTy, ValueType STy, Instruction LD1> @@ -4576,6 +4788,7 @@ def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>; def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>; def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>; def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>; +def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>; class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex, ValueType VTy, ValueType STy, Instruction LD1> @@ -4590,6 +4803,7 @@ def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>; def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>; def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>; def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>; +def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>; defm LD1 : SIMDLdSt1SingleAliases<"ld1">; @@ -4603,7 +4817,7 @@ defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; -let AddedComplexity = 15 in +let 
AddedComplexity = 19 in class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1> : Pat<(scalar_store @@ -4617,8 +4831,9 @@ def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>; def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>; def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>; def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>; +def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>; -let AddedComplexity = 15 in +let AddedComplexity = 19 in class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1> : Pat<(scalar_store @@ -4631,6 +4846,7 @@ def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>; def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>; def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>; def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>; +def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>; multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1, @@ -4655,6 +4871,7 @@ defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>; defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>; defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>; defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>; +defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>; multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex, ValueType VTy, ValueType STy, Instruction ST1, @@ -4678,8 +4895,9 @@ defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>; defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>; defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>; defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>; +defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>; -let mayStore = 1, neverHasSideEffects = 1 in { +let mayStore = 1, hasSideEffects = 0 in { defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>; @@ -4856,10 +5074,77 @@ def : Pat<(trap), (BRK 1)>; // b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX // +// Natural vector casts (64 bit) +def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; + +def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; + +def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; + +def 
: Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; + +// Natural vector casts (128 bit) +def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; + +def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; + let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, 
FPR64)>; def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), @@ -4868,6 +5153,8 @@ def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), @@ -4880,6 +5167,8 @@ def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4f16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; @@ -4889,6 +5178,8 @@ def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; } @@ -4917,6 +5208,7 @@ let Predicates = [IsLE] in { def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; } let Predicates = [IsBE] in { @@ -4926,6 +5218,8 @@ def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 (REV64v2i32 FPR64:$src))>; } @@ -4938,6 +5232,7 @@ def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), @@ -4950,6 +5245,8 @@ def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), + (v2i32 (REV64v4i16 FPR64:$src))>; } def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; @@ -4958,6 +5255,7 @@ def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; } @@ -4970,6 +5268,8 @@ def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 (REV16v8i8 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (f64 
FPR64:$src))), (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), @@ -4977,12 +5277,41 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), } let Predicates = [IsLE] in { +def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), + (v4f16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), + (v4f16 (REV64v4i16 FPR64:$src))>; +} + + + +let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), @@ -4997,6 +5326,8 @@ def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 (REV32v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; } let Predicates = [IsLE] in { @@ -5004,6 +5335,7 @@ def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), @@ -5014,6 +5346,8 @@ def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 (REV64v2i32 FPR64:$src))>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 (REV64v8i8 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; @@ -5023,6 +5357,7 @@ def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert 
(v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), @@ -5033,6 +5368,8 @@ def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 (REV64v8i8 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; @@ -5043,6 +5380,7 @@ def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), @@ -5055,6 +5393,8 @@ def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), + (v2f32 (REV64v4i16 FPR64:$src))>; } def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; @@ -5064,6 +5404,7 @@ def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; } let Predicates = [IsBE] in { @@ -5075,6 +5416,9 @@ def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), @@ -5089,6 +5433,7 @@ let Predicates = [IsLE] in { def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; } @@ -5100,6 +5445,8 @@ def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 (REV64v4i32 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), @@ -5110,6 +5457,7 @@ def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 
(bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -5120,6 +5468,8 @@ def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (REV64v4i32 FPR128:$src), (i32 8)))>; def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), @@ -5135,6 +5485,7 @@ def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), @@ -5148,6 +5499,8 @@ def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; } def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -5157,6 +5510,7 @@ def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), @@ -5171,6 +5525,8 @@ def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; } def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; @@ -5181,6 +5537,7 @@ def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), @@ -5197,6 +5554,36 @@ def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; 
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), + (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), + (v8f16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), + (v8f16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), + (v8f16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), + (v8f16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), + (v8f16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), + (v8f16 (REV32v8i16 FPR128:$src))>; } let Predicates = [IsLE] in { @@ -5206,6 +5593,7 @@ def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), @@ -5222,6 +5610,8 @@ def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 (REV64v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 (REV32v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; } def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), @@ -5245,6 +5635,8 @@ def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 3df9c4f..8157981 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -13,20 +13,21 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "aarch64-ldst-opt" @@ -108,7 +109,7 @@ private: int 
getMemSize(MachineInstr *MemMI); }; char AArch64LoadStoreOpt::ID = 0; -} +} // namespace static bool isUnscaledLdst(unsigned Opc) { switch (Opc) { @@ -931,8 +932,9 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { const TargetMachine &TM = Fn.getTarget(); - TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo()); - TRI = TM.getRegisterInfo(); + TII = static_cast<const AArch64InstrInfo *>( + TM.getSubtargetImpl()->getInstrInfo()); + TRI = TM.getSubtargetImpl()->getRegisterInfo(); bool Modified = false; for (auto &MBB : Fn) diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp index 75a17b9..e57b0f4 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -25,8 +25,7 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; -AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang, - AsmPrinter &printer) +AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer) : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} MCSymbol * diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h index ba50ba9..1e29b80 100644 --- a/lib/Target/AArch64/AArch64MCInstLower.h +++ b/lib/Target/AArch64/AArch64MCInstLower.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64_MCINSTLOWER_H -#define AArch64_MCINSTLOWER_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H #include "llvm/ADT/Triple.h" #include "llvm/Support/Compiler.h" @@ -33,7 +33,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower { Triple TargetTriple; public: - AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); + AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer); bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; void Lower(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/lib/Target/AArch64/AArch64MachineCombinerPattern.h new file mode 100644 index 0000000..4164b33 --- /dev/null +++ b/lib/Target/AArch64/AArch64MachineCombinerPattern.h @@ -0,0 +1,42 @@ +//===- AArch64MachineCombinerPattern.h -===// +//===- AArch64 instruction pattern supported by combiner -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines instruction pattern supported by combiner +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H + +namespace llvm { + +/// Enumeration of instruction pattern supported by machine combiner +/// +/// +namespace MachineCombinerPattern { +enum MC_PATTERN : int { + MC_NONE = 0, + MC_MULADDW_OP1 = 1, + MC_MULADDW_OP2 = 2, + MC_MULSUBW_OP1 = 3, + MC_MULSUBW_OP2 = 4, + MC_MULADDWI_OP1 = 5, + MC_MULSUBWI_OP1 = 6, + MC_MULADDX_OP1 = 7, + MC_MULADDX_OP2 = 8, + MC_MULSUBX_OP1 = 9, + MC_MULSUBX_OP2 = 10, + MC_MULADDXI_OP1 = 11, + MC_MULSUBXI_OP1 = 12 +}; +} // end namespace MachineCombinerPattern +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 7c257ba..536a8d0 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64MACHINEFUNCTIONINFO_H -#define AArch64MACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -160,4 +160,4 @@ private: }; } // End llvm namespace -#endif // AArch64MACHINEFUNCTIONINFO_H +#endif diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp new file mode 100644 index 0000000..f942c4e --- /dev/null +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -0,0 +1,383 @@ +//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file contains the AArch64 / Cortex-A57 specific register allocation +// constraints for use by the PBQP register allocator. +// +// It is essentially a transcription of what is contained in +// AArch64A57FPLoadBalancing, which tries to use a balanced +// mix of odd and even D-registers when performing a critical sequence of +// independent, non-quadword FP/ASIMD floating-point multiply-accumulates. 
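A minimal C++ sketch, not part of the patch, of the cost rule this constraint encodes -- it assumes registers are reduced to an index within their D/Q class, whereas the real pass works over the PBQP graph's allowed-register vectors and live intervals, as addIntraChainConstraint below shows:

#include <limits>

// Sketch only: same-parity destination/accumulator pairs are free, cross-parity
// pairs pay a small penalty, and pairs whose live ranges overlap on aliasing
// physical registers are ruled out -- mirroring the 0.0 / 1.0 / infinity
// entries addIntraChainConstraint writes into the PBQP cost matrix.
static bool sameParityIndex(unsigned IdxA, unsigned IdxB) {
  return (IdxA & 1) == (IdxB & 1);
}

static double chainPairCost(unsigned IdxA, unsigned IdxB, bool Conflict) {
  if (Conflict) // live ranges overlap and the physical registers alias
    return std::numeric_limits<double>::infinity();
  return sameParityIndex(IdxA, IdxB) ? 0.0 : 1.0;
}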
+//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "aarch64-pbqp" + +#include "AArch64.h" +#include "AArch64PBQPRegAlloc.h" +#include "AArch64RegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocPBQP.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +#ifndef NDEBUG +bool isFPReg(unsigned reg) { + return AArch64::FPR32RegClass.contains(reg) || + AArch64::FPR64RegClass.contains(reg) || + AArch64::FPR128RegClass.contains(reg); +} +#endif + +bool isOdd(unsigned reg) { + switch (reg) { + default: + llvm_unreachable("Register is not from the expected class !"); + case AArch64::S1: + case AArch64::S3: + case AArch64::S5: + case AArch64::S7: + case AArch64::S9: + case AArch64::S11: + case AArch64::S13: + case AArch64::S15: + case AArch64::S17: + case AArch64::S19: + case AArch64::S21: + case AArch64::S23: + case AArch64::S25: + case AArch64::S27: + case AArch64::S29: + case AArch64::S31: + case AArch64::D1: + case AArch64::D3: + case AArch64::D5: + case AArch64::D7: + case AArch64::D9: + case AArch64::D11: + case AArch64::D13: + case AArch64::D15: + case AArch64::D17: + case AArch64::D19: + case AArch64::D21: + case AArch64::D23: + case AArch64::D25: + case AArch64::D27: + case AArch64::D29: + case AArch64::D31: + case AArch64::Q1: + case AArch64::Q3: + case AArch64::Q5: + case AArch64::Q7: + case AArch64::Q9: + case AArch64::Q11: + case AArch64::Q13: + case AArch64::Q15: + case AArch64::Q17: + case AArch64::Q19: + case AArch64::Q21: + case AArch64::Q23: + case AArch64::Q25: + case AArch64::Q27: + case AArch64::Q29: + case AArch64::Q31: + return true; + case AArch64::S0: + case AArch64::S2: + case AArch64::S4: + case AArch64::S6: + case AArch64::S8: + case AArch64::S10: + case AArch64::S12: + case AArch64::S14: + case AArch64::S16: + case AArch64::S18: + case AArch64::S20: + case AArch64::S22: + case AArch64::S24: + case AArch64::S26: + case AArch64::S28: + case AArch64::S30: + case AArch64::D0: + case AArch64::D2: + case AArch64::D4: + case AArch64::D6: + case AArch64::D8: + case AArch64::D10: + case AArch64::D12: + case AArch64::D14: + case AArch64::D16: + case AArch64::D18: + case AArch64::D20: + case AArch64::D22: + case AArch64::D24: + case AArch64::D26: + case AArch64::D28: + case AArch64::D30: + case AArch64::Q0: + case AArch64::Q2: + case AArch64::Q4: + case AArch64::Q6: + case AArch64::Q8: + case AArch64::Q10: + case AArch64::Q12: + case AArch64::Q14: + case AArch64::Q16: + case AArch64::Q18: + case AArch64::Q20: + case AArch64::Q22: + case AArch64::Q24: + case AArch64::Q26: + case AArch64::Q28: + case AArch64::Q30: + return false; + + } +} + +bool haveSameParity(unsigned reg1, unsigned reg2) { + assert(isFPReg(reg1) && "Expecting an FP register for reg1"); + assert(isFPReg(reg2) && "Expecting an FP register for reg2"); + + return isOdd(reg1) == isOdd(reg2); +} + +} + +bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, + unsigned Ra) { + if (Rd == Ra) + return false; + + LiveIntervals &LIs = G.getMetadata().LIS; + + if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) { + DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd) + << '\n'); + DEBUG(dbgs() << "Ra is a physical reg:" << 
TRI->isPhysicalRegister(Ra) + << '\n'); + return false; + } + + PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd); + PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(Ra); + + const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed = + &G.getNodeMetadata(node1).getAllowedRegs(); + const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRaAllowed = + &G.getNodeMetadata(node2).getAllowedRegs(); + + PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2); + + // The edge does not exist. Create one with the appropriate interference + // costs. + if (edge == G.invalidEdgeId()) { + const LiveInterval &ld = LIs.getInterval(Rd); + const LiveInterval &la = LIs.getInterval(Ra); + bool livesOverlap = ld.overlaps(la); + + PBQPRAGraph::RawMatrix costs(vRdAllowed->size() + 1, + vRaAllowed->size() + 1, 0); + for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) { + unsigned pRd = (*vRdAllowed)[i]; + for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (livesOverlap && TRI->regsOverlap(pRd, pRa)) + costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity(); + else + costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0; + } + } + G.addEdge(node1, node2, std::move(costs)); + return true; + } + + if (G.getEdgeNode1Id(edge) == node2) { + std::swap(node1, node2); + std::swap(vRdAllowed, vRaAllowed); + } + + // Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass)) + PBQPRAGraph::RawMatrix costs(G.getEdgeCosts(edge)); + for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) { + unsigned pRd = (*vRdAllowed)[i]; + + // Get the maximum cost (excluding unallocatable reg) for same parity + // registers + PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min(); + for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (haveSameParity(pRd, pRa)) + if (costs[i + 1][j + 1] != + std::numeric_limits<PBQP::PBQPNum>::infinity() && + costs[i + 1][j + 1] > sameParityMax) + sameParityMax = costs[i + 1][j + 1]; + } + + // Ensure all registers with a different parity have a higher cost + // than sameParityMax + for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (!haveSameParity(pRd, pRa)) + if (sameParityMax > costs[i + 1][j + 1]) + costs[i + 1][j + 1] = sameParityMax + 1.0; + } + } + G.setEdgeCosts(edge, std::move(costs)); + + return true; +} + +void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, + unsigned Ra) { + LiveIntervals &LIs = G.getMetadata().LIS; + + // Do some Chain management + if (Chains.count(Ra)) { + if (Rd != Ra) { + DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to " + << PrintReg(Rd, TRI) << '\n';); + Chains.remove(Ra); + Chains.insert(Rd); + } + } else { + DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI) + << '\n';); + Chains.insert(Rd); + } + + PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd); + + const LiveInterval &ld = LIs.getInterval(Rd); + for (auto r : Chains) { + // Skip self + if (r == Rd) + continue; + + const LiveInterval &lr = LIs.getInterval(r); + if (ld.overlaps(lr)) { + const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed = + &G.getNodeMetadata(node1).getAllowedRegs(); + + PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(r); + const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRrAllowed = + &G.getNodeMetadata(node2).getAllowedRegs(); + + PBQPRAGraph::EdgeId edge = 
G.findEdge(node1, node2); + assert(edge != G.invalidEdgeId() && + "PBQP error ! The edge should exist !"); + + DEBUG(dbgs() << "Refining constraint !\n";); + + if (G.getEdgeNode1Id(edge) == node2) { + std::swap(node1, node2); + std::swap(vRdAllowed, vRrAllowed); + } + + // Enforce that cost is higher with all other Chains of the same parity + PBQP::Matrix costs(G.getEdgeCosts(edge)); + for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) { + unsigned pRd = (*vRdAllowed)[i]; + + // Get the maximum cost (excluding unallocatable reg) for all other + // parity registers + PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min(); + for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRrAllowed)[j]; + if (!haveSameParity(pRd, pRa)) + if (costs[i + 1][j + 1] != + std::numeric_limits<PBQP::PBQPNum>::infinity() && + costs[i + 1][j + 1] > sameParityMax) + sameParityMax = costs[i + 1][j + 1]; + } + + // Ensure all registers with same parity have a higher cost + // than sameParityMax + for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) { + unsigned pRa = (*vRrAllowed)[j]; + if (haveSameParity(pRd, pRa)) + if (sameParityMax > costs[i + 1][j + 1]) + costs[i + 1][j + 1] = sameParityMax + 1.0; + } + } + G.setEdgeCosts(edge, std::move(costs)); + } + } +} + +static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg, + const MachineInstr &MI) { + LiveInterval LI = LIs.getInterval(reg); + SlotIndex SI = LIs.getInstructionIndex(&MI); + return LI.expiredAt(SI); +} + +void A57ChainingConstraint::apply(PBQPRAGraph &G) { + const MachineFunction &MF = G.getMetadata().MF; + LiveIntervals &LIs = G.getMetadata().LIS; + + TRI = MF.getTarget().getSubtargetImpl()->getRegisterInfo(); + DEBUG(MF.dump()); + + for (const auto &MBB: MF) { + Chains.clear(); // FIXME: really needed ? Could not work at MF level ? + + for (const auto &MI: MBB) { + + // Forget Chains which have expired + for (auto r : Chains) { + SmallVector<unsigned, 8> toDel; + if(regJustKilledBefore(LIs, r, MI)) { + DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at "; + MI.print(dbgs());); + toDel.push_back(r); + } + + while (!toDel.empty()) { + Chains.remove(toDel.back()); + toDel.pop_back(); + } + } + + switch (MI.getOpcode()) { + case AArch64::FMSUBSrrr: + case AArch64::FMADDSrrr: + case AArch64::FNMSUBSrrr: + case AArch64::FNMADDSrrr: + case AArch64::FMSUBDrrr: + case AArch64::FMADDDrrr: + case AArch64::FNMSUBDrrr: + case AArch64::FNMADDDrrr: { + unsigned Rd = MI.getOperand(0).getReg(); + unsigned Ra = MI.getOperand(3).getReg(); + + if (addIntraChainConstraint(G, Rd, Ra)) + addInterChainConstraint(G, Rd, Ra); + break; + } + + case AArch64::FMLAv2f32: + case AArch64::FMLSv2f32: { + unsigned Rd = MI.getOperand(0).getReg(); + addInterChainConstraint(G, Rd, Rd); + break; + } + + default: + break; + } + } + } +} diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h new file mode 100644 index 0000000..4f656f9 --- /dev/null +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h @@ -0,0 +1,38 @@ +//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/PBQPRAConstraint.h" + +namespace llvm { + +/// Add the accumulator chaining constraint to a PBQP graph +class A57ChainingConstraint : public PBQPRAConstraint { +public: + // Add A57 specific constraints to the PBQP graph. + void apply(PBQPRAGraph &G) override; + +private: + SmallSetVector<unsigned, 32> Chains; + const TargetRegisterInfo *TRI; + + // Add the accumulator chaining constraint, inside the chain, i.e. so that + // parity(Rd) == parity(Ra). + // \return true if a constraint was added + bool addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra); + + // Add constraints between existing chains + void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra); +}; +} + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h index b22fa24..9e9eec4 100644 --- a/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -12,6 +12,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H + // 31 entries have cost 0 // 242 entries have cost 1 // 1447 entries have cost 2 @@ -6584,3 +6587,5 @@ static const unsigned PerfectShuffleTable[6561+1] = { 835584U, // <u,u,u,u>: Cost 0 copy LHS 0 }; + +#endif diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp index 4723cc4..97b0f0e 100644 --- a/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -21,18 +21,18 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -193,7 +193,7 @@ private: // Inserting into the DenseMap may invalidate existing iterator. // Keep a copy of the key to find the iterator to erase. Instruction *OldInstr = IPI->first; - InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second)); + InsertPts[NewPt] = std::move(IPI->second); // Erase IPI. IPI = InsertPts.find(OldInstr); InsertPts.erase(IPI); @@ -569,7 +569,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) { // global. Do not promote constant expressions either, as they may // require some code expansion. 
if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) && - AlreadyChecked.insert(Cst)) + AlreadyChecked.insert(Cst).second) LocalChange |= promoteConstant(Cst); } } diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 01b9587..d734d43 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -76,7 +76,7 @@ AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { BitVector AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); // FIXME: avoid re-calculating this every time. BitVector Reserved(getNumRegs()); @@ -105,7 +105,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); switch (Reg) { default: @@ -169,7 +169,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { unsigned AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP; } @@ -236,7 +236,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, // Note that the incoming offset is based on the SP value at function entry, // so it'll be negative. MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); MachineFrameInfo *MFI = MF.getFrameInfo(); // Estimate an offset from the frame pointer. 
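The mechanical change repeated through these AArch64RegisterInfo.cpp hunks (and in the load/store optimizer further up) is this release's accessor migration: frame lowering, instruction info and register info are now reached through the function's subtarget instead of through the TargetMachine. A small self-contained sketch of the new shape; hasFramePointer is a hypothetical helper, not code from the patch:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

// Hypothetical helper showing the post-migration access pattern used in the
// hunks above: MachineFunction::getSubtarget() returns the TargetSubtargetInfo,
// which owns the per-subtarget TargetFrameLowering object.
static bool hasFramePointer(const MachineFunction &MF) {
  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
  return TFI->hasFP(MF);
}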
@@ -326,7 +326,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>( - MF.getTarget().getFrameLowering()); + MF.getSubtarget().getFrameLowering()); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned FrameReg; @@ -364,7 +364,7 @@ namespace llvm { unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); switch (RC->getID()) { default: diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h index 76af1ed..51a5034 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AArch64REGISTERINFO_H -#define LLVM_TARGET_AArch64REGISTERINFO_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H #define GET_REGINFO_HEADER #include "AArch64GenRegisterInfo.inc" @@ -98,4 +98,4 @@ public: } // end namespace llvm -#endif // LLVM_TARGET_AArch64REGISTERINFO_H +#endif diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index a30e4ad..d5ff3f1 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -390,13 +390,14 @@ def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { } def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64], + v1i64, v4f16], 64, (sequence "D%u", 0, 31)>; // We don't (yet) have an f128 legal type, so don't use that here. We // normalize 128-bit vectors to v2f64 for arg passing and such, so use // that here. def FPR128 : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128, + v8f16], 128, (sequence "Q%u", 0, 31)>; // The lower 16 vector registers. Some instructions can only take registers diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td index 8209f96..3ec4157 100644 --- a/lib/Target/AArch64/AArch64SchedA57.td +++ b/lib/Target/AArch64/AArch64SchedA57.td @@ -12,11 +12,24 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// The Cortex-A57 is a traditional superscalar microprocessor with a +// conservative 3-wide in-order stage for decode and dispatch. Combined with the +// much wider out-of-order issue stage, this produced a need to carefully +// schedule micro-ops so that all three decoded each cycle are successfully +// issued, as the reservation station(s) simply don't stay occupied for long. +// Therefore, IssueWidth is set to the narrower of the two at three, while still +// modeling the machine as out-of-order. 
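The def that follows encodes that reasoning: IssueWidth now models the narrow 3-wide decode/dispatch stage rather than the eight out-of-order pipelines behind it. A standalone C++ sketch of why the narrowest stage is the one worth modelling; the numbers are taken from the model itself, and nothing here is part of the patch:

#include <cstdio>

int main() {
  const unsigned DispatchWidth = 3;   // in-order decode/dispatch per cycle
  const unsigned IssuePorts    = 8;   // out-of-order pipelines downstream
  const unsigned MicroOps      = 128; // one re-order buffer's worth of work
  // Sustained throughput is capped by the narrowest stage, so draining a full
  // buffer is bounded by dispatch no matter how many issue ports exist.
  unsigned DispatchBound = (MicroOps + DispatchWidth - 1) / DispatchWidth;
  unsigned IssueBound    = (MicroOps + IssuePorts - 1) / IssuePorts;
  std::printf("dispatch-bound: %u cycles, issue-bound: %u cycles\n",
              DispatchBound, IssueBound);
  return 0;
}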
+ def CortexA57Model : SchedMachineModel { - let IssueWidth = 8; // 3-way decode and 8-way issue + let IssueWidth = 3; // 3-way decode and dispatch let MicroOpBufferSize = 128; // 128 micro-op re-order buffer let LoadLatency = 4; // Optimistic load latency let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch + + // Enable partial & runtime unrolling. The magic number is chosen based on + // experiments and benchmarking data. + let LoopMicroOpBufferSize = 16; } //===----------------------------------------------------------------------===// @@ -24,18 +37,17 @@ def CortexA57Model : SchedMachineModel { // Cortex A-57 has 8 pipelines that each has its own 8-entry queue where // micro-ops wait for their operands and then issue out-of-order. -def A57UnitB : ProcResource<1> { let BufferSize = 8; } // Type B micro-ops -def A57UnitI : ProcResource<2> { let BufferSize = 8; } // Type I micro-ops -def A57UnitM : ProcResource<1> { let BufferSize = 8; } // Type M micro-ops -def A57UnitL : ProcResource<1> { let BufferSize = 8; } // Type L micro-ops -def A57UnitS : ProcResource<1> { let BufferSize = 8; } // Type S micro-ops -def A57UnitX : ProcResource<1> { let BufferSize = 8; } // Type X micro-ops -def A57UnitW : ProcResource<1> { let BufferSize = 8; } // Type W micro-ops +def A57UnitB : ProcResource<1>; // Type B micro-ops +def A57UnitI : ProcResource<2>; // Type I micro-ops +def A57UnitM : ProcResource<1>; // Type M micro-ops +def A57UnitL : ProcResource<1>; // Type L micro-ops +def A57UnitS : ProcResource<1>; // Type S micro-ops +def A57UnitX : ProcResource<1>; // Type X micro-ops +def A57UnitW : ProcResource<1>; // Type W micro-ops let SchedModel = CortexA57Model in { def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops } - let SchedModel = CortexA57Model in { //===----------------------------------------------------------------------===// @@ -71,7 +83,7 @@ def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>; def : SchedAlias<WriteF, A57Write_3cyc_1V>; def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>; def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>; -def : SchedAlias<WriteFCopy, A57Write_3cyc_1V>; +def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>; def : SchedAlias<WriteFImm, A57Write_3cyc_1V>; def : SchedAlias<WriteFMul, A57Write_5cyc_1V>; def : SchedAlias<WriteFDiv, A57Write_18cyc_1X>; @@ -85,13 +97,12 @@ def : WriteRes<WriteHint, []> { let Latency = 1; } def : WriteRes<WriteLDHi, []> { let Latency = 4; } -// Forwarding logic is not [yet] explicitly modeled beyond what is captured -// in the latencies of the A57 Generic SchedWriteRes's. 
+// Forwarding logic is only modeled for multiply and accumulate def : ReadAdvance<ReadI, 0>; def : ReadAdvance<ReadISReg, 0>; def : ReadAdvance<ReadIEReg, 0>; def : ReadAdvance<ReadIM, 0>; -def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>; def : ReadAdvance<ReadID, 0>; def : ReadAdvance<ReadExtrHi, 0>; def : ReadAdvance<ReadAdrBase, 0>; @@ -134,7 +145,13 @@ def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>; // Cryptography Extensions // ----------------------------------------------------------------------------- -def : InstRW<[A57Write_3cyc_1W], (instregex "CRC32")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>; +def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>; +def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^CRC32")>; // Vector Load @@ -301,4 +318,330 @@ def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_PO def : InstRW<[A57Write_8cyc_8S], (instregex "ST4Fourv(2d)$")>; def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; +// Vector - Integer +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v8i8, v4i16, v2i32 +// Q form - v16i8, v8i16, v4i32 +// D form - v1i8, v1i16, v1i32, v1i64 +// Q form - v16i8, v8i16, v4i32, v2i64 +// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64 +// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64 + +// ASIMD absolute diff accum, D-form +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>; + +// ASIMD arith, reduce, 4H/4S +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B/8H +def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B +def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU]?ADDL?Vv16i8v$")>; + +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B +def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>; + +// ASIMD multiply, D-form +def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>; +// ASIMD multiply, Q-form +def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; + +// ASIMD multiply accumulate, D-form +def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; + +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>; +def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>; + +// ASIMD multiply long +def : 
InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>; +def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>; +def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; + +// ASIMD shift by immed, complex +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>; + + +// ASIMD shift by register, basic, Q-form +def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD shift by register, complex, D-form +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; + +// ASIMD shift by register, complex, Q-form +def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + + +// Vector - Floating Point +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v2f32 +// Q form - v4f32, v2f64 +// D form - 32, 64 +// D form - v1i32, v1i64 +// D form - v2i32 +// Q form - v4i32, v2i64 + +// ASIMD FP arith, normal, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^(FABD|FADD|FSUB)(v2f32|32|64|v2i32p)")>; +// ASIMD FP arith, normal, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^(FABD|FADD|FSUB)(v4f32|v2f64|v2i64p)")>; + +// ASIMD FP arith, pairwise, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^FADDP(v2f32|32|64|v2i32)")>; +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[A57Write_9cyc_3V], (instregex "^FADDP(v4f32|v2f64|v2i64)")>; + +// ASIMD FP compare, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v2f32|32|64|v1i32|v2i32|v1i64)")>; +// ASIMD FP compare, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP convert, long and narrow +def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[A57Write_18cyc_1X], (instregex "FDIVv2f32")>; +// ASIMD FP divide, Q-form, F32 +def : InstRW<[A57Write_36cyc_2X], (instregex "FDIVv4f32")>; +// ASIMD FP divide, Q-form, F64 +def : InstRW<[A57Write_64cyc_2X], (instregex "FDIVv2f64")>; + +// Note: These were simply duplicated from ASIMD FDIV because of missing documentation +// ASIMD FP square root, D-form, F32 +def : InstRW<[A57Write_18cyc_1X], (instregex "FSQRTv2f32")>; +// ASIMD FP square root, Q-form, F32 +def : InstRW<[A57Write_36cyc_2X], (instregex "FSQRTv4f32")>; +// ASIMD FP square root, Q-form, F64 +def : InstRW<[A57Write_64cyc_2X], (instregex "FSQRTv2f64")>; + +// ASIMD FP max/min, normal, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?(v2f32)")>; +// ASIMD FP max/min, normal, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^(FMAX|FMIN)(NM)?(v4f32|v2f64)")>; +// ASIMD FP max/min, pairwise, D-form +def : 
InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?P(v2f32|v2i32)")>; +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i64)")>; +// ASIMD FP max/min, reduce +def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>; + +// ASIMD FP multiply, D-form, FZ +def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +// ASIMD FP multiply, Q-form, FZ +def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP multiply accumulate, D-form, FZ +// ASIMD FP multiply accumulate, Q-form, FZ +def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; } +def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>; +def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP round, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; + + +// Vector - Miscellaneous +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v8i8, v4i16, v2i32 +// Q form - v16i8, v8i16, v4i32 +// D form - v1i8, v1i16, v1i32, v1i64 +// Q form - v16i8, v8i16, v4i32, v2i64 + +// ASIMD bitwise insert, Q-form +def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>; + +// ASIMD duplicate, gen reg, D-form and Q-form +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>; + +// ASIMD move, saturating +def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]QXTU?N")>; + +// ASIMD reciprocal estimate, D-form +def : InstRW<[A57Write_5cyc_1V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD reciprocal estimate, Q-form +def : InstRW<[A57Write_5cyc_2V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f64|v4f32|v4i32)")>; + +// ASIMD reciprocal step, D-form, FZ +def : InstRW<[A57Write_9cyc_1V], (instregex "^F(RECP|RSQRT)S(v2f32|v1i32|v2i32|v1i64|32|64)")>; +// ASIMD reciprocal step, Q-form, FZ +def : InstRW<[A57Write_9cyc_2V], (instregex "^F(RECP|RSQRT)S(v2f64|v4f32|v4i32)")>; + +// ASIMD table lookup, D-form +def : InstRW<[A57Write_3cyc_1V], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[A57Write_6cyc_2V], (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[A57Write_9cyc_3V], (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[A57Write_12cyc_4V], (instregex "^TB[LX]v8i8Four")>; +// ASIMD table lookup, Q-form +def : InstRW<[A57Write_6cyc_3V], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[A57Write_9cyc_5V], (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[A57Write_12cyc_7V], (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[A57Write_15cyc_9V], (instregex "^TB[LX]v16i8Four")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[A57Write_6cyc_1I_1L], (instregex "^[SU]MOVv")>; + +// ASIMD transfer, gen reg to element +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^INSv")>; + +// ASIMD unzip/zip, Q-form +def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>; + + +// Remainder +// ----------------------------------------------------------------------------- + +def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>; + +def 
A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; } +def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>; +def A57ReadFPM : SchedReadAdvance<0>; +def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; + +def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; +def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[SU]CVTF")>; + +def : InstRW<[A57Write_32cyc_1X], (instrs FDIVDrr)>; +def : InstRW<[A57Write_18cyc_1X], (instrs FDIVSrr)>; + +def : InstRW<[A57Write_5cyc_1V], (instregex "^F(MAX|MIN).+rr")>; + +def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT.+r")>; + +def : InstRW<[A57Write_32cyc_1X], (instrs FSQRTDr)>; +def : InstRW<[A57Write_18cyc_1X], (instrs FSQRTSr)>; + +def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpre)>; +def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpost)>; +def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpre)>; +def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpost)>; +def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpre)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpre)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRBpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRBpre)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRBui)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRDl)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRDpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRDpre)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRDui)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroW)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRHpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRHpre)>; +def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRHui)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRQl)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRQpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRQpre)>; +def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroW)>; +def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRQui)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroW)>; +def : 
InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRSl)>; +def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRSpost)>; +def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRSpre)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDRSui)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURBi)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURDi)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURHi)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURQi)>; +def : InstRW<[A57Write_5cyc_1L], (instrs LDURSi)>; + +def : InstRW<[A57Write_2cyc_2S], (instrs STNPDi)>; +def : InstRW<[A57Write_4cyc_1I_4S], (instrs STNPQi)>; +def : InstRW<[A57Write_2cyc_2S], (instrs STNPXi)>; +def : InstRW<[A57Write_2cyc_2S], (instrs STPDi)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpost)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpre)>; +def : InstRW<[A57Write_4cyc_1I_4S], (instrs STPQi)>; +def : InstRW<[WriteAdr, A57Write_4cyc_1I_4S], (instrs STPQpost)>; +def : InstRW<[WriteAdr, A57Write_4cyc_2I_4S], (instrs STPQpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpre)>; +def : InstRW<[A57Write_2cyc_2S], (instrs STPXi)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpost)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRBpre)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroW)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroX)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRDpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRDpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpre)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroW)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroX)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRHpre)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroW)>; +def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroX)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQpost)>; +def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STRQpre)>; +def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroW)>; +def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroX)>; +def : InstRW<[A57Write_2cyc_1I_2S], (instrs STRQui)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRSpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRSpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpre)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs 
STRXpost)>; +def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpre)>; +def : InstRW<[A57Write_2cyc_2S], (instrs STURQi)>; + } // SchedModel = CortexA57Model diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td index a8f421b..6f30108 100644 --- a/lib/Target/AArch64/AArch64SchedA57WriteRes.td +++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td @@ -28,14 +28,18 @@ def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } -def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; } -def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; } +def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; + let ResourceCycles = [18]; } +def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; + let ResourceCycles = [19]; } def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; } def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; } def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; } def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; } -def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; } -def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; } +def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; + let ResourceCycles = [32]; } +def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; + let ResourceCycles = [35]; } def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; } def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; } def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; } @@ -53,6 +57,7 @@ def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 64; let NumMicroOps = 2; + let ResourceCycles = [32, 32]; } def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> { @@ -137,6 +142,7 @@ def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 36; let NumMicroOps = 2; + let ResourceCycles = [18, 18]; } def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI, A57UnitM]> { @@ -153,6 +159,10 @@ def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS, let Latency = 3; let NumMicroOps = 2; } +def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 3; + let NumMicroOps = 2; +} def A57Write_4cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> { let Latency = 4; @@ -295,6 +305,11 @@ def A57Write_9cyc_1L_3V : SchedWriteRes<[A57UnitL, let Latency = 9; let NumMicroOps = 4; } +def A57Write_12cyc_4V : SchedWriteRes<[A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 12; + let NumMicroOps = 4; +} //===----------------------------------------------------------------------===// @@ -334,6 +349,11 @@ def A57Write_9cyc_2L_3V : SchedWriteRes<[A57UnitL, A57UnitL, let Latency = 9; let NumMicroOps = 5; } +def A57Write_9cyc_5V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 9; + let NumMicroOps = 5; +} //===----------------------------------------------------------------------===// @@ -399,7 +419,7 @@ def A57Write_4cyc_1I_4S_2V : SchedWriteRes<[A57UnitI, let Latency = 4; let NumMicroOps = 
7; } -def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI, +def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI, A57UnitS, A57UnitS, A57UnitS, A57UnitS, A57UnitS, A57UnitS]> { let Latency = 6; @@ -412,6 +432,12 @@ def A57Write_9cyc_1I_2L_4V : SchedWriteRes<[A57UnitI, let Latency = 9; let NumMicroOps = 7; } +def A57Write_12cyc_7V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV, + A57UnitV, A57UnitV, + A57UnitV, A57UnitV]> { + let Latency = 12; + let NumMicroOps = 7; +} //===----------------------------------------------------------------------===// @@ -443,11 +469,11 @@ def A57Write_8cyc_8S : SchedWriteRes<[A57UnitS, A57UnitS, //===----------------------------------------------------------------------===// // Define Generic 9 micro-op types -def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI, - A57UnitS, A57UnitS, - A57UnitS, A57UnitS, - A57UnitS, A57UnitS, - A57UnitS, A57UnitS]> { +def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS, + A57UnitS, A57UnitS]> { let Latency = 8; let NumMicroOps = 9; } @@ -459,6 +485,12 @@ def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI, let Latency = 11; let NumMicroOps = 9; } +def A57Write_15cyc_9V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV, + A57UnitV, A57UnitV, A57UnitV, + A57UnitV, A57UnitV, A57UnitV]> { + let Latency = 15; + let NumMicroOps = 9; +} //===----------------------------------------------------------------------===// diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 1bf64fc..0cfd582 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -36,8 +36,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( // instead of memset. 
if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { const AArch64TargetLowering &TLI = - *static_cast<const AArch64TargetLowering *>( - DAG.getTarget().getTargetLowering()); + *DAG.getTarget().getSubtarget<AArch64Subtarget>().getTargetLowering(); EVT IntPtr = TLI.getPointerTy(); Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 1180eea..11932d2 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64SELECTIONDAGINFO_H -#define AArch64SELECTIONDAGINFO_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H #include "llvm/Target/TargetSelectionDAGInfo.h" diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 45f8ddb..0c36e8f 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -39,7 +39,7 @@ public: static char ID; AArch64StorePairSuppress() : MachineFunctionPass(ID) {} - virtual const char *getPassName() const override { + const char *getPassName() const override { return "AArch64 Store Pair Suppression"; } @@ -50,7 +50,7 @@ private: bool isNarrowFPStore(const MachineInstr &MI); - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<MachineTraceMetrics>(); AU.addPreserved<MachineTraceMetrics>(); @@ -85,8 +85,7 @@ bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) // If a subtarget does not define resources for STPQi, bail here. 
if (SCDesc->isValid() && !SCDesc->isVariant()) { - unsigned ResLenWithSTP = BBTrace.getResourceLength( - ArrayRef<const MachineBasicBlock *>(), SCDesc); + unsigned ResLenWithSTP = BBTrace.getResourceLength(None, SCDesc); if (ResLenWithSTP > ResLength) { DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber() << " resources " << ResLength << " -> " << ResLenWithSTP @@ -118,12 +117,13 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { MF = &mf; - TII = static_cast<const AArch64InstrInfo *>(MF->getTarget().getInstrInfo()); - TRI = MF->getTarget().getRegisterInfo(); + TII = + static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo()); + TRI = MF->getSubtarget().getRegisterInfo(); MRI = &MF->getRegInfo(); const TargetSubtargetInfo &ST = MF->getTarget().getSubtarget<TargetSubtargetInfo>(); - SchedModel.init(*ST.getSchedModel(), &ST, TII); + SchedModel.init(ST.getSchedModel(), &ST, TII); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index bb0b72c..47b5d54 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64PBQPRegAlloc.h" #include "AArch64Subtarget.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -43,8 +44,8 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { AArch64Subtarget::AArch64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, TargetMachine &TM, - bool LittleEndian) + const std::string &FS, + const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU), @@ -64,13 +65,7 @@ AArch64Subtarget::AArch64Subtarget(const std::string &TT, unsigned char AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { - - // Determine whether this is a reference to a definition or a declaration. - // Materializable GVs (in JIT lazy compilation mode) do not require an extra - // load from stub. - bool isDecl = GV->hasAvailableExternallyLinkage(); - if (GV->isDeclaration() && !GV->isMaterializable()) - isDecl = true; + bool isDecl = GV->isDeclarationForLinker(); // MachO large model always goes via a GOT, simply to get a single 8-byte // absolute relocation on all global addresses. @@ -78,10 +73,15 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, return AArch64II::MO_GOT; // The small code mode's direct accesses use ADRP, which cannot necessarily - // produce the value 0 (if the code is above 4GB). Therefore they must use the - // GOT. - if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) - return AArch64II::MO_GOT; + // produce the value 0 (if the code is above 4GB). + if (TM.getCodeModel() == CodeModel::Small && + GV->isWeakForLinker() && isDecl) { + // In PIC mode use the GOT, but in absolute mode use a constant pool load. 
+ if (TM.getRelocationModel() == Reloc::Static) + return AArch64II::MO_CONSTPOOL; + else + return AArch64II::MO_GOT; + } // If symbol visibility is hidden, the extra load is not needed if // the symbol is definitely defined in the current translation unit. @@ -128,3 +128,11 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, bool AArch64Subtarget::enableEarlyIfConversion() const { return EnableEarlyIfConvert; } + +std::unique_ptr<PBQPRAConstraint> +AArch64Subtarget::getCustomPBQPConstraints() const { + if (!isCortexA57()) + return nullptr; + + return llvm::make_unique<A57ChainingConstraint>(); +} diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 52124f6..e2740f1 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -11,12 +11,12 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64SUBTARGET_H -#define AArch64SUBTARGET_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H -#include "AArch64InstrInfo.h" #include "AArch64FrameLowering.h" #include "AArch64ISelLowering.h" +#include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" #include "llvm/IR/DataLayout.h" @@ -69,18 +69,27 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. AArch64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, TargetMachine &TM, bool LittleEndian); + const std::string &FS, const TargetMachine &TM, + bool LittleEndian); - const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } - const AArch64FrameLowering *getFrameLowering() const { + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + const AArch64FrameLowering *getFrameLowering() const override { return &FrameLowering; } - const AArch64TargetLowering *getTargetLowering() const { + const AArch64TargetLowering *getTargetLowering() const override { return &TLInfo; } - const AArch64InstrInfo *getInstrInfo() const { return &InstrInfo; } - const DataLayout *getDataLayout() const { return &DL; } + const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const DataLayout *getDataLayout() const override { return &DL; } + const AArch64RegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } bool enableMachineScheduler() const override { return true; } + bool enablePostMachineScheduler() const override { + return isCortexA53() || isCortexA57(); + } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } @@ -94,12 +103,19 @@ public: bool isLittleEndian() const { return DL.isLittleEndian(); } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetIOS() const { return TargetTriple.isiOS(); } + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetWindows() const { return TargetTriple.isOSWindows(); } + bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isCyclone() const { return CPUString == "cyclone"; } + bool isCortexA57() const { return CPUString == "cortex-a57"; } + bool isCortexA53() const { return CPUString == "cortex-a53"; } + + bool useAA() const override { return isCortexA53(); } /// 
getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -126,7 +142,9 @@ public: unsigned NumRegionInstrs) const override; bool enableEarlyIfConversion() const override; + + std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override; }; } // End llvm namespace -#endif // AArch64SUBTARGET_H +#endif diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index f99b90b..d4f19d2 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -12,8 +12,11 @@ #include "AArch64.h" #include "AArch64TargetMachine.h" -#include "llvm/PassManager.h" +#include "AArch64TargetObjectFile.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/IR/Function.h" +#include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" @@ -24,6 +27,10 @@ static cl::opt<bool> EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnableMCR("aarch64-mcr", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + static cl::opt<bool> EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"), cl::init(true), cl::Hidden); @@ -59,13 +66,41 @@ EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden, " to make use of cmpxchg flow-based information"), cl::init(true)); +static cl::opt<bool> +EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, + cl::desc("Run early if-conversion"), + cl::init(true)); + +static cl::opt<bool> +EnableCondOpt("aarch64-condopt", + cl::desc("Enable the condition optimizer pass"), + cl::init(true), cl::Hidden); + +static cl::opt<bool> +EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden, + cl::desc("Work around Cortex-A53 erratum 835769"), + cl::init(false)); + +static cl::opt<bool> +EnableGEPOpt("aarch64-gep-opt", cl::Hidden, + cl::desc("Enable optimizations on complex GEPs"), + cl::init(true)); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget); RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget); + RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target); +} + +//===----------------------------------------------------------------------===// +// AArch64 Lowering public interface. +//===----------------------------------------------------------------------===// +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) + return make_unique<AArch64_MachoTargetObjectFile>(); - RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64leTarget); - RegisterTargetMachine<AArch64beTargetMachine> W(TheARM64beTarget); + return make_unique<AArch64_ELFTargetObjectFile>(); } /// TargetMachine ctor - Create an AArch64 architecture model. 
@@ -77,10 +112,39 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool LittleEndian) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, *this, LittleEndian) { + TLOF(createTLOF(Triple(getTargetTriple()))), + Subtarget(TT, CPU, FS, *this, LittleEndian), isLittle(LittleEndian) { initAsmInfo(); } +AArch64TargetMachine::~AArch64TargetMachine() {} + +const AArch64Subtarget * +AArch64TargetMachine::getSubtargetImpl(const Function &F) const { + AttributeSet FnAttrs = F.getAttributes(); + Attribute CPUAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); + Attribute FSAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? FSAttr.getValueAsString().str() + : TargetFS; + + auto &I = SubtargetMap[CPU + FS]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. + resetTargetOptions(F); + I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, isLittle); + } + return I.get(); +} + void AArch64leTargetMachine::anchor() { } AArch64leTargetMachine:: @@ -104,7 +168,10 @@ namespace { class AArch64PassConfig : public TargetPassConfig { public: AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + if (TM->getOptLevel() != CodeGenOpt::None) + substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); + } AArch64TargetMachine &getAArch64TargetMachine() const { return getTM<AArch64TargetMachine>(); @@ -114,10 +181,10 @@ public: bool addPreISel() override; bool addInstSelector() override; bool addILPOpts() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreSched2() override; - bool addPreEmitPass() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; }; } // namespace @@ -136,7 +203,7 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { void AArch64PassConfig::addIRPasses() { // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg // ourselves. - addPass(createAtomicExpandLoadLinkedPass(TM)); + addPass(createAtomicExpandPass(TM)); // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in @@ -145,6 +212,19 @@ void AArch64PassConfig::addIRPasses() { addPass(createCFGSimplificationPass()); TargetPassConfig::addIRPasses(); + + if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { + // Call SeparateConstOffsetFromGEP pass to extract constants within indices + // and lower a GEP with multiple indices to either arithmetic operations or + // multiple GEPs with single index. + addPass(createSeparateConstOffsetFromGEPPass(TM, true)); + // Call EarlyCSE pass to find and remove subexpressions in the lowered + // result. + addPass(createEarlyCSEPass()); + // Do loop invariant code motion in case part of the lowered result is + // invariant. 
+ addPass(createLICMPass()); + } } // Pass Pipeline Configuration @@ -174,43 +254,56 @@ bool AArch64PassConfig::addInstSelector() { } bool AArch64PassConfig::addILPOpts() { + if (EnableCondOpt) + addPass(createAArch64ConditionOptimizerPass()); if (EnableCCMP) addPass(createAArch64ConditionalCompares()); - addPass(&EarlyIfConverterID); + if (EnableMCR) + addPass(&MachineCombinerID); + if (EnableEarlyIfConversion) + addPass(&EarlyIfConverterID); if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); return true; } -bool AArch64PassConfig::addPreRegAlloc() { +void AArch64PassConfig::addPreRegAlloc() { // Use AdvSIMD scalar instructions whenever profitable. - if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) + if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) { addPass(createAArch64AdvSIMDScalar()); - return true; + // The AdvSIMD pass may produce copies that can be rewritten to + // be register coalescer friendly. + addPass(&PeepholeOptimizerID); + } } -bool AArch64PassConfig::addPostRegAlloc() { +void AArch64PassConfig::addPostRegAlloc() { // Change dead register definitions to refer to the zero register. if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) addPass(createAArch64DeadRegisterDefinitions()); - return true; + if (TM->getOptLevel() != CodeGenOpt::None && + (TM->getSubtarget<AArch64Subtarget>().isCortexA53() || + TM->getSubtarget<AArch64Subtarget>().isCortexA57()) && + usingDefaultRegAlloc()) + // Improve performance for some FP/SIMD code for A57. + addPass(createAArch64A57FPLoadBalancing()); } -bool AArch64PassConfig::addPreSched2() { +void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); - return true; } -bool AArch64PassConfig::addPreEmitPass() { +void AArch64PassConfig::addPreEmitPass() { + if (EnableA53Fix835769) + addPass(createAArch64A53Fix835769()); // Relax conditional branch instructions if they're otherwise out of // range of their destination. 
addPass(createAArch64BranchRelaxation()); if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && TM->getSubtarget<AArch64Subtarget>().isTargetMachO()) addPass(createAArch64CollectLOHPass()); - return true; } diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 852cb3f..75c65c5 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64TARGETMACHINE_H -#define AArch64TARGETMACHINE_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" @@ -23,7 +23,9 @@ namespace llvm { class AArch64TargetMachine : public LLVMTargetMachine { protected: + std::unique_ptr<TargetLoweringObjectFile> TLOF; AArch64Subtarget Subtarget; + mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap; public: AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, @@ -31,33 +33,25 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool IsLittleEndian); + ~AArch64TargetMachine() override; + const AArch64Subtarget *getSubtargetImpl() const override { return &Subtarget; } - const AArch64TargetLowering *getTargetLowering() const override { - return getSubtargetImpl()->getTargetLowering(); - } - const DataLayout *getDataLayout() const override { - return getSubtargetImpl()->getDataLayout(); - } - const AArch64FrameLowering *getFrameLowering() const override { - return getSubtargetImpl()->getFrameLowering(); - } - const AArch64InstrInfo *getInstrInfo() const override { - return getSubtargetImpl()->getInstrInfo(); - } - const AArch64RegisterInfo *getRegisterInfo() const override { - return &getInstrInfo()->getRegisterInfo(); - } - const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { - return getSubtargetImpl()->getSelectionDAGInfo(); - } + const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; /// \brief Register AArch64 analysis passes with a pass manager. void addAnalysisPasses(PassManagerBase &PM) override; + + TargetLoweringObjectFile* getObjFileLowering() const override { + return TLOF.get(); + } + +private: + bool isLittle; }; // AArch64leTargetMachine - AArch64 little endian target machine. 
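Editor's note on the new per-function subtarget lookup above: AArch64TargetMachine::getSubtargetImpl(const Function &) keys a cache on the concatenated "target-cpu" and "target-features" attribute strings, so every function with the same attribute pair shares one lazily created AArch64Subtarget. The snippet below is only a minimal, self-contained sketch of that caching pattern, reduced to plain C++; the types SubtargetSketch/TargetMachineSketch and the getSubtargetFor name are hypothetical stand-ins, not the LLVM classes in this diff.

// Sketch only: mirrors the "one subtarget per (cpu + features) key" caching
// idea from getSubtargetImpl(const Function &) above, using standard containers.
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct SubtargetSketch {
  std::string CPU, Features;
  SubtargetSketch(std::string C, std::string F)
      : CPU(std::move(C)), Features(std::move(F)) {}
};

class TargetMachineSketch {
  std::string DefaultCPU = "generic";
  std::string DefaultFS;
  // Mutable so a const target machine can still populate the cache lazily,
  // like the mutable SubtargetMap member introduced in the diff.
  mutable std::map<std::string, std::unique_ptr<SubtargetSketch>> SubtargetMap;

public:
  // Per-function lookup: fall back to the module-level defaults when the
  // function carries no cpu/feature attributes, then reuse any cached entry.
  const SubtargetSketch *getSubtargetFor(const std::string &CPUAttr,
                                         const std::string &FSAttr) const {
    const std::string CPU = CPUAttr.empty() ? DefaultCPU : CPUAttr;
    const std::string FS = FSAttr.empty() ? DefaultFS : FSAttr;
    std::unique_ptr<SubtargetSketch> &Slot = SubtargetMap[CPU + FS];
    if (!Slot)
      Slot.reset(new SubtargetSketch(CPU, FS));
    return Slot.get();
  }
};

int main() {
  TargetMachineSketch TM;
  const SubtargetSketch *A = TM.getSubtargetFor("cortex-a57", "+neon");
  const SubtargetSketch *B = TM.getSubtargetFor("cortex-a57", "+neon");
  const SubtargetSketch *C = TM.getSubtargetFor("", "");
  std::cout << (A == B) << " " << (A != C) << "\n"; // prints "1 1"
}

The lazy, key-based cache is what makes it safe for the target machine to hand out distinct subtargets for functions compiled with different attributes while avoiding repeated construction for the common case where every function uses the module defaults.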
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index de63cb4..2e595f9 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H -#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetLoweringObjectFile.h" diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 1dac14b..b1a2914 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -51,7 +51,7 @@ public: AArch64TTI(const AArch64TargetMachine *TM) : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { + TLI(TM->getSubtargetImpl()->getTargetLowering()) { initializeAArch64TTIPass(*PassRegistry::getPassRegistry()); } @@ -104,7 +104,7 @@ public: return 64; } - unsigned getMaximumUnrollFactor() const override { return 2; } + unsigned getMaxInterleaveFactor() const override; unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const override; @@ -112,10 +112,11 @@ public: unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const override; - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const - override; + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, + OperandValueKind Opd2Info = OK_AnyValue, + OperandValueProperties Opd1PropInfo = OP_None, + OperandValueProperties Opd2PropInfo = OP_None) const override; unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; @@ -124,6 +125,13 @@ public: unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const override; + + unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override; + + void getUnrollingPreferences(const Function *F, Loop *L, + UnrollingPreferences &UP) const override; + + /// @} }; @@ -400,18 +408,42 @@ unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, return 2; } -unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info, - OperandValueKind Opd2Info) const { +unsigned AArch64TTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, + OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); + if (ISD == ISD::SDIV && + Opd2Info == TargetTransformInfo::OK_UniformConstantValue && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + // On AArch64, scalar signed division by a power-of-two constant is + // normally expanded to the sequence ADD + CMP + SELECT + SRA. + // The OperandValue properties may not be the same as those of the previous + // operation; conservatively assume OP_None. 
+ unsigned Cost = + getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + return Cost; + } + switch (ISD) { default: - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, - Opd2Info); + return TargetTransformInfo::getArithmeticInstrCost( + Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: @@ -498,3 +530,27 @@ unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } + +unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const { + unsigned Cost = 0; + for (auto *I : Tys) { + if (!I->isVectorTy()) + continue; + if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128) + Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) + + getMemoryOpCost(Instruction::Load, I, 128, 0); + } + return Cost; +} + +unsigned AArch64TTI::getMaxInterleaveFactor() const { + if (ST->isCortexA57()) + return 4; + return 2; +} + +void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L, + UnrollingPreferences &UP) const { + // Disable partial & runtime unrolling on -Os. + UP.PartialOptSizeThreshold = 0; +} diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 37e9296..8eb906b 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -10,27 +10,28 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" #include <cstdio> using namespace llvm; @@ -42,7 +43,6 @@ class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. MCSubtargetInfo &STI; - MCAsmParser &Parser; // Map of register aliases registers via the .req directive. 
StringMap<std::pair<bool, unsigned> > RegisterReqs; @@ -52,10 +52,7 @@ private: return static_cast<AArch64TargetStreamer &>(TS); } - MCAsmParser &getParser() const { return Parser; } - MCAsmLexer &getLexer() const { return Parser.getLexer(); } - - SMLoc getLoc() const { return Parser.getTok().getLoc(); } + SMLoc getLoc() const { return getParser().getTok().getLoc(); } bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); AArch64CC::CondCode parseCondCodeString(StringRef Cond); @@ -69,11 +66,13 @@ private: bool parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode); - void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); } bool showMatchError(SMLoc Loc, unsigned ErrCode); bool parseDirectiveWord(unsigned Size, SMLoc L); + bool parseDirectiveInst(SMLoc L); + bool parseDirectiveTLSDescCall(SMLoc L); bool parseDirectiveLOH(StringRef LOH, SMLoc L); @@ -85,7 +84,7 @@ private: bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, - unsigned &ErrorInfo, + uint64_t &ErrorInfo, bool MatchingInlineAsm) override; /// @name Auto-generated Match Functions /// { @@ -117,10 +116,11 @@ public: AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + : MCTargetAsmParser(), STI(_STI) { MCAsmParserExtension::Initialize(_Parser); - if (Parser.getStreamer().getTargetStreamer() == nullptr) - new AArch64TargetStreamer(Parser.getStreamer()); + MCStreamer &S = getParser().getStreamer(); + if (S.getTargetStreamer() == nullptr) + new AArch64TargetStreamer(S); // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -1875,6 +1875,7 @@ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, /// Identifier when called, and if it is a register name the token is eaten and /// the register is added to the operand list. int AArch64AsmParser::tryParseRegister() { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); @@ -1899,6 +1900,7 @@ int AArch64AsmParser::tryParseRegister() { /// tryMatchVectorRegister - Try to parse a vector register name with optional /// kind specifier. If it is a register specifier, eat the token and return it. int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { + MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::Identifier)) { TokError("vector register expected"); return -1; @@ -1931,6 +1933,7 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { /// tryParseSysCROperand - Try to parse a system instruction CR operand name. AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); if (Parser.getTok().isNot(AsmToken::Identifier)) { @@ -1960,6 +1963,7 @@ AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) { /// tryParsePrefetch - Try to parse a prefetch operand. 
AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); const AsmToken &Tok = Parser.getTok(); // Either an identifier for named values or a 5-bit immediate. @@ -2007,6 +2011,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { /// instruction. AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); const MCExpr *Expr; @@ -2057,6 +2062,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { /// instruction. AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); const MCExpr *Expr; @@ -2076,6 +2082,7 @@ AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) { /// tryParseFPImm - A floating point immediate expression operand. AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); bool Hash = false; @@ -2138,6 +2145,7 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { /// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); if (Parser.getTok().is(AsmToken::Hash)) @@ -2229,6 +2237,7 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { /// parseCondCode - Parse a Condition Code operand. bool AArch64AsmParser::parseCondCode(OperandVector &Operands, bool invertCondCode) { + MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); const AsmToken &Tok = Parser.getTok(); assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); @@ -2254,6 +2263,7 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands, /// them if present. AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); std::string LowerID = Tok.getString().lower(); AArch64_AM::ShiftExtendType ShOp = @@ -2299,10 +2309,11 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { if (Hash) Parser.Lex(); // Eat the '#'. - // Make sure we do actually have a number - if (!Parser.getTok().is(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), - "expected integer shift amount"); + // Make sure we do actually have a number or a parenthesized expression. 
+ SMLoc E = Parser.getTok().getLoc(); + if (!Parser.getTok().is(AsmToken::Integer) && + !Parser.getTok().is(AsmToken::LParen)) { + Error(E, "expected integer shift amount"); return MatchOperand_ParseFail; } @@ -2312,11 +2323,11 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); if (!MCE) { - TokError("expected #imm after shift specifier"); + Error(E, "expected constant '#imm' after shift specifier"); return MatchOperand_ParseFail; } - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); Operands.push_back(AArch64Operand::CreateShiftExtend( ShOp, MCE->getValue(), true, S, E, getContext())); return MatchOperand_Success; @@ -2333,6 +2344,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, Operands.push_back( AArch64Operand::CreateToken("sys", false, NameLoc, getContext())); + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); StringRef Op = Tok.getString(); SMLoc S = Tok.getLoc(); @@ -2571,6 +2583,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); // Can be either a #imm style literal or an option name @@ -2624,6 +2637,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); if (Tok.isNot(AsmToken::Identifier)) @@ -2638,6 +2652,7 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { /// tryParseVectorRegister - Parse a vector register operand. bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::Identifier)) return true; @@ -2686,6 +2701,7 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { /// parseRegister - Parse a non-vector register operand. bool AArch64AsmParser::parseRegister(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); SMLoc S = getLoc(); // Try for a vector register. if (!tryParseVectorRegister(Operands)) @@ -2728,6 +2744,7 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) { } bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { + MCAsmParser &Parser = getParser(); bool HasELFModifier = false; AArch64MCExpr::VariantKind RefKind; @@ -2806,6 +2823,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { /// parseVectorList - Parse a vector list operand for AdvSIMD instructions. bool AArch64AsmParser::parseVectorList(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket"); SMLoc S = getLoc(); Parser.Lex(); // Eat left bracket token. @@ -2904,6 +2922,7 @@ bool AArch64AsmParser::parseVectorList(OperandVector &Operands) { AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); if (!Tok.is(AsmToken::Identifier)) return MatchOperand_NoMatch; @@ -2949,6 +2968,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { /// operand regardless of the mnemonic. 
bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode) { + MCAsmParser &Parser = getParser(); // Check if the current operand has a custom associated parser, if so, try to // custom parse the operand, or fallback to the general approach. OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); @@ -3114,6 +3134,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { + MCAsmParser &Parser = getParser(); Name = StringSwitch<StringRef>(Name.lower()) .Case("beq", "b.eq") .Case("bne", "b.ne") @@ -3562,12 +3583,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { } } -static const char *getSubtargetFeatureName(unsigned Val); +static const char *getSubtargetFeatureName(uint64_t Val); bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, - unsigned &ErrorInfo, + uint64_t &ErrorInfo, bool MatchingInlineAsm) { assert(!Operands.empty() && "Unexpect empty operand list!"); AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]); @@ -3817,7 +3838,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // Special case the error message for the very common case where only // a single subtarget feature is missing (neon, e.g.). std::string Msg = "instruction requires:"; - unsigned Mask = 1; + uint64_t Mask = 1; for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { if (ErrorInfo & Mask) { Msg += " "; @@ -3831,7 +3852,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return showMatchError(IDLoc, MatchResult); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0U) { + if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); @@ -3906,11 +3927,15 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } llvm_unreachable("Implement any new match types added!"); - return true; } /// ParseDirective parses the arm specific directives bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { + const MCObjectFileInfo::Environment Format = + getContext().getObjectFileInfo()->getObjectFileType(); + bool IsMachO = Format == MCObjectFileInfo::IsMachO; + bool IsCOFF = Format == MCObjectFileInfo::IsCOFF; + StringRef IDVal = DirectiveID.getIdentifier(); SMLoc Loc = DirectiveID.getLoc(); if (IDVal == ".hword") @@ -3926,12 +3951,18 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".unreq") return parseDirectiveUnreq(DirectiveID.getLoc()); + if (!IsMachO && !IsCOFF) { + if (IDVal == ".inst") + return parseDirectiveInst(Loc); + } + return parseDirectiveLOH(IDVal, Loc); } /// parseDirectiveWord /// ::= .word [ expression (, expression)* ] bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { + MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; @@ -3954,6 +3985,47 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { return false; } +/// parseDirectiveInst +/// ::= .inst opcode [, ...] 
+bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) { + MCAsmParser &Parser = getParser(); + if (getLexer().is(AsmToken::EndOfStatement)) { + Parser.eatToEndOfStatement(); + Error(Loc, "expected expression following directive"); + return false; + } + + for (;;) { + const MCExpr *Expr; + + if (getParser().parseExpression(Expr)) { + Error(Loc, "expected expression"); + return false; + } + + const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr); + if (!Value) { + Error(Loc, "expected constant expression"); + return false; + } + + getTargetStreamer().emitInst(Value->getValue()); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + if (getLexer().isNot(AsmToken::Comma)) { + Error(Loc, "unexpected token in directive"); + return false; + } + + Parser.Lex(); // Eat comma. + } + + Parser.Lex(); + return false; +} + // parseDirectiveTLSDescCall: // ::= .tlsdesccall symbol bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { @@ -3985,10 +4057,9 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { // We successfully get a numeric value for the identifier. // Check if it is valid. int64_t Id = getParser().getTok().getIntVal(); - Kind = (MCLOHType)Id; - // Check that Id does not overflow MCLOHType. - if (!isValidMCLOHType(Kind) || Id != Kind) + if (Id <= -1U && !isValidMCLOHType(Id)) return TokError("invalid numeric identifier in directive"); + Kind = (MCLOHType)Id; } else { StringRef Name = getTok().getIdentifier(); // We successfully parse an identifier. @@ -4036,6 +4107,7 @@ bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) { /// parseDirectiveReq /// ::= name .req registername bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { + MCAsmParser &Parser = getParser(); Parser.Lex(); // Eat the '.req' token. 
SMLoc SRegLoc = getLoc(); unsigned RegNum = tryParseRegister(); @@ -4067,7 +4139,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { Parser.Lex(); // Consume the EndOfStatement auto pair = std::make_pair(IsVector, RegNum); - if (RegisterReqs.GetOrCreateValue(Name, pair).getValue() != pair) + if (!RegisterReqs.insert(std::make_pair(Name, pair)).second) Warning(L, "ignoring redefinition of register alias '" + Name + "'"); return true; @@ -4076,6 +4148,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { /// parseDirectiveUneq /// ::= .unreq registername bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) { + MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::Identifier)) { Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive."); Parser.eatToEndOfStatement(); @@ -4140,9 +4213,7 @@ AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, extern "C" void LLVMInitializeAArch64AsmParser() { RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget); RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget); - - RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget); - RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget); + RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64Target); } #define GET_REGISTER_MATCHER diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 789d549..f26327f 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -2,7 +2,7 @@ set(LLVM_TARGET_DEFINITIONS AArch64.td) tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) -tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) +tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) @@ -15,6 +15,7 @@ tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) add_public_tablegen_target(AArch64CommonTableGen) add_llvm_target(AArch64CodeGen + AArch64A57FPLoadBalancing.cpp AArch64AddressTypePromotion.cpp AArch64AdvSIMDScalarPass.cpp AArch64AsmPrinter.cpp @@ -25,13 +26,16 @@ add_llvm_target(AArch64CodeGen AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp AArch64FastISel.cpp + AArch64A53Fix835769.cpp AArch64FrameLowering.cpp + AArch64ConditionOptimizer.cpp AArch64ISelDAGToDAG.cpp AArch64ISelLowering.cpp AArch64InstrInfo.cpp AArch64LoadStoreOptimizer.cpp AArch64MCInstLower.cpp AArch64PromoteConstant.cpp + AArch64PBQPRegAlloc.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp AArch64StorePairSuppress.cpp diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 6de27d6..878e29c 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -15,12 +15,11 @@ #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/MC/MCInst.h" #include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/MC/MCInst.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MemoryObject.h" -#include "llvm/Support/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -200,26 +199,24 @@ static MCDisassembler 
*createAArch64Disassembler(const Target &T, } DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, - const MemoryObject &Region, - uint64_t Address, - raw_ostream &os, - raw_ostream &cs) const { - CommentStream = &cs; - - uint8_t bytes[4]; + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &OS, + raw_ostream &CS) const { + CommentStream = &CS; Size = 0; // We want to read exactly 4 bytes of data. - if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1) + if (Bytes.size() < 4) return Fail; Size = 4; // Encoded as a small-endian 32-bit word in the stream. - uint32_t insn = - (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0); + uint32_t Insn = + (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); // Calling the auto-generated decoder function. - return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI); + return decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI); } static MCSymbolizer * @@ -243,13 +240,9 @@ extern "C" void LLVMInitializeAArch64Disassembler() { TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget, createAArch64ExternalSymbolizer); - TargetRegistry::RegisterMCDisassembler(TheARM64leTarget, - createAArch64Disassembler); - TargetRegistry::RegisterMCDisassembler(TheARM64beTarget, + TargetRegistry::RegisterMCDisassembler(TheARM64Target, createAArch64Disassembler); - TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget, - createAArch64ExternalSymbolizer); - TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget, + TargetRegistry::RegisterMCSymbolizer(TheARM64Target, createAArch64ExternalSymbolizer); } @@ -592,7 +585,7 @@ static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder) { // scale{5} is asserted as 1 in tblgen. - Imm |= 0x20; + Imm |= 0x20; Inst.addOperand(MCOperand::CreateImm(64 - Imm)); return Success; } @@ -614,7 +607,7 @@ static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, if (ImmVal & (1 << (19 - 1))) ImmVal |= ~((1LL << 19) - 1); - if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr, + if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal * 4, Addr, Inst.getOpcode() != AArch64::LDRXl, 0, 4)) Inst.addOperand(MCOperand::CreateImm(ImmVal)); return Success; @@ -630,35 +623,19 @@ static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder) { - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); - const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - - Imm |= 0x8000; Inst.addOperand(MCOperand::CreateImm(Imm)); - bool ValidNamed; - (void)AArch64SysReg::MRSMapper(STI.getFeatureBits()) - .toString(Imm, ValidNamed); - - return ValidNamed ? Success : Fail; + // Every system register in the encoding space is valid with the syntax + // S<op0>_<op1>_<Cn>_<Cm>_<op2>, so decoding system registers always succeeds. + return Success; } static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, uint64_t Address, const void *Decoder) { - const AArch64Disassembler *Dis = - static_cast<const AArch64Disassembler *>(Decoder); - const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - - Imm |= 0x8000; Inst.addOperand(MCOperand::CreateImm(Imm)); - bool ValidNamed; - (void)AArch64SysReg::MSRMapper(STI.getFeatureBits()) - .toString(Imm, ValidNamed); - - return ValidNamed ? 
Success : Fail; + return Success; } static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, @@ -1510,7 +1487,7 @@ static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, if (imm & (1 << (26 - 1))) imm |= ~((1LL << 26) - 1); - if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4)) + if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4)) Inst.addOperand(MCOperand::CreateImm(imm)); return Success; @@ -1530,7 +1507,7 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, bool ValidNamed; (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed); - + return ValidNamed ? Success : Fail; } @@ -1552,7 +1529,7 @@ static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, else DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); Inst.addOperand(MCOperand::CreateImm(bit)); - if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4)) + if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4)) Inst.addOperand(MCOperand::CreateImm(dst)); return Success; diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 68d4867..7fb57ad 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64DISASSEMBLER_H -#define AArch64DISASSEMBLER_H +#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H +#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H #include "llvm/MC/MCDisassembler.h" @@ -28,11 +28,10 @@ public: ~AArch64Disassembler() {} - /// getInstruction - See MCDisassembler. 
 MCDisassembler::DecodeStatus
-  getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region,
-                 uint64_t address, raw_ostream &vStream,
-                 raw_ostream &cStream) const override;
+  getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+                 uint64_t Address, raw_ostream &VStream,
+                 raw_ostream &CStream) const override;
 };

 } // namespace llvm
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index 171d31c..12b8450 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef AArch64EXTERNALSYMBOLIZER_H
-#define AArch64EXTERNALSYMBOLIZER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H

 #include "llvm/MC/MCExternalSymbolizer.h"
diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
index a4224f4..62827e8 100644
--- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt
+++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@ type = Library
 name = AArch64Disassembler
 parent = AArch64
-required_libraries = AArch64Info AArch64Utils MC Support
+required_libraries = AArch64Info AArch64Utils MC MCDisassembler Support
 add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 8a21f06..46a1d79 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -16,8 +16,8 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
@@ -1223,7 +1223,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
   // If the label has already been resolved to an immediate offset (say, when
   // we're running the disassembler), just print the immediate.
   if (Op.isImm()) {
-    O << "#" << (Op.getImm() << 2);
+    O << "#" << (Op.getImm() * 4);
     return;
   }

@@ -1247,7 +1247,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
   // If the label has already been resolved to an immediate offset (say, when
   // we're running the disassembler), just print the immediate.
if (Op.isImm()) { - O << "#" << (Op.getImm() << 12); + O << "#" << (Op.getImm() * (1 << 12)); return; } @@ -1276,24 +1276,20 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - bool Valid; auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val, Valid); + std::string Name = Mapper.toString(Val); - if (Valid) - O << StringRef(Name).upper(); + O << StringRef(Name).upper(); } void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - bool Valid; auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val, Valid); + std::string Name = Mapper.toString(Val); - if (Valid) - O << StringRef(Name).upper(); + O << StringRef(Name).upper(); } void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index fe7666e..5f51621 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64INSTPRINTER_H -#define AArch64INSTPRINTER_H +#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H +#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/StringRef.h" @@ -127,8 +127,9 @@ public: void printInstruction(const MCInst *MI, raw_ostream &O) override; bool printAliasInstr(const MCInst *MI, raw_ostream &O) override; - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, + raw_ostream &O) override; StringRef getRegName(unsigned RegNo) const override { return getRegisterName(RegNo); } diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt index 642c183..573fa10 100644 --- a/lib/Target/AArch64/LLVMBuild.txt +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = AArch64CodeGen parent = AArch64 -required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 8b1e44e2..4db9dea 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H -#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -51,7 +51,7 @@ enum ShiftExtendType { /// getShiftName - Get the string encoding for the shift type. 
static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) { switch (ST) { - default: assert(false && "unhandled shift type!"); + default: llvm_unreachable("unhandled shift type!"); case AArch64_AM::LSL: return "lsl"; case AArch64_AM::LSR: return "lsr"; case AArch64_AM::ASR: return "asr"; @@ -210,67 +210,63 @@ static inline uint64_t ror(uint64_t elt, unsigned size) { /// as the immediate operand of a logical instruction for the given register /// size. If so, return true with "encoding" set to the encoded value in /// the form N:immr:imms. -static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize, - uint64_t &encoding) { - if (imm == 0ULL || imm == ~0ULL || - (regSize != 64 && (imm >> regSize != 0 || imm == ~0U))) +static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize, + uint64_t &Encoding) { + if (Imm == 0ULL || Imm == ~0ULL || + (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U))) return false; - unsigned size = 2; - uint64_t eltVal = imm; - // First, determine the element size. - while (size < regSize) { - unsigned numElts = regSize / size; - unsigned mask = (1ULL << size) - 1; - uint64_t lowestEltVal = imm & mask; - - bool allMatched = true; - for (unsigned i = 1; i < numElts; ++i) { - uint64_t currEltVal = (imm >> (i*size)) & mask; - if (currEltVal != lowestEltVal) { - allMatched = false; - break; - } - } + unsigned Size = RegSize; + + do { + Size /= 2; + uint64_t Mask = (1ULL << Size) - 1; - if (allMatched) { - eltVal = lowestEltVal; + if ((Imm & Mask) != ((Imm >> Size) & Mask)) { + Size *= 2; break; } - - size *= 2; - } + } while (Size > 2); // Second, determine the rotation to make the element be: 0^m 1^n. - for (unsigned i = 0; i < size; ++i) { - eltVal = ror(eltVal, size); - uint32_t clz = countLeadingZeros(eltVal) - (64 - size); - uint32_t cto = CountTrailingOnes_64(eltVal); - - if (clz + cto == size) { - // Encode in immr the number of RORs it would take to get *from* this - // element value to our target value, where i+1 is the number of RORs - // to go the opposite direction. - unsigned immr = size - (i + 1); - - // If size has a 1 in the n'th bit, create a value that has zeroes in - // bits [0, n] and ones above that. - uint64_t nimms = ~(size-1) << 1; - - // Or the CTO value into the low bits, which must be below the Nth bit - // bit mentioned above. - nimms |= (cto-1); - - // Extract the seventh bit and toggle it to create the N field. - unsigned N = ((nimms >> 6) & 1) ^ 1; - - encoding = (N << 12) | (immr << 6) | (nimms & 0x3f); - return true; - } + uint32_t CTO, I; + uint64_t Mask = ((uint64_t)-1LL) >> (64 - Size); + Imm &= Mask; + + if (isShiftedMask_64(Imm)) { + I = countTrailingZeros(Imm); + assert(I < 64 && "undefined behavior"); + CTO = CountTrailingOnes_64(Imm >> I); + } else { + Imm |= ~Mask; + if (!isShiftedMask_64(~Imm)) + return false; + + unsigned CLO = CountLeadingOnes_64(Imm); + I = 64 - CLO; + CTO = CLO + CountTrailingOnes_64(Imm) - (64 - Size); } - return false; + // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n + // to our target value, where I is the number of RORs to go the opposite + // direction. + assert(Size > I && "I should be smaller than element size"); + unsigned Immr = (Size - I) & (Size - 1); + + // If size has a 1 in the n'th bit, create a value that has zeroes in + // bits [0, n] and ones above that. + uint64_t NImms = ~(Size-1) << 1; + + // Or the CTO value into the low bits, which must be below the Nth bit + // bit mentioned above. 
+ NImms |= (CTO-1); + + // Extract the seventh bit and toggle it to create the N field. + unsigned N = ((NImms >> 6) & 1) ^ 1; + + Encoding = (N << 12) | (Immr << 6) | (NImms & 0x3f); + return true; } /// isLogicalImmediate - Return true if the immediate is valid for a logical diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a917616..423da65 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -13,10 +13,11 @@ #include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachO.h" using namespace llvm; @@ -131,7 +132,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { int64_t SignedValue = static_cast<int64_t>(Value); switch (Kind) { default: - assert(false && "Unknown fixup kind!"); + llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: if (SignedValue > 2097151 || SignedValue < -2097152) report_fatal_error("fixup value out of range"); @@ -238,7 +239,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { - assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented"); + llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented"); } bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { @@ -316,42 +317,6 @@ public: MachO::CPU_SUBTYPE_ARM64_ALL); } - bool doesSectionRequireSymbols(const MCSection &Section) const override { - // Any section for which the linker breaks things into atoms needs to - // preserve symbols, including assembler local symbols, to identify - // those atoms. These sections are: - // Sections of type: - // - // S_CSTRING_LITERALS (e.g. __cstring) - // S_LITERAL_POINTERS (e.g. objc selector pointers) - // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS - // - // Sections named: - // - // __TEXT,__eh_frame - // __TEXT,__ustring - // __DATA,__cfstring - // __DATA,__objc_classrefs - // __DATA,__objc_catlist - // - // FIXME: It would be better if the compiler used actual linker local - // symbols for each of these sections rather than preserving what - // are ostensibly assembler local symbols. - const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section); - return (SMO.getType() == MachO::S_CSTRING_LITERALS || - SMO.getType() == MachO::S_4BYTE_LITERALS || - SMO.getType() == MachO::S_8BYTE_LITERALS || - SMO.getType() == MachO::S_16BYTE_LITERALS || - SMO.getType() == MachO::S_LITERAL_POINTERS || - (SMO.getSegmentName() == "__TEXT" && - (SMO.getSectionName() == "__eh_frame" || - SMO.getSectionName() == "__ustring")) || - (SMO.getSegmentName() == "__DATA" && - (SMO.getSectionName() == "__cfstring" || - SMO.getSectionName() == "__objc_classrefs" || - SMO.getSectionName() == "__objc_catlist"))); - } - /// \brief Generate the compact unwind encoding from the CFI directives. 
uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { @@ -534,8 +499,8 @@ void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, // store fixups in .eh_frame section in big endian order if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { const MCSection *Sec = Fixup.getValue()->FindAssociatedSection(); - const MCSectionELF *SecELF = static_cast<const MCSectionELF *>(Sec); - if (SecELF->getSectionName() == ".eh_frame") + const MCSectionELF *SecELF = dyn_cast_or_null<const MCSectionELF>(Sec); + if (SecELF && SecELF->getSectionName() == ".eh_frame") Value = ByteSwap_32(unsigned(Value)); } AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); @@ -551,7 +516,8 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, return new DarwinAArch64AsmBackend(T, MRI); assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); - return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true); + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true); } MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, @@ -561,6 +527,7 @@ MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, assert(TheTriple.isOSBinFormatELF() && "Big endian is only supported for ELF targets!"); - return new ELFAArch64AsmBackend(T, TheTriple.getOS(), + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/false); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index e05191e..5ea49c3 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -78,7 +78,7 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) - return ELF::R_AARCH64_TLSDESC_ADR_PAGE; + return ELF::R_AARCH64_TLSDESC_ADR_PAGE21; llvm_unreachable("invalid symbol kind for ADRP relocation"); case AArch64::fixup_aarch64_pcrel_branch26: return ELF::R_AARCH64_JUMP26; diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index a79406d..8dc6c30 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -15,8 +15,10 @@ #include "llvm/MC/MCELFStreamer.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" @@ -34,12 +36,42 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; namespace { +class AArch64ELFStreamer; + +class AArch64TargetAsmStreamer : public AArch64TargetStreamer { + formatted_raw_ostream &OS; + + void emitInst(uint32_t Inst) override; + +public: + AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); +}; + +AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S, + formatted_raw_ostream &OS) + : AArch64TargetStreamer(S), 
OS(OS) {} + +void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) { + OS << "\t.inst\t0x" << utohexstr(Inst) << "\n"; +} + +class AArch64TargetELFStreamer : public AArch64TargetStreamer { +private: + AArch64ELFStreamer &getStreamer(); + + void emitInst(uint32_t Inst) override; + +public: + AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {} +}; + /// Extend the generic ELFStreamer class so that it can emit mapping symbols at /// the appropriate points in the object files. These symbols are defined in the /// AArch64 ELF ABI: @@ -55,6 +87,8 @@ namespace { /// by MachO. Beware! class AArch64ELFStreamer : public MCELFStreamer { public: + friend class AArch64TargetELFStreamer; + AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter) : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), @@ -82,6 +116,18 @@ public: MCELFStreamer::EmitInstruction(Inst, STI); } + void emitInst(uint32_t Inst) { + char Buffer[4]; + const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian(); + + EmitA64MappingSymbol(); + for (unsigned II = 0; II != 4; ++II) { + const unsigned I = LittleEndian ? (4 - II - 1) : II; + Buffer[4 - II - 1] = uint8_t(Inst >> I * CHAR_BIT); + } + MCELFStreamer::EmitBytes(StringRef(Buffer, 4)); + } + /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. @@ -131,7 +177,9 @@ private: MCELF::SetType(SD, ELF::STT_NOTYPE); MCELF::SetBinding(SD, ELF::STB_LOCAL); SD.setExternal(false); - Symbol->setSection(*getCurrentSection().first); + auto Sec = getCurrentSection().first; + assert(Sec && "need a section"); + Symbol->setSection(*Sec); const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); Symbol->setVariableValue(Value); @@ -144,17 +192,35 @@ private: /// @} }; +} // end anonymous namespace + +AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() { + return static_cast<AArch64ELFStreamer &>(Streamer); +} + +void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { + getStreamer().emitInst(Inst); } namespace llvm { +MCStreamer * +createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, + bool isVerboseAsm, bool useDwarfDirectory, + MCInstPrinter *InstPrint, MCCodeEmitter *CE, + MCAsmBackend *TAB, bool ShowInst) { + MCStreamer *S = llvm::createAsmStreamer( + Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); + new AArch64TargetAsmStreamer(*S, OS); + return S; +} + MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack) { + bool RelaxAll) { AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); + new AArch64TargetELFStreamer(*S); if (RelaxAll) S->getAssembler().setRelaxAll(true); - if (NoExecStack) - S->getAssembler().setNoExecStack(true); return S; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h index bc6973b..71b05cc 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AARCH64_ELF_STREAMER_H -#define LLVM_AARCH64_ELF_STREAMER_H +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H 
#include "llvm/MC/MCELFStreamer.h" @@ -20,7 +20,7 @@ namespace llvm { MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack); + bool RelaxAll); } -#endif // AArch64_ELF_STREAMER_H +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index bf405fb..0f5b765 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AArch64FIXUPKINDS_H -#define LLVM_AArch64FIXUPKINDS_H +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H #include "llvm/MC/MCFixup.h" diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 1763b40..f048474 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -13,8 +13,8 @@ #include "AArch64MCAsmInfo.h" #include "llvm/ADT/Triple.h" -#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" using namespace llvm; @@ -37,6 +37,7 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant; PrivateGlobalPrefix = "L"; + PrivateLabelPrefix = "L"; SeparatorString = "%%"; CommentString = ";"; PointerSize = CalleeSaveStackSlotSize = 8; @@ -66,7 +67,7 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { Triple T(TT); - if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be) + if (T.getArch() == Triple::aarch64_be) IsLittleEndian = false; // We prefer NEON instructions to be printed in the short form. 
@@ -79,6 +80,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { CommentString = "//"; PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; Code32Directive = ".code\t32"; Data16bitsDirective = "\t.hword\t"; @@ -89,7 +91,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { WeakRefDirective = "\t.weak\t"; - HasLEB128 = true; SupportsDebugInformation = true; // Exceptions handling diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h index 42a031d..5d03c21 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64TARGETASMINFO_H -#define AArch64TARGETASMINFO_H +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H #include "llvm/MC/MCAsmInfoDarwin.h" diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index f051357..4756a192 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -15,13 +15,13 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -437,8 +437,7 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, return 3; } - assert(false && "Invalid value for vector shift amount!"); - return 0; + llvm_unreachable("Invalid value for vector shift amount!"); } uint32_t diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 42a6787..e396df8 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -90,8 +90,9 @@ const MCSection *AArch64MCExpr::FindAssociatedSection() const { } bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { - if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { + if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup)) return false; Res = diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 5422f9d..db48ac9 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_AArch64MCEXPR_H -#define LLVM_AArch64MCEXPR_H +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H #include "llvm/MC/MCExpr.h" #include "llvm/Support/ErrorHandling.h" @@ -152,7 +152,8 @@ public: const MCSection *FindAssociatedSection() const override; bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const override; + const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; 
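Several hunks above replace a left shift by 2 with a multiplication by 4 when scaling decoded branch offsets (DecodePCRelLabel19, DecodeUnconditionalBranch, DecodeTestAndBranch, printAlignedLabel). The immediate has already been sign-extended at that point, and multiplying a negative value by 4 is well defined in C++ whereas shifting it left is not. A minimal standalone sketch of the 19-bit case follows; decodePCRelOffset19 is an illustrative name, not an LLVM function.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative only: mirror the arithmetic in DecodePCRelLabel19.  A 19-bit
// field is sign-extended to 64 bits and then scaled by the 4-byte instruction
// size to give a byte offset from the instruction's address.
static int64_t decodePCRelOffset19(uint32_t Imm) {
  assert((Imm >> 19) == 0 && "only 19 encoded bits");
  int64_t ImmVal = Imm;
  // Sign-extend: if bit 18 is set, fill bits 19..63 with ones.
  if (ImmVal & (1 << (19 - 1)))
    ImmVal |= ~((1LL << 19) - 1);
  // Scale to bytes; '* 4' (unlike '<< 2') is well defined for negative values.
  return ImmVal * 4;
}

int main() {
  std::printf("%lld\n", (long long)decodePCRelOffset19(0x7FFFF)); // -4
  std::printf("%lld\n", (long long)decodePCRelOffset19(0x00001)); //  4
}

An all-ones 19-bit field therefore decodes to a byte offset of -4, one instruction back.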
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index ae698c5..0f7a6b8 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -126,15 +126,14 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, static MCStreamer *createMCStreamer(const Target &T, StringRef TT, MCContext &Ctx, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll, - bool NoExecStack) { + const MCSubtargetInfo &STI, bool RelaxAll) { Triple TheTriple(TT); if (TheTriple.isOSDarwin()) return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, /*LabelSections*/ true); - return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); + return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll); } // Force static initialization. @@ -142,17 +141,14 @@ extern "C" void LLVMInitializeAArch64TargetMC() { // Register the MC asm info. RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo); RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo); - RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo); - RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Z(TheARM64Target, createAArch64MCAsmInfo); // Register the MC codegen info. TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget, createAArch64MCCodeGenInfo); TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget, createAArch64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget, - createAArch64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget, + TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target, createAArch64MCCodeGenInfo); // Register the MC instruction info. @@ -160,9 +156,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { createAArch64MCInstrInfo); TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget, createAArch64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget, - createAArch64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget, + TargetRegistry::RegisterMCInstrInfo(TheARM64Target, createAArch64MCInstrInfo); // Register the MC register info. @@ -170,9 +164,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { createAArch64MCRegisterInfo); TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget, createAArch64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheARM64leTarget, - createAArch64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheARM64beTarget, + TargetRegistry::RegisterMCRegInfo(TheARM64Target, createAArch64MCRegisterInfo); // Register the MC subtarget info. @@ -180,9 +172,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { createAArch64MCSubtargetInfo); TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget, createAArch64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget, - createAArch64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget, + TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target, createAArch64MCSubtargetInfo); // Register the asm backend. 
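The emitInst hook added to AArch64ELFStreamer earlier in this diff serialises a 32-bit instruction word into four bytes in the target byte order before handing them to EmitBytes. Roughly the same loop in isolation, as a sketch: serializeInstWord is an illustrative name and takes a plain bool rather than querying MCAsmInfo.

#include <climits>
#include <cstdint>
#include <cstdio>

// Illustrative only: write a 32-bit instruction word into Buffer[4] in the
// requested byte order, mirroring the loop in AArch64ELFStreamer::emitInst.
static void serializeInstWord(uint32_t Inst, bool LittleEndian,
                              char Buffer[4]) {
  for (unsigned II = 0; II != 4; ++II) {
    // Pick which byte of Inst lands in this position of the output stream.
    const unsigned I = LittleEndian ? (4 - II - 1) : II;
    Buffer[4 - II - 1] = (char)(uint8_t)(Inst >> (I * CHAR_BIT));
  }
}

int main() {
  char LE[4], BE[4];
  serializeInstWord(0xD503201F /* AArch64 nop */, /*LittleEndian=*/true, LE);
  serializeInstWord(0xD503201F, /*LittleEndian=*/false, BE);
  std::printf("LE: %02x %02x %02x %02x\n", (unsigned)(uint8_t)LE[0],
              (unsigned)(uint8_t)LE[1], (unsigned)(uint8_t)LE[2],
              (unsigned)(uint8_t)LE[3]);
  std::printf("BE: %02x %02x %02x %02x\n", (unsigned)(uint8_t)BE[0],
              (unsigned)(uint8_t)BE[1], (unsigned)(uint8_t)BE[2],
              (unsigned)(uint8_t)BE[3]);
}

For the nop encoding 0xd503201f this yields 1f 20 03 d5 little-endian and d5 03 20 1f big-endian.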
@@ -190,19 +180,15 @@ extern "C" void LLVMInitializeAArch64TargetMC() { createAArch64leAsmBackend); TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, createAArch64beAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget, + TargetRegistry::RegisterMCAsmBackend(TheARM64Target, createAArch64leAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget, - createAArch64beAsmBackend); // Register the MC Code Emitter TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget, createAArch64MCCodeEmitter); TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget, createAArch64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget, - createAArch64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget, + TargetRegistry::RegisterMCCodeEmitter(TheARM64Target, createAArch64MCCodeEmitter); // Register the object streamer. @@ -210,16 +196,21 @@ extern "C" void LLVMInitializeAArch64TargetMC() { createMCStreamer); TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer); + + // Register the asm streamer. + TargetRegistry::RegisterAsmStreamer(TheAArch64leTarget, + createAArch64MCAsmStreamer); + TargetRegistry::RegisterAsmStreamer(TheAArch64beTarget, + createAArch64MCAsmStreamer); + TargetRegistry::RegisterAsmStreamer(TheARM64Target, + createAArch64MCAsmStreamer); // Register the MCInstPrinter. TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget, createAArch64MCInstPrinter); TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget, createAArch64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget, - createAArch64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget, + TargetRegistry::RegisterMCInstPrinter(TheARM64Target, createAArch64MCInstPrinter); } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index d886ea2..1553115 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -11,19 +11,22 @@ // //===----------------------------------------------------------------------===// -#ifndef AArch64MCTARGETDESC_H -#define AArch64MCTARGETDESC_H +#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H +#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H #include "llvm/Support/DataTypes.h" #include <string> namespace llvm { +class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; +class MCInstPrinter; class MCRegisterInfo; class MCObjectWriter; +class MCStreamer; class MCSubtargetInfo; class StringRef; class Target; @@ -31,8 +34,7 @@ class raw_ostream; extern Target TheAArch64leTarget; extern Target TheAArch64beTarget; -extern Target TheARM64leTarget; -extern Target TheARM64beTarget; +extern Target TheARM64Target; MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, @@ -51,6 +53,11 @@ MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, uint32_t CPUSubtype); +MCStreamer * +createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, + bool isVerboseAsm, bool useDwarfDirectory, + 
MCInstPrinter *InstPrint, MCCodeEmitter *CE, + MCAsmBackend *TAB, bool ShowInst); } // End llvm namespace // Defines symbolic names for AArch64 registers. This defines a mapping from diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index ba95366..f6fab5d 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -9,15 +9,16 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/MC/MCAssembler.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCValue.h" -#include "llvm/ADT/Twine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachO.h" using namespace llvm; @@ -33,7 +34,7 @@ public: : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, /*UseAggressiveSymbolFolding=*/true) {} - void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, + void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) override; @@ -112,8 +113,25 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( } } +static bool canUseLocalRelocation(const MCSectionMachO &Section, + const MCSymbol &Symbol, unsigned Log2Size) { + // Debug info sections can use local relocations. + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + return true; + + // Otherwise, only pointer sized relocations are supported. + if (Log2Size != 3) + return false; + + // But only if they don't point to a cstring. + if (!Symbol.isInSection()) + return true; + const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection()); + return RefSec.getType() != MachO::S_CSTRING_LITERALS; +} + void AArch64MachObjectWriter::RecordRelocation( - MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); @@ -123,9 +141,9 @@ void AArch64MachObjectWriter::RecordRelocation( unsigned Log2Size = 0; int64_t Value = 0; unsigned Index = 0; - unsigned IsExtern = 0; unsigned Type = 0; unsigned Kind = Fixup.getKind(); + const MCSymbolData *RelSymbol = nullptr; FixupOffset += Fixup.getOffset(); @@ -171,10 +189,8 @@ void AArch64MachObjectWriter::RecordRelocation( // FIXME: Should this always be extern? // SymbolNum of 0 indicates the absolute section. Type = MachO::ARM64_RELOC_UNSIGNED; - Index = 0; if (IsPCRel) { - IsExtern = 1; Asm.getContext().FatalError(Fixup.getLoc(), "PC relative absolute relocation!"); @@ -198,15 +214,12 @@ void AArch64MachObjectWriter::RecordRelocation( Layout.getSymbolOffset(&B_SD) == Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. 
- Index = A_Base->getIndex(); - IsExtern = 1; Type = MachO::ARM64_RELOC_POINTER_TO_GOT; IsPCRel = 1; MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) @@ -252,26 +265,31 @@ void AArch64MachObjectWriter::RecordRelocation( ? 0 : Writer->getSymbolAddress(B_Base, Layout)); - Index = A_Base->getIndex(); - IsExtern = 1; Type = MachO::ARM64_RELOC_UNSIGNED; MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); - Index = B_Base->getIndex(); - IsExtern = 1; + RelSymbol = B_Base; Type = MachO::ARM64_RELOC_SUBTRACTOR; } else { // A + constant const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); - const MCSymbolData &SD = Asm.getSymbolData(*Symbol); - const MCSymbolData *Base = Asm.getAtom(&SD); const MCSectionMachO &Section = static_cast<const MCSectionMachO &>( Fragment->getParent()->getSection()); + bool CanUseLocalRelocation = + canUseLocalRelocation(Section, *Symbol, Log2Size); + if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) { + const MCSection &Sec = Symbol->getSection(); + if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) + Asm.addLocalUsedInReloc(*Symbol); + } + + const MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData *Base = Asm.getAtom(&SD); + // If the symbol is a variable and we weren't able to get a Base for it // (i.e., it's not in the symbol table associated with a section) resolve // the relocation based its expansion instead. @@ -288,7 +306,8 @@ void AArch64MachObjectWriter::RecordRelocation( // FIXME: Will the Target we already have ever have any data in it // we need to preserve and merge with the new Target? How about // the FixedValue? - if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout)) + if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout, + &Fixup)) Asm.getContext().FatalError(Fixup.getLoc(), "unable to resolve variable '" + Symbol->getName() + "'"); @@ -309,16 +328,13 @@ void AArch64MachObjectWriter::RecordRelocation( // sections, and for pointer-sized relocations (.quad), we allow section // relocations. It's code sections that run into trouble. if (Base) { - Index = Base->getIndex(); - IsExtern = 1; + RelSymbol = Base; // Add the local offset, if needed. if (Base != &SD) Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); } else if (Symbol->isInSection()) { - // Pointer-sized relocations can use a local relocation. Otherwise, - // we have to be in a debug info section. 
- if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3) + if (!CanUseLocalRelocation) Asm.getContext().FatalError( Fixup.getLoc(), "unsupported relocation of local symbol '" + Symbol->getName() + @@ -328,7 +344,6 @@ void AArch64MachObjectWriter::RecordRelocation( const MCSectionData &SymSD = Asm.getSectionData(SD.getSymbol().getSection()); Index = SymSD.getOrdinal() + 1; - IsExtern = 0; Value += Writer->getSymbolAddress(&SD, Layout); if (IsPCRel) @@ -361,16 +376,16 @@ void AArch64MachObjectWriter::RecordRelocation( MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); // Now set up the Addend relocation. Type = MachO::ARM64_RELOC_ADDEND; Index = Value; + RelSymbol = nullptr; IsPCRel = 0; Log2Size = 2; - IsExtern = 0; // Put zero into the instruction itself. The addend is in the relocation. Value = 0; @@ -382,9 +397,9 @@ void AArch64MachObjectWriter::RecordRelocation( // struct relocation_info (8 bytes) MachO::any_relocation_info MRE; MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index dcc1a3c..e3112fa 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -39,3 +39,5 @@ void AArch64TargetStreamer::emitCurrentConstantPool() { // finish() - write out any non-empty assembler constant pools. void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); } + +void AArch64TargetStreamer::emitInst(uint32_t Inst) {} diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp index 3a382c1..f42ecb1 100644 --- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp +++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -14,18 +14,19 @@ using namespace llvm; namespace llvm { Target TheAArch64leTarget; Target TheAArch64beTarget; -Target TheARM64leTarget; -Target TheARM64beTarget; +Target TheARM64Target; } // end namespace llvm extern "C" void LLVMInitializeAArch64TargetInfo() { - RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64leTarget, "arm64", - "AArch64 (little endian)"); - RegisterTarget<Triple::arm64_be, /*HasJIT=*/true> Y(TheARM64beTarget, "arm64_be", - "AArch64 (big endian)"); + // Now register the "arm64" name for use with "-march". We don't want it to + // take possession of the Triple::aarch64 tag though. 
+ TargetRegistry::RegisterTarget(TheARM64Target, "arm64", + "ARM64 (little endian)", + [](Triple::ArchType) { return false; }, true); RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z( TheAArch64leTarget, "aarch64", "AArch64 (little endian)"); RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W( TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)"); + } diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index 3c24bb3..bc6c7a9 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -791,22 +791,22 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { } } - // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name, where the bits - // are: 11 xxx 1x11 xxxx xxx - Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$"); + // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name + Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$"); - SmallVector<StringRef, 4> Ops; + SmallVector<StringRef, 5> Ops; if (!GenericRegPattern.match(NameLower, &Ops)) { Valid = false; return -1; } - uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; + uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; uint32_t Bits; - Ops[1].getAsInteger(10, Op1); - Ops[2].getAsInteger(10, CRn); - Ops[3].getAsInteger(10, CRm); - Ops[4].getAsInteger(10, Op2); + Ops[1].getAsInteger(10, Op0); + Ops[2].getAsInteger(10, Op1); + Ops[3].getAsInteger(10, CRn); + Ops[4].getAsInteger(10, CRm); + Ops[5].getAsInteger(10, Op2); Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; Valid = true; @@ -814,11 +814,10 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { } std::string -AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { +AArch64SysReg::SysRegMapper::toString(uint32_t Bits) const { // First search the registers shared by all for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { if (SysRegPairs[i].Value == Bits) { - Valid = true; return SysRegPairs[i].Name; } } @@ -827,7 +826,6 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { if (FeatureBits & AArch64::ProcCyclone) { for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { if (CycloneSysRegPairs[i].Value == Bits) { - Valid = true; return CycloneSysRegPairs[i].Name; } } @@ -837,28 +835,18 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { // write-only). for (unsigned i = 0; i < NumInstPairs; ++i) { if (InstPairs[i].Value == Bits) { - Valid = true; return InstPairs[i].Name; } } + assert(Bits < 0x10000); uint32_t Op0 = (Bits >> 14) & 0x3; uint32_t Op1 = (Bits >> 11) & 0x7; uint32_t CRn = (Bits >> 7) & 0xf; uint32_t CRm = (Bits >> 3) & 0xf; uint32_t Op2 = Bits & 0x7; - // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic - // name. 
-  if (Op0 != 3 || (CRn != 11 && CRn != 15)) {
-    Valid = false;
-    return "";
-  }
-
-  assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg");
-
-  Valid = true;
-  return "s3_" + utostr(Op1) + "_c" + utostr(CRn)
+  return "s" + utostr(Op0)+ "_" + utostr(Op1) + "_c" + utostr(CRn)
     + "_c" + utostr(CRm) + "_" + utostr(Op2);
 }
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 9d2ce21..c60b09a 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef AArch64BASEINFO_H
-#define AArch64BASEINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
+#define LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H

 // FIXME: Is it easiest to fix this layering violation by moving the .inc
 // #includes from AArch64MCTargetDesc.h to here?
@@ -1143,7 +1143,7 @@ namespace AArch64SysReg {
     SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { }

     uint32_t fromString(StringRef Name, bool &Valid) const;
-    std::string toString(uint32_t Bits, bool &Valid) const;
+    std::string toString(uint32_t Bits) const;
   };

   struct MSRMapper : SysRegMapper {
@@ -1271,7 +1271,12 @@ namespace AArch64II {
     /// thread-local symbol. On Darwin, only one type of thread-local access
     /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
     /// referee will affect interpretation.
-    MO_TLS = 0x20
+    MO_TLS = 0x20,
+
+    /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
+    /// the address of a constant pool entry for the symbol, rather than the
+    /// address of the symbol itself.
+    MO_CONSTPOOL = 0x40
   };
 } // end namespace AArch64II
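SysRegMapper::fromString and toString above now accept and produce the generic S<op0>_<op1>_<Cn>_<Cm>_<op2> spelling for any system register, packing the five fields as (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2. A small self-contained sketch of that round trip, using hypothetical helpers and std::to_string rather than the LLVM utilities:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <string>

// Illustrative only: pack the op0/op1/CRn/CRm/op2 fields the same way as
// SysRegMapper and print the generic "s<op0>_<op1>_c<CRn>_c<CRm>_<op2>" name.
static uint32_t encodeSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                             uint32_t CRm, uint32_t Op2) {
  assert(Op0 < 4 && Op1 < 8 && CRn < 16 && CRm < 16 && Op2 < 8);
  return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
}

static std::string genericSysRegName(uint32_t Bits) {
  assert(Bits < 0x10000);
  uint32_t Op0 = (Bits >> 14) & 0x3;
  uint32_t Op1 = (Bits >> 11) & 0x7;
  uint32_t CRn = (Bits >> 7) & 0xf;
  uint32_t CRm = (Bits >> 3) & 0xf;
  uint32_t Op2 = Bits & 0x7;
  return "s" + std::to_string(Op0) + "_" + std::to_string(Op1) + "_c" +
         std::to_string(CRn) + "_c" + std::to_string(CRm) + "_" +
         std::to_string(Op2);
}

int main() {
  // MIDR_EL1 is op0=3, op1=0, CRn=0, CRm=0, op2=0.
  uint32_t Bits = encodeSysReg(3, 0, 0, 0, 0);
  std::printf("0x%04x -> %s\n", Bits, genericSysRegName(Bits).c_str());
}

MIDR_EL1, for example, packs to 0xc000 and formats as s3_0_c0_c0_0, which matches the spelling the assembler and disassembler now agree on.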